1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x()
9
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
32; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:  .LBB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz .LBB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, 0
61; GFX8-NEXT:    v_mov_b32_e32 v2, s2
62; GFX8-NEXT:    s_mov_b32 m0, -1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:  .LBB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
71; GFX8-NEXT:    s_mov_b32 s3, 0xf000
72; GFX8-NEXT:    s_mov_b32 s2, -1
73; GFX8-NEXT:    s_nop 1
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz .LBB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, 0
91; GFX9-NEXT:    v_mov_b32_e32 v2, s2
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:  .LBB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_nop 1
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
119; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
120; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
121; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
122; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
124; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX1064-NEXT:    buffer_gl0_inv
126; GFX1064-NEXT:  .LBB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
135; GFX1064-NEXT:    s_endpgm
136;
137; GFX1032-LABEL: add_i32_constant:
138; GFX1032:       ; %bb.0: ; %entry
139; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
140; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
141; GFX1032-NEXT:    ; implicit-def: $vgpr1
142; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
143; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
144; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
145; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
146; GFX1032-NEXT:  ; %bb.1:
147; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
148; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
149; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
150; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
151; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
153; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
154; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX1032-NEXT:    buffer_gl0_inv
156; GFX1032-NEXT:  .LBB0_2:
157; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
158; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
159; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
160; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
161; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
162; GFX1032-NEXT:    s_mov_b32 s2, -1
163; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
165; GFX1032-NEXT:    s_endpgm
166entry:
167  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
168  store i32 %old, i32 addrspace(1)* %out
169  ret void
170}
171
172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
173;
174;
175; GFX7LESS-LABEL: add_i32_uniform:
176; GFX7LESS:       ; %bb.0: ; %entry
177; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
178; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
179; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
180; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
181; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
182; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
183; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
184; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
185; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
186; GFX7LESS-NEXT:  ; %bb.1:
187; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
188; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
190; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
191; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
192; GFX7LESS-NEXT:    s_mov_b32 m0, -1
193; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:  .LBB1_2:
197; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
198; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
200; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
201; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
202; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
203; GFX7LESS-NEXT:    s_mov_b32 s6, -1
204; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
205; GFX7LESS-NEXT:    s_endpgm
206;
207; GFX8-LABEL: add_i32_uniform:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
210; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
211; GFX8-NEXT:    s_mov_b64 s[2:3], exec
212; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
213; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
214; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
215; GFX8-NEXT:    ; implicit-def: $vgpr1
216; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
217; GFX8-NEXT:    s_cbranch_execz .LBB1_2
218; GFX8-NEXT:  ; %bb.1:
219; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX8-NEXT:    s_mul_i32 s2, s6, s2
222; GFX8-NEXT:    v_mov_b32_e32 v1, 0
223; GFX8-NEXT:    v_mov_b32_e32 v2, s2
224; GFX8-NEXT:    s_mov_b32 m0, -1
225; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:  .LBB1_2:
229; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
232; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
233; GFX8-NEXT:    s_mov_b32 s7, 0xf000
234; GFX8-NEXT:    s_mov_b32 s6, -1
235; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
236; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
237; GFX8-NEXT:    s_endpgm
238;
239; GFX9-LABEL: add_i32_uniform:
240; GFX9:       ; %bb.0: ; %entry
241; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
242; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
243; GFX9-NEXT:    s_mov_b64 s[2:3], exec
244; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
245; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
246; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX9-NEXT:    ; implicit-def: $vgpr1
248; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
249; GFX9-NEXT:    s_cbranch_execz .LBB1_2
250; GFX9-NEXT:  ; %bb.1:
251; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    s_mul_i32 s2, s6, s2
254; GFX9-NEXT:    v_mov_b32_e32 v1, 0
255; GFX9-NEXT:    v_mov_b32_e32 v2, s2
256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:  .LBB1_2:
260; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
263; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
264; GFX9-NEXT:    s_mov_b32 s7, 0xf000
265; GFX9-NEXT:    s_mov_b32 s6, -1
266; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
267; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
268; GFX9-NEXT:    s_endpgm
269;
270; GFX1064-LABEL: add_i32_uniform:
271; GFX1064:       ; %bb.0: ; %entry
272; GFX1064-NEXT:    s_clause 0x1
273; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
274; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
275; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
276; GFX1064-NEXT:    ; implicit-def: $vgpr1
277; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
278; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
279; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
280; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
281; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
282; GFX1064-NEXT:  ; %bb.1:
283; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
284; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
285; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
287; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
288; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
289; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    buffer_gl0_inv
293; GFX1064-NEXT:  .LBB1_2:
294; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
295; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
296; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
297; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
298; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
300; GFX1064-NEXT:    s_mov_b32 s6, -1
301; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
302; GFX1064-NEXT:    s_endpgm
303;
304; GFX1032-LABEL: add_i32_uniform:
305; GFX1032:       ; %bb.0: ; %entry
306; GFX1032-NEXT:    s_clause 0x1
307; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
308; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
309; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
310; GFX1032-NEXT:    ; implicit-def: $vgpr1
311; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
312; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
313; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
314; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
315; GFX1032-NEXT:  ; %bb.1:
316; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
317; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
318; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
320; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
321; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
322; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
323; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
324; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX1032-NEXT:    buffer_gl0_inv
326; GFX1032-NEXT:  .LBB1_2:
327; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
328; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
329; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
330; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
331; GFX1032-NEXT:    s_mov_b32 s6, -1
332; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
334; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
335; GFX1032-NEXT:    s_endpgm
336entry:
337  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
338  store i32 %old, i32 addrspace(1)* %out
339  ret void
340}
341
342define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
343;
344;
345; GFX7LESS-LABEL: add_i32_varying:
346; GFX7LESS:       ; %bb.0: ; %entry
347; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
348; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
349; GFX7LESS-NEXT:    s_mov_b32 m0, -1
350; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
352; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
354; GFX7LESS-NEXT:    s_mov_b32 s2, -1
355; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
356; GFX7LESS-NEXT:    s_endpgm
357;
358; GFX8-LABEL: add_i32_varying:
359; GFX8:       ; %bb.0: ; %entry
360; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
361; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
362; GFX8-NEXT:    v_mov_b32_e32 v1, 0
363; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
364; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
365; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
366; GFX8-NEXT:    v_mov_b32_e32 v2, v0
367; GFX8-NEXT:    s_not_b64 exec, exec
368; GFX8-NEXT:    v_mov_b32_e32 v2, 0
369; GFX8-NEXT:    s_not_b64 exec, exec
370; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
371; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
372; GFX8-NEXT:    s_nop 1
373; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
374; GFX8-NEXT:    s_nop 1
375; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
376; GFX8-NEXT:    s_nop 1
377; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
378; GFX8-NEXT:    s_nop 1
379; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
380; GFX8-NEXT:    s_nop 1
381; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
382; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
383; GFX8-NEXT:    s_nop 0
384; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
385; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
386; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
387; GFX8-NEXT:    ; implicit-def: $vgpr0
388; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
389; GFX8-NEXT:    s_cbranch_execz .LBB2_2
390; GFX8-NEXT:  ; %bb.1:
391; GFX8-NEXT:    v_mov_b32_e32 v0, 0
392; GFX8-NEXT:    v_mov_b32_e32 v3, s4
393; GFX8-NEXT:    s_mov_b32 m0, -1
394; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX8-NEXT:  .LBB2_2:
398; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
399; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
401; GFX8-NEXT:    v_mov_b32_e32 v0, v1
402; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
403; GFX8-NEXT:    s_mov_b32 s3, 0xf000
404; GFX8-NEXT:    s_mov_b32 s2, -1
405; GFX8-NEXT:    s_nop 0
406; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
407; GFX8-NEXT:    s_endpgm
408;
409; GFX9-LABEL: add_i32_varying:
410; GFX9:       ; %bb.0: ; %entry
411; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
412; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
413; GFX9-NEXT:    v_mov_b32_e32 v1, 0
414; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
415; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
416; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
417; GFX9-NEXT:    v_mov_b32_e32 v2, v0
418; GFX9-NEXT:    s_not_b64 exec, exec
419; GFX9-NEXT:    v_mov_b32_e32 v2, 0
420; GFX9-NEXT:    s_not_b64 exec, exec
421; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
422; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
423; GFX9-NEXT:    s_nop 1
424; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
425; GFX9-NEXT:    s_nop 1
426; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
427; GFX9-NEXT:    s_nop 1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
433; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
434; GFX9-NEXT:    s_nop 0
435; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
436; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
437; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
438; GFX9-NEXT:    ; implicit-def: $vgpr0
439; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
440; GFX9-NEXT:    s_cbranch_execz .LBB2_2
441; GFX9-NEXT:  ; %bb.1:
442; GFX9-NEXT:    v_mov_b32_e32 v0, 0
443; GFX9-NEXT:    v_mov_b32_e32 v3, s4
444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-NEXT:  .LBB2_2:
448; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
449; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
451; GFX9-NEXT:    v_mov_b32_e32 v0, v1
452; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
453; GFX9-NEXT:    s_mov_b32 s3, 0xf000
454; GFX9-NEXT:    s_mov_b32 s2, -1
455; GFX9-NEXT:    s_nop 0
456; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
457; GFX9-NEXT:    s_endpgm
458;
459; GFX1064-LABEL: add_i32_varying:
460; GFX1064:       ; %bb.0: ; %entry
461; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
462; GFX1064-NEXT:    s_not_b64 exec, exec
463; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
464; GFX1064-NEXT:    s_not_b64 exec, exec
465; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
466; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
467; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
468; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
469; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
470; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
472; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
473; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
474; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
475; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
476; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
477; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
478; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
479; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
480; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
481; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
482; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
483; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
484; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
485; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
486; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
487; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
488; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
489; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
490; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
491; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
492; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
493; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
494; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
495; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
496; GFX1064-NEXT:    s_mov_b32 s2, -1
497; GFX1064-NEXT:    ; implicit-def: $vgpr0
498; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
499; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
500; GFX1064-NEXT:  ; %bb.1:
501; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
502; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
503; GFX1064-NEXT:    s_mov_b32 s3, s7
504; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
505; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
506; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
507; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX1064-NEXT:    buffer_gl0_inv
509; GFX1064-NEXT:  .LBB2_2:
510; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
511; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
512; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
513; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
514; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
515; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
516; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
518; GFX1064-NEXT:    s_endpgm
519;
520; GFX1032-LABEL: add_i32_varying:
521; GFX1032:       ; %bb.0: ; %entry
522; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
523; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
524; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
525; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
526; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
527; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
528; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
529; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
530; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
531; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
532; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
533; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
534; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
535; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
536; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
537; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
538; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
539; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
540; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
541; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
542; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
543; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
544; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
545; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
546; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
547; GFX1032-NEXT:    s_mov_b32 s2, -1
548; GFX1032-NEXT:    ; implicit-def: $vgpr0
549; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
550; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
551; GFX1032-NEXT:  ; %bb.1:
552; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
553; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
554; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
555; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
556; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
557; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX1032-NEXT:    buffer_gl0_inv
559; GFX1032-NEXT:  .LBB2_2:
560; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
561; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
562; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
563; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
564; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
565; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
566; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
568; GFX1032-NEXT:    s_endpgm
569entry:
570  %lane = call i32 @llvm.amdgcn.workitem.id.x()
571  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
572  store i32 %old, i32 addrspace(1)* %out
573  ret void
574}
575
576define amdgpu_kernel void @add_i32_varying_nouse() {
577; GFX7LESS-LABEL: add_i32_varying_nouse:
578; GFX7LESS:       ; %bb.0: ; %entry
579; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
580; GFX7LESS-NEXT:    s_mov_b32 m0, -1
581; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX7LESS-NEXT:    ds_add_u32 v1, v0
583; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX7LESS-NEXT:    s_endpgm
585;
586; GFX8-LABEL: add_i32_varying_nouse:
587; GFX8:       ; %bb.0: ; %entry
588; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
589; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
590; GFX8-NEXT:    v_mov_b32_e32 v1, v0
591; GFX8-NEXT:    s_not_b64 exec, exec
592; GFX8-NEXT:    v_mov_b32_e32 v1, 0
593; GFX8-NEXT:    s_not_b64 exec, exec
594; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
595; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
596; GFX8-NEXT:    s_nop 1
597; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
598; GFX8-NEXT:    s_nop 1
599; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
600; GFX8-NEXT:    s_nop 1
601; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
602; GFX8-NEXT:    s_nop 1
603; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
604; GFX8-NEXT:    s_nop 1
605; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
606; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
607; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
608; GFX8-NEXT:    s_mov_b32 s0, s2
609; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
610; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
611; GFX8-NEXT:    s_cbranch_execz .LBB3_2
612; GFX8-NEXT:  ; %bb.1:
613; GFX8-NEXT:    v_mov_b32_e32 v0, 0
614; GFX8-NEXT:    v_mov_b32_e32 v2, s0
615; GFX8-NEXT:    s_mov_b32 m0, -1
616; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX8-NEXT:    ds_add_u32 v0, v2
618; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX8-NEXT:  .LBB3_2:
620; GFX8-NEXT:    s_endpgm
621;
622; GFX9-LABEL: add_i32_varying_nouse:
623; GFX9:       ; %bb.0: ; %entry
624; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
625; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
626; GFX9-NEXT:    v_mov_b32_e32 v1, v0
627; GFX9-NEXT:    s_not_b64 exec, exec
628; GFX9-NEXT:    v_mov_b32_e32 v1, 0
629; GFX9-NEXT:    s_not_b64 exec, exec
630; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
631; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
632; GFX9-NEXT:    s_nop 1
633; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
634; GFX9-NEXT:    s_nop 1
635; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX9-NEXT:    s_nop 1
637; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
638; GFX9-NEXT:    s_nop 1
639; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
640; GFX9-NEXT:    s_nop 1
641; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
642; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
643; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
644; GFX9-NEXT:    s_mov_b32 s0, s2
645; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
646; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
647; GFX9-NEXT:    s_cbranch_execz .LBB3_2
648; GFX9-NEXT:  ; %bb.1:
649; GFX9-NEXT:    v_mov_b32_e32 v0, 0
650; GFX9-NEXT:    v_mov_b32_e32 v2, s0
651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX9-NEXT:    ds_add_u32 v0, v2
653; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX9-NEXT:  .LBB3_2:
655; GFX9-NEXT:    s_endpgm
656;
657; GFX1064-LABEL: add_i32_varying_nouse:
658; GFX1064:       ; %bb.0: ; %entry
659; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
660; GFX1064-NEXT:    s_not_b64 exec, exec
661; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
662; GFX1064-NEXT:    s_not_b64 exec, exec
663; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
664; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
665; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
666; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
669; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
670; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
671; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
672; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
673; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
674; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
675; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
676; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
677; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
678; GFX1064-NEXT:    s_add_i32 s0, s2, s3
679; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
680; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
681; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
682; GFX1064-NEXT:  ; %bb.1:
683; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
684; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
685; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
686; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
687; GFX1064-NEXT:    ds_add_u32 v0, v3
688; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX1064-NEXT:    buffer_gl0_inv
690; GFX1064-NEXT:  .LBB3_2:
691; GFX1064-NEXT:    s_endpgm
692;
693; GFX1032-LABEL: add_i32_varying_nouse:
694; GFX1032:       ; %bb.0: ; %entry
695; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
696; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
697; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
698; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
699; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
700; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
701; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
702; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
703; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
705; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
706; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
707; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
708; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
709; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
710; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
711; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
712; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
713; GFX1032-NEXT:  ; %bb.1:
714; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
715; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
716; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
717; GFX1032-NEXT:    ds_add_u32 v3, v0
718; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
719; GFX1032-NEXT:    buffer_gl0_inv
720; GFX1032-NEXT:  .LBB3_2:
721; GFX1032-NEXT:    s_endpgm
722entry:
723  %lane = call i32 @llvm.amdgcn.workitem.id.x()
724  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
725  ret void
726}
727
728define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
729;
730;
731; GFX7LESS-LABEL: add_i64_constant:
732; GFX7LESS:       ; %bb.0: ; %entry
733; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
734; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
735; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
736; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
737; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
738; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
739; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
740; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
741; GFX7LESS-NEXT:  ; %bb.1:
742; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
743; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
744; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
745; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
746; GFX7LESS-NEXT:    s_mov_b32 m0, -1
747; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
749; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX7LESS-NEXT:  .LBB4_2:
751; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
752; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
754; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
755; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
756; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
757; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
758; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
759; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
760; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
761; GFX7LESS-NEXT:    s_mov_b32 s2, -1
762; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
763; GFX7LESS-NEXT:    s_endpgm
764;
765; GFX8-LABEL: add_i64_constant:
766; GFX8:       ; %bb.0: ; %entry
767; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
768; GFX8-NEXT:    s_mov_b64 s[4:5], exec
769; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
770; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
771; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
772; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
773; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
774; GFX8-NEXT:    s_cbranch_execz .LBB4_2
775; GFX8-NEXT:  ; %bb.1:
776; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
777; GFX8-NEXT:    s_mul_i32 s4, s4, 5
778; GFX8-NEXT:    v_mov_b32_e32 v0, s4
779; GFX8-NEXT:    v_mov_b32_e32 v1, 0
780; GFX8-NEXT:    s_mov_b32 m0, -1
781; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
783; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX8-NEXT:  .LBB4_2:
785; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
788; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
789; GFX8-NEXT:    v_mov_b32_e32 v0, s2
790; GFX8-NEXT:    v_mov_b32_e32 v1, s3
791; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
792; GFX8-NEXT:    s_mov_b32 s3, 0xf000
793; GFX8-NEXT:    s_mov_b32 s2, -1
794; GFX8-NEXT:    s_nop 2
795; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
796; GFX8-NEXT:    s_endpgm
797;
798; GFX9-LABEL: add_i64_constant:
799; GFX9:       ; %bb.0: ; %entry
800; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
801; GFX9-NEXT:    s_mov_b64 s[4:5], exec
802; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
803; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
804; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
805; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
806; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
807; GFX9-NEXT:    s_cbranch_execz .LBB4_2
808; GFX9-NEXT:  ; %bb.1:
809; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
810; GFX9-NEXT:    s_mul_i32 s4, s4, 5
811; GFX9-NEXT:    v_mov_b32_e32 v0, s4
812; GFX9-NEXT:    v_mov_b32_e32 v1, 0
813; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
815; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX9-NEXT:  .LBB4_2:
817; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
818; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
820; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
821; GFX9-NEXT:    v_mov_b32_e32 v0, s2
822; GFX9-NEXT:    v_mov_b32_e32 v1, s3
823; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
824; GFX9-NEXT:    s_mov_b32 s3, 0xf000
825; GFX9-NEXT:    s_mov_b32 s2, -1
826; GFX9-NEXT:    s_nop 2
827; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
828; GFX9-NEXT:    s_endpgm
829;
830; GFX1064-LABEL: add_i64_constant:
831; GFX1064:       ; %bb.0: ; %entry
832; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
833; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
834; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
835; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
836; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
837; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
838; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
839; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
840; GFX1064-NEXT:  ; %bb.1:
841; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
842; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
843; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
844; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
845; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
846; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
847; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
848; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX1064-NEXT:    buffer_gl0_inv
850; GFX1064-NEXT:  .LBB4_2:
851; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
852; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
853; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
854; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
855; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
856; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
857; GFX1064-NEXT:    s_mov_b32 s2, -1
858; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
860; GFX1064-NEXT:    s_endpgm
861;
862; GFX1032-LABEL: add_i64_constant:
863; GFX1032:       ; %bb.0: ; %entry
864; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
865; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
866; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
867; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
868; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
869; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
870; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
871; GFX1032-NEXT:  ; %bb.1:
872; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
873; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
874; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
875; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
876; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
877; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
878; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
879; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX1032-NEXT:    buffer_gl0_inv
881; GFX1032-NEXT:  .LBB4_2:
882; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
883; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
884; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
885; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
886; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
887; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
888; GFX1032-NEXT:    s_mov_b32 s2, -1
889; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
891; GFX1032-NEXT:    s_endpgm
892entry:
893  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
894  store i64 %old, i64 addrspace(1)* %out
895  ret void
896}
897
898define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
899;
900;
901; GFX7LESS-LABEL: add_i64_uniform:
902; GFX7LESS:       ; %bb.0: ; %entry
903; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
904; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
905; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
906; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
907; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
908; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
909; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
910; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
911; GFX7LESS-NEXT:  ; %bb.1:
912; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
913; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
914; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
916; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
917; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
918; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
919; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
920; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
921; GFX7LESS-NEXT:    s_mov_b32 m0, -1
922; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
924; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX7LESS-NEXT:  .LBB5_2:
926; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
927; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
928; GFX7LESS-NEXT:    s_mov_b32 s6, -1
929; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX7LESS-NEXT:    s_mov_b32 s4, s0
931; GFX7LESS-NEXT:    s_mov_b32 s5, s1
932; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
933; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
934; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
935; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
936; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
937; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
938; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
939; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
940; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
941; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
942; GFX7LESS-NEXT:    s_endpgm
943;
944; GFX8-LABEL: add_i64_uniform:
945; GFX8:       ; %bb.0: ; %entry
946; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
947; GFX8-NEXT:    s_mov_b64 s[6:7], exec
948; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
949; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
950; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
951; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
952; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
953; GFX8-NEXT:    s_cbranch_execz .LBB5_2
954; GFX8-NEXT:  ; %bb.1:
955; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
956; GFX8-NEXT:    v_mov_b32_e32 v0, s8
957; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
959; GFX8-NEXT:    s_mul_i32 s6, s3, s8
960; GFX8-NEXT:    v_mov_b32_e32 v3, 0
961; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
962; GFX8-NEXT:    s_mov_b32 m0, -1
963; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
965; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
966; GFX8-NEXT:  .LBB5_2:
967; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
968; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
969; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
970; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
971; GFX8-NEXT:    v_mov_b32_e32 v0, s4
972; GFX8-NEXT:    v_mov_b32_e32 v1, s5
973; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
974; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
975; GFX8-NEXT:    s_mov_b32 s7, 0xf000
976; GFX8-NEXT:    s_mov_b32 s6, -1
977; GFX8-NEXT:    s_mov_b32 s4, s0
978; GFX8-NEXT:    s_mov_b32 s5, s1
979; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
980; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
981; GFX8-NEXT:    s_endpgm
982;
983; GFX9-LABEL: add_i64_uniform:
984; GFX9:       ; %bb.0: ; %entry
985; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
986; GFX9-NEXT:    s_mov_b64 s[6:7], exec
987; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
988; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
989; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
990; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
991; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
992; GFX9-NEXT:    s_cbranch_execz .LBB5_2
993; GFX9-NEXT:  ; %bb.1:
994; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
995; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
996; GFX9-NEXT:    s_mul_i32 s7, s3, s6
997; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
998; GFX9-NEXT:    s_add_i32 s8, s8, s7
999; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1000; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1001; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1002; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1003; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1005; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX9-NEXT:  .LBB5_2:
1007; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1010; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1011; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1012; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1013; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1014; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1015; GFX9-NEXT:    s_mov_b32 s6, -1
1016; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1017; GFX9-NEXT:    s_mov_b32 s4, s0
1018; GFX9-NEXT:    s_mov_b32 s5, s1
1019; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1020; GFX9-NEXT:    s_endpgm
1021;
1022; GFX1064-LABEL: add_i64_uniform:
1023; GFX1064:       ; %bb.0: ; %entry
1024; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1025; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1026; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1027; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1028; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1029; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1030; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1031; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1032; GFX1064-NEXT:  ; %bb.1:
1033; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1034; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1035; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1037; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1038; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1039; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1040; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1041; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1042; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1043; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1044; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1045; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1046; GFX1064-NEXT:    buffer_gl0_inv
1047; GFX1064-NEXT:  .LBB5_2:
1048; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1049; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1050; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1051; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1052; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1054; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1055; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1056; GFX1064-NEXT:    s_mov_b32 s2, -1
1057; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1058; GFX1064-NEXT:    s_endpgm
1059;
1060; GFX1032-LABEL: add_i64_uniform:
1061; GFX1032:       ; %bb.0: ; %entry
1062; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1063; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1064; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1065; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1066; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1067; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1068; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1069; GFX1032-NEXT:  ; %bb.1:
1070; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1071; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1072; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1073; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1074; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1075; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1076; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1077; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1078; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1079; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1080; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1081; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1082; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX1032-NEXT:    buffer_gl0_inv
1084; GFX1032-NEXT:  .LBB5_2:
1085; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1086; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1087; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1088; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1089; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1091; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1092; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1093; GFX1032-NEXT:    s_mov_b32 s2, -1
1094; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1095; GFX1032-NEXT:    s_endpgm
1096entry:
1097  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1098  store i64 %old, i64 addrspace(1)* %out
1099  ret void
1100}
1101
1102define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1103;
1104;
1105; GFX7LESS-LABEL: add_i64_varying:
1106; GFX7LESS:       ; %bb.0: ; %entry
1107; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1108; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1109; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1110; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1112; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1114; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1115; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1116; GFX7LESS-NEXT:    s_endpgm
1117;
1118; GFX8-LABEL: add_i64_varying:
1119; GFX8:       ; %bb.0: ; %entry
1120; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1121; GFX8-NEXT:    s_mov_b32 m0, -1
1122; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1123; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1125; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1126; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1127; GFX8-NEXT:    s_mov_b32 s2, -1
1128; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1129; GFX8-NEXT:    s_endpgm
1130;
1131; GFX9-LABEL: add_i64_varying:
1132; GFX9:       ; %bb.0: ; %entry
1133; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1134; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1135; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1137; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1139; GFX9-NEXT:    s_mov_b32 s2, -1
1140; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1141; GFX9-NEXT:    s_endpgm
1142;
1143; GFX10-LABEL: add_i64_varying:
1144; GFX10:       ; %bb.0: ; %entry
1145; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1146; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1147; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1148; GFX10-NEXT:    s_mov_b32 s2, -1
1149; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1150; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1151; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1152; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX10-NEXT:    buffer_gl0_inv
1154; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1155; GFX10-NEXT:    s_endpgm
1156entry:
1157  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1158  %zext = zext i32 %lane to i64
1159  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1160  store i64 %old, i64 addrspace(1)* %out
1161  ret void
1162}
1163
1164define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1165;
1166;
1167; GFX7LESS-LABEL: sub_i32_constant:
1168; GFX7LESS:       ; %bb.0: ; %entry
1169; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1170; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1171; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1172; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1173; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1174; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1175; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1176; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1177; GFX7LESS-NEXT:  ; %bb.1:
1178; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1179; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1180; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1181; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1182; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1183; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1184; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1185; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1186; GFX7LESS-NEXT:  .LBB7_2:
1187; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1188; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1190; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1191; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1192; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1193; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1194; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1195; GFX7LESS-NEXT:    s_endpgm
1196;
1197; GFX8-LABEL: sub_i32_constant:
1198; GFX8:       ; %bb.0: ; %entry
1199; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1200; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1201; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1202; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1203; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1204; GFX8-NEXT:    ; implicit-def: $vgpr1
1205; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1206; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1207; GFX8-NEXT:  ; %bb.1:
1208; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1209; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1210; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1211; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1212; GFX8-NEXT:    s_mov_b32 m0, -1
1213; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1215; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX8-NEXT:  .LBB7_2:
1217; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1218; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1219; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1220; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1221; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1222; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1223; GFX8-NEXT:    s_mov_b32 s2, -1
1224; GFX8-NEXT:    s_nop 0
1225; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1226; GFX8-NEXT:    s_endpgm
1227;
1228; GFX9-LABEL: sub_i32_constant:
1229; GFX9:       ; %bb.0: ; %entry
1230; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1231; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1232; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1233; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1234; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1235; GFX9-NEXT:    ; implicit-def: $vgpr1
1236; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1237; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1238; GFX9-NEXT:  ; %bb.1:
1239; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1240; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1241; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1242; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1243; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX9-NEXT:  .LBB7_2:
1247; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1250; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1251; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1252; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1253; GFX9-NEXT:    s_mov_b32 s2, -1
1254; GFX9-NEXT:    s_nop 0
1255; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1256; GFX9-NEXT:    s_endpgm
1257;
1258; GFX1064-LABEL: sub_i32_constant:
1259; GFX1064:       ; %bb.0: ; %entry
1260; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1261; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1262; GFX1064-NEXT:    ; implicit-def: $vgpr1
1263; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1264; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1265; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1266; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1267; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1268; GFX1064-NEXT:  ; %bb.1:
1269; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1270; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1271; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1272; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1273; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1274; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1275; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1276; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX1064-NEXT:    buffer_gl0_inv
1278; GFX1064-NEXT:  .LBB7_2:
1279; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1280; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1281; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1282; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1283; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1284; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1285; GFX1064-NEXT:    s_mov_b32 s2, -1
1286; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1288; GFX1064-NEXT:    s_endpgm
1289;
1290; GFX1032-LABEL: sub_i32_constant:
1291; GFX1032:       ; %bb.0: ; %entry
1292; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1293; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1294; GFX1032-NEXT:    ; implicit-def: $vgpr1
1295; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1296; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1297; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1298; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1299; GFX1032-NEXT:  ; %bb.1:
1300; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1301; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1302; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1303; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1304; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1305; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1306; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1307; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1308; GFX1032-NEXT:    buffer_gl0_inv
1309; GFX1032-NEXT:  .LBB7_2:
1310; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1311; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1312; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1313; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1314; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1315; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1316; GFX1032-NEXT:    s_mov_b32 s2, -1
1317; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1318; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1319; GFX1032-NEXT:    s_endpgm
1320entry:
1321  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1322  store i32 %old, i32 addrspace(1)* %out
1323  ret void
1324}
1325
1326define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1327;
1328;
1329; GFX7LESS-LABEL: sub_i32_uniform:
1330; GFX7LESS:       ; %bb.0: ; %entry
1331; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1332; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1333; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1334; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1335; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1336; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1337; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1338; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1339; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1340; GFX7LESS-NEXT:  ; %bb.1:
1341; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1342; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1344; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1345; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1346; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1347; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1348; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1349; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1350; GFX7LESS-NEXT:  .LBB8_2:
1351; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1352; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1354; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1355; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1356; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1357; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1358; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1359; GFX7LESS-NEXT:    s_endpgm
1360;
1361; GFX8-LABEL: sub_i32_uniform:
1362; GFX8:       ; %bb.0: ; %entry
1363; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1364; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1365; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1366; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1367; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1368; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1369; GFX8-NEXT:    ; implicit-def: $vgpr1
1370; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1371; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1372; GFX8-NEXT:  ; %bb.1:
1373; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1374; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1376; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1377; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1378; GFX8-NEXT:    s_mov_b32 m0, -1
1379; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX8-NEXT:  .LBB8_2:
1383; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1384; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1386; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1387; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1388; GFX8-NEXT:    s_mov_b32 s6, -1
1389; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1390; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1391; GFX8-NEXT:    s_endpgm
1392;
1393; GFX9-LABEL: sub_i32_uniform:
1394; GFX9:       ; %bb.0: ; %entry
1395; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1396; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1397; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1398; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1399; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1400; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1401; GFX9-NEXT:    ; implicit-def: $vgpr1
1402; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1403; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1404; GFX9-NEXT:  ; %bb.1:
1405; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1406; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1408; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1409; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1412; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1413; GFX9-NEXT:  .LBB8_2:
1414; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1417; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1418; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1419; GFX9-NEXT:    s_mov_b32 s6, -1
1420; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1421; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1422; GFX9-NEXT:    s_endpgm
1423;
1424; GFX1064-LABEL: sub_i32_uniform:
1425; GFX1064:       ; %bb.0: ; %entry
1426; GFX1064-NEXT:    s_clause 0x1
1427; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1428; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
1429; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1430; GFX1064-NEXT:    ; implicit-def: $vgpr1
1431; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1432; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1433; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1434; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1435; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
1436; GFX1064-NEXT:  ; %bb.1:
1437; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1438; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1439; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1440; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
1441; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1442; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1443; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1444; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1445; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1446; GFX1064-NEXT:    buffer_gl0_inv
1447; GFX1064-NEXT:  .LBB8_2:
1448; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1449; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1450; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
1452; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1453; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1454; GFX1064-NEXT:    s_mov_b32 s6, -1
1455; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1456; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1457; GFX1064-NEXT:    s_endpgm
1458;
1459; GFX1032-LABEL: sub_i32_uniform:
1460; GFX1032:       ; %bb.0: ; %entry
1461; GFX1032-NEXT:    s_clause 0x1
1462; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1463; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1464; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1465; GFX1032-NEXT:    ; implicit-def: $vgpr1
1466; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1467; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1468; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1469; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
1470; GFX1032-NEXT:  ; %bb.1:
1471; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1472; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1473; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1474; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1475; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1476; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1477; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1478; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1479; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX1032-NEXT:    buffer_gl0_inv
1481; GFX1032-NEXT:  .LBB8_2:
1482; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1483; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1484; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1485; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1486; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1487; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1488; GFX1032-NEXT:    s_mov_b32 s6, -1
1489; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1490; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1491; GFX1032-NEXT:    s_endpgm
1492entry:
1493  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1494  store i32 %old, i32 addrspace(1)* %out
1495  ret void
1496}
1497
1498define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1499;
1500;
1501; GFX7LESS-LABEL: sub_i32_varying:
1502; GFX7LESS:       ; %bb.0: ; %entry
1503; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1504; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1505; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1506; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1508; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1510; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1511; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1512; GFX7LESS-NEXT:    s_endpgm
1513;
1514; GFX8-LABEL: sub_i32_varying:
1515; GFX8:       ; %bb.0: ; %entry
1516; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1517; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1518; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1519; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1520; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1521; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1522; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1523; GFX8-NEXT:    s_not_b64 exec, exec
1524; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1525; GFX8-NEXT:    s_not_b64 exec, exec
1526; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1527; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1528; GFX8-NEXT:    s_nop 1
1529; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1530; GFX8-NEXT:    s_nop 1
1531; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1532; GFX8-NEXT:    s_nop 1
1533; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1534; GFX8-NEXT:    s_nop 1
1535; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1536; GFX8-NEXT:    s_nop 1
1537; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1538; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1539; GFX8-NEXT:    s_nop 0
1540; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1541; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1542; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1543; GFX8-NEXT:    ; implicit-def: $vgpr0
1544; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1545; GFX8-NEXT:    s_cbranch_execz .LBB9_2
1546; GFX8-NEXT:  ; %bb.1:
1547; GFX8-NEXT:    v_mov_b32_e32 v0, 0
1548; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1549; GFX8-NEXT:    s_mov_b32 m0, -1
1550; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1551; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1552; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX8-NEXT:  .LBB9_2:
1554; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1555; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1556; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1557; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1558; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1559; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1560; GFX8-NEXT:    s_mov_b32 s2, -1
1561; GFX8-NEXT:    s_nop 0
1562; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1563; GFX8-NEXT:    s_endpgm
1564;
1565; GFX9-LABEL: sub_i32_varying:
1566; GFX9:       ; %bb.0: ; %entry
1567; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1568; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1569; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1570; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1571; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1572; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1573; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1574; GFX9-NEXT:    s_not_b64 exec, exec
1575; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1576; GFX9-NEXT:    s_not_b64 exec, exec
1577; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1578; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1579; GFX9-NEXT:    s_nop 1
1580; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1581; GFX9-NEXT:    s_nop 1
1582; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1583; GFX9-NEXT:    s_nop 1
1584; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1585; GFX9-NEXT:    s_nop 1
1586; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1587; GFX9-NEXT:    s_nop 1
1588; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1589; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1590; GFX9-NEXT:    s_nop 0
1591; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1592; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1593; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1594; GFX9-NEXT:    ; implicit-def: $vgpr0
1595; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1596; GFX9-NEXT:    s_cbranch_execz .LBB9_2
1597; GFX9-NEXT:  ; %bb.1:
1598; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1599; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1600; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1602; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX9-NEXT:  .LBB9_2:
1604; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1605; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1606; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1607; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1608; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1609; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1610; GFX9-NEXT:    s_mov_b32 s2, -1
1611; GFX9-NEXT:    s_nop 0
1612; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1613; GFX9-NEXT:    s_endpgm
1614;
1615; GFX1064-LABEL: sub_i32_varying:
1616; GFX1064:       ; %bb.0: ; %entry
1617; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1618; GFX1064-NEXT:    s_not_b64 exec, exec
1619; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1620; GFX1064-NEXT:    s_not_b64 exec, exec
1621; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1622; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1623; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1624; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1625; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1626; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1627; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1628; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1629; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1630; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1631; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1632; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1633; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
1634; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1635; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1636; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1637; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1638; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
1639; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
1640; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1641; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1642; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1643; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
1644; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
1645; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
1646; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1647; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1648; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1649; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
1650; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1651; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1652; GFX1064-NEXT:    s_mov_b32 s2, -1
1653; GFX1064-NEXT:    ; implicit-def: $vgpr0
1654; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1655; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
1656; GFX1064-NEXT:  ; %bb.1:
1657; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1658; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
1659; GFX1064-NEXT:    s_mov_b32 s3, s7
1660; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1661; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1662; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
1663; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1664; GFX1064-NEXT:    buffer_gl0_inv
1665; GFX1064-NEXT:  .LBB9_2:
1666; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1667; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1668; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1669; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1670; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1671; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1672; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1673; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1674; GFX1064-NEXT:    s_endpgm
1675;
1676; GFX1032-LABEL: sub_i32_varying:
1677; GFX1032:       ; %bb.0: ; %entry
1678; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1679; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1680; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1681; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1682; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1683; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1684; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1685; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1686; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1687; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1688; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1689; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1690; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1691; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1692; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1693; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1694; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1695; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1696; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1697; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1698; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1699; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1700; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1701; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1702; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1703; GFX1032-NEXT:    s_mov_b32 s2, -1
1704; GFX1032-NEXT:    ; implicit-def: $vgpr0
1705; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1706; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
1707; GFX1032-NEXT:  ; %bb.1:
1708; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1709; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1710; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1711; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1712; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
1713; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1714; GFX1032-NEXT:    buffer_gl0_inv
1715; GFX1032-NEXT:  .LBB9_2:
1716; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1717; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1718; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1719; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1720; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1721; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1722; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1723; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1724; GFX1032-NEXT:    s_endpgm
1725entry:
1726  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1727  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1728  store i32 %old, i32 addrspace(1)* %out
1729  ret void
1730}
1731
1732define amdgpu_kernel void @sub_i32_varying_nouse() {
1733; GFX7LESS-LABEL: sub_i32_varying_nouse:
1734; GFX7LESS:       ; %bb.0: ; %entry
1735; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1736; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1737; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
1739; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX7LESS-NEXT:    s_endpgm
1741;
1742; GFX8-LABEL: sub_i32_varying_nouse:
1743; GFX8:       ; %bb.0: ; %entry
1744; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1745; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
1746; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1747; GFX8-NEXT:    s_not_b64 exec, exec
1748; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1749; GFX8-NEXT:    s_not_b64 exec, exec
1750; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
1751; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1752; GFX8-NEXT:    s_nop 1
1753; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1754; GFX8-NEXT:    s_nop 1
1755; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1756; GFX8-NEXT:    s_nop 1
1757; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1758; GFX8-NEXT:    s_nop 1
1759; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1760; GFX8-NEXT:    s_nop 1
1761; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1762; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
1763; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
1764; GFX8-NEXT:    s_mov_b32 s0, s2
1765; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1766; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1767; GFX8-NEXT:    s_cbranch_execz .LBB10_2
1768; GFX8-NEXT:  ; %bb.1:
1769; GFX8-NEXT:    v_mov_b32_e32 v0, 0
1770; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1771; GFX8-NEXT:    s_mov_b32 m0, -1
1772; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX8-NEXT:    ds_sub_u32 v0, v2
1774; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1775; GFX8-NEXT:  .LBB10_2:
1776; GFX8-NEXT:    s_endpgm
1777;
1778; GFX9-LABEL: sub_i32_varying_nouse:
1779; GFX9:       ; %bb.0: ; %entry
1780; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1781; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
1782; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1783; GFX9-NEXT:    s_not_b64 exec, exec
1784; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1785; GFX9-NEXT:    s_not_b64 exec, exec
1786; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
1787; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1788; GFX9-NEXT:    s_nop 1
1789; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1790; GFX9-NEXT:    s_nop 1
1791; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1792; GFX9-NEXT:    s_nop 1
1793; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1794; GFX9-NEXT:    s_nop 1
1795; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1796; GFX9-NEXT:    s_nop 1
1797; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1798; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
1799; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
1800; GFX9-NEXT:    s_mov_b32 s0, s2
1801; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1802; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1803; GFX9-NEXT:    s_cbranch_execz .LBB10_2
1804; GFX9-NEXT:  ; %bb.1:
1805; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1806; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1807; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1808; GFX9-NEXT:    ds_sub_u32 v0, v2
1809; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX9-NEXT:  .LBB10_2:
1811; GFX9-NEXT:    s_endpgm
1812;
1813; GFX1064-LABEL: sub_i32_varying_nouse:
1814; GFX1064:       ; %bb.0: ; %entry
1815; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1816; GFX1064-NEXT:    s_not_b64 exec, exec
1817; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1818; GFX1064-NEXT:    s_not_b64 exec, exec
1819; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1820; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1821; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1822; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1823; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1824; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1825; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1826; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1827; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1828; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1829; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1830; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
1831; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
1832; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1833; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1834; GFX1064-NEXT:    s_add_i32 s0, s2, s3
1835; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1836; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1837; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
1838; GFX1064-NEXT:  ; %bb.1:
1839; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1840; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
1841; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1842; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1843; GFX1064-NEXT:    ds_sub_u32 v0, v3
1844; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1845; GFX1064-NEXT:    buffer_gl0_inv
1846; GFX1064-NEXT:  .LBB10_2:
1847; GFX1064-NEXT:    s_endpgm
1848;
1849; GFX1032-LABEL: sub_i32_varying_nouse:
1850; GFX1032:       ; %bb.0: ; %entry
1851; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1852; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1853; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1854; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1855; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1856; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1857; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1858; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1859; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1860; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1861; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1862; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1863; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1864; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1865; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1866; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1867; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1868; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
1869; GFX1032-NEXT:  ; %bb.1:
1870; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1871; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1872; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1873; GFX1032-NEXT:    ds_sub_u32 v3, v0
1874; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1875; GFX1032-NEXT:    buffer_gl0_inv
1876; GFX1032-NEXT:  .LBB10_2:
1877; GFX1032-NEXT:    s_endpgm
1878entry:
1879  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1880  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1881  ret void
1882}
1883
1884define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
1885;
1886;
1887; GFX7LESS-LABEL: sub_i64_constant:
1888; GFX7LESS:       ; %bb.0: ; %entry
1889; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1890; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1891; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1892; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1893; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1894; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1895; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1896; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
1897; GFX7LESS-NEXT:  ; %bb.1:
1898; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1899; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1900; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1901; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1902; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1903; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1904; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
1905; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX7LESS-NEXT:  .LBB11_2:
1907; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1908; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1909; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1910; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1911; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1912; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1913; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1914; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1915; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1916; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1917; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1918; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1919; GFX7LESS-NEXT:    s_endpgm
1920;
1921; GFX8-LABEL: sub_i64_constant:
1922; GFX8:       ; %bb.0: ; %entry
1923; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1924; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1925; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1926; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1927; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1928; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1929; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1930; GFX8-NEXT:    s_cbranch_execz .LBB11_2
1931; GFX8-NEXT:  ; %bb.1:
1932; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1933; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1934; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1935; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1936; GFX8-NEXT:    s_mov_b32 m0, -1
1937; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
1939; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1940; GFX8-NEXT:  .LBB11_2:
1941; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1942; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1943; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1944; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1945; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1946; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1947; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1948; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1949; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1950; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1951; GFX8-NEXT:    s_mov_b32 s2, -1
1952; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1953; GFX8-NEXT:    s_endpgm
1954;
1955; GFX9-LABEL: sub_i64_constant:
1956; GFX9:       ; %bb.0: ; %entry
1957; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1958; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1959; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1960; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1961; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1962; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1963; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1964; GFX9-NEXT:    s_cbranch_execz .LBB11_2
1965; GFX9-NEXT:  ; %bb.1:
1966; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1967; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1968; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1969; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1970; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
1972; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX9-NEXT:  .LBB11_2:
1974; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1976; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1977; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1978; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1979; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1980; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1981; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
1982; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
1983; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1984; GFX9-NEXT:    s_mov_b32 s2, -1
1985; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1986; GFX9-NEXT:    s_endpgm
1987;
1988; GFX1064-LABEL: sub_i64_constant:
1989; GFX1064:       ; %bb.0: ; %entry
1990; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1991; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1992; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1993; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1994; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1995; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1996; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1997; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
1998; GFX1064-NEXT:  ; %bb.1:
1999; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2000; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2001; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2002; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2003; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2004; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2005; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2006; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX1064-NEXT:    buffer_gl0_inv
2008; GFX1064-NEXT:  .LBB11_2:
2009; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2010; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2011; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2012; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2013; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2014; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2015; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2016; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2017; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2018; GFX1064-NEXT:    s_mov_b32 s2, -1
2019; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2021; GFX1064-NEXT:    s_endpgm
2022;
2023; GFX1032-LABEL: sub_i64_constant:
2024; GFX1032:       ; %bb.0: ; %entry
2025; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2026; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2027; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2028; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2029; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2030; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2031; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2032; GFX1032-NEXT:  ; %bb.1:
2033; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2034; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2035; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2036; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2037; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2038; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2039; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2040; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2041; GFX1032-NEXT:    buffer_gl0_inv
2042; GFX1032-NEXT:  .LBB11_2:
2043; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2044; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2045; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2046; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2047; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2048; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2049; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2050; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2051; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2052; GFX1032-NEXT:    s_mov_b32 s2, -1
2053; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2054; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2055; GFX1032-NEXT:    s_endpgm
2056entry:
2057  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2058  store i64 %old, i64 addrspace(1)* %out
2059  ret void
2060}
2061
2062define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2063;
2064;
2065; GFX7LESS-LABEL: sub_i64_uniform:
2066; GFX7LESS:       ; %bb.0: ; %entry
2067; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2068; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2069; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2070; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2071; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2072; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2073; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2074; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
2075; GFX7LESS-NEXT:  ; %bb.1:
2076; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2077; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2078; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2080; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2081; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
2082; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2083; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
2084; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2085; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2086; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2087; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2088; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2089; GFX7LESS-NEXT:  .LBB12_2:
2090; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2091; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2092; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2093; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2094; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2095; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2096; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
2097; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
2098; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
2099; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
2100; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
2101; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
2102; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
2103; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
2104; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2105; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2106; GFX7LESS-NEXT:    s_endpgm
2107;
2108; GFX8-LABEL: sub_i64_uniform:
2109; GFX8:       ; %bb.0: ; %entry
2110; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2111; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2112; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2113; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2114; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2115; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2116; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2117; GFX8-NEXT:    s_cbranch_execz .LBB12_2
2118; GFX8-NEXT:  ; %bb.1:
2119; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
2120; GFX8-NEXT:    v_mov_b32_e32 v0, s8
2121; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2122; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
2123; GFX8-NEXT:    s_mul_i32 s6, s3, s8
2124; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2125; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
2126; GFX8-NEXT:    s_mov_b32 m0, -1
2127; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2128; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2130; GFX8-NEXT:  .LBB12_2:
2131; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2132; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2133; GFX8-NEXT:    s_mov_b32 s4, s0
2134; GFX8-NEXT:    s_mov_b32 s5, s1
2135; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
2136; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2137; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2138; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2139; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
2140; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2141; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
2142; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2143; GFX8-NEXT:    s_mov_b32 s6, -1
2144; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2145; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2146; GFX8-NEXT:    s_endpgm
2147;
2148; GFX9-LABEL: sub_i64_uniform:
2149; GFX9:       ; %bb.0: ; %entry
2150; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2151; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2152; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2153; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2154; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2155; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2156; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2157; GFX9-NEXT:    s_cbranch_execz .LBB12_2
2158; GFX9-NEXT:  ; %bb.1:
2159; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2160; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2161; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2162; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2163; GFX9-NEXT:    s_add_i32 s8, s8, s7
2164; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2165; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2166; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2167; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2168; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2169; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2170; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2171; GFX9-NEXT:  .LBB12_2:
2172; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2174; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
2175; GFX9-NEXT:    s_mov_b32 s4, s0
2176; GFX9-NEXT:    s_mov_b32 s5, s1
2177; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2178; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2179; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2180; GFX9-NEXT:    v_mov_b32_e32 v1, v4
2181; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2182; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
2183; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2184; GFX9-NEXT:    s_mov_b32 s6, -1
2185; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2186; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2187; GFX9-NEXT:    s_endpgm
2188;
2189; GFX1064-LABEL: sub_i64_uniform:
2190; GFX1064:       ; %bb.0: ; %entry
2191; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2192; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2193; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2194; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2195; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2196; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2197; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2198; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
2199; GFX1064-NEXT:  ; %bb.1:
2200; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2201; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2202; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2204; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2205; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2206; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2207; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
2208; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
2209; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2210; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2211; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2212; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX1064-NEXT:    buffer_gl0_inv
2214; GFX1064-NEXT:  .LBB12_2:
2215; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2216; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2217; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2218; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
2219; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
2220; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2221; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2222; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2223; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
2224; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
2225; GFX1064-NEXT:    s_mov_b32 s2, -1
2226; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2227; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2228; GFX1064-NEXT:    s_endpgm
2229;
2230; GFX1032-LABEL: sub_i64_uniform:
2231; GFX1032:       ; %bb.0: ; %entry
2232; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2233; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2234; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2235; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
2236; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2237; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2238; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
2239; GFX1032-NEXT:  ; %bb.1:
2240; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2241; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2242; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2244; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2245; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2246; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2247; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
2248; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
2249; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2250; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2251; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2252; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2253; GFX1032-NEXT:    buffer_gl0_inv
2254; GFX1032-NEXT:  .LBB12_2:
2255; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2256; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2257; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2258; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
2259; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
2260; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
2261; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2262; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2263; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
2264; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
2265; GFX1032-NEXT:    s_mov_b32 s2, -1
2266; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2267; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2268; GFX1032-NEXT:    s_endpgm
2269entry:
2270  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2271  store i64 %old, i64 addrspace(1)* %out
2272  ret void
2273}
2274
2275define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2276;
2277;
2278; GFX7LESS-LABEL: sub_i64_varying:
2279; GFX7LESS:       ; %bb.0: ; %entry
2280; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2281; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2282; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2284; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2285; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2286; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2287; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2288; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2289; GFX7LESS-NEXT:    s_endpgm
2290;
2291; GFX8-LABEL: sub_i64_varying:
2292; GFX8:       ; %bb.0: ; %entry
2293; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2294; GFX8-NEXT:    s_mov_b32 m0, -1
2295; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2296; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2297; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2298; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2300; GFX8-NEXT:    s_mov_b32 s2, -1
2301; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2302; GFX8-NEXT:    s_endpgm
2303;
2304; GFX9-LABEL: sub_i64_varying:
2305; GFX9:       ; %bb.0: ; %entry
2306; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2307; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2308; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2309; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2312; GFX9-NEXT:    s_mov_b32 s2, -1
2313; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2314; GFX9-NEXT:    s_endpgm
2315;
2316; GFX10-LABEL: sub_i64_varying:
2317; GFX10:       ; %bb.0: ; %entry
2318; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2319; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2320; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2321; GFX10-NEXT:    s_mov_b32 s2, -1
2322; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2323; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2324; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2325; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX10-NEXT:    buffer_gl0_inv
2327; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2328; GFX10-NEXT:    s_endpgm
2329entry:
2330  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2331  %zext = zext i32 %lane to i64
2332  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2333  store i64 %old, i64 addrspace(1)* %out
2334  ret void
2335}
2336
2337define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2338;
2339;
2340; GFX7LESS-LABEL: and_i32_varying:
2341; GFX7LESS:       ; %bb.0: ; %entry
2342; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2343; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2344; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2345; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2346; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2347; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2348; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2349; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2350; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2351; GFX7LESS-NEXT:    s_endpgm
2352;
2353; GFX8-LABEL: and_i32_varying:
2354; GFX8:       ; %bb.0: ; %entry
2355; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2356; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2357; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2358; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2359; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2360; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2361; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2362; GFX8-NEXT:    s_not_b64 exec, exec
2363; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2364; GFX8-NEXT:    s_not_b64 exec, exec
2365; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2366; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2367; GFX8-NEXT:    s_nop 1
2368; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2369; GFX8-NEXT:    s_nop 1
2370; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2371; GFX8-NEXT:    s_nop 1
2372; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2373; GFX8-NEXT:    s_nop 1
2374; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2375; GFX8-NEXT:    s_nop 1
2376; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2377; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2378; GFX8-NEXT:    s_nop 0
2379; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2380; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2381; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2382; GFX8-NEXT:    ; implicit-def: $vgpr0
2383; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2384; GFX8-NEXT:    s_cbranch_execz .LBB14_2
2385; GFX8-NEXT:  ; %bb.1:
2386; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2387; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2388; GFX8-NEXT:    s_mov_b32 m0, -1
2389; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2391; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2392; GFX8-NEXT:  .LBB14_2:
2393; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2394; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2395; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2396; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2397; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2398; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2399; GFX8-NEXT:    s_mov_b32 s2, -1
2400; GFX8-NEXT:    s_nop 0
2401; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2402; GFX8-NEXT:    s_endpgm
2403;
2404; GFX9-LABEL: and_i32_varying:
2405; GFX9:       ; %bb.0: ; %entry
2406; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2407; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2408; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2409; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2410; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2411; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2412; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2413; GFX9-NEXT:    s_not_b64 exec, exec
2414; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2415; GFX9-NEXT:    s_not_b64 exec, exec
2416; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2417; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2418; GFX9-NEXT:    s_nop 1
2419; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2420; GFX9-NEXT:    s_nop 1
2421; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2422; GFX9-NEXT:    s_nop 1
2423; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2424; GFX9-NEXT:    s_nop 1
2425; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2426; GFX9-NEXT:    s_nop 1
2427; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2428; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2429; GFX9-NEXT:    s_nop 0
2430; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2431; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2432; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2433; GFX9-NEXT:    ; implicit-def: $vgpr0
2434; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2435; GFX9-NEXT:    s_cbranch_execz .LBB14_2
2436; GFX9-NEXT:  ; %bb.1:
2437; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2438; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2439; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2440; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2441; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2442; GFX9-NEXT:  .LBB14_2:
2443; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2445; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2446; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2447; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2448; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2449; GFX9-NEXT:    s_mov_b32 s2, -1
2450; GFX9-NEXT:    s_nop 0
2451; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2452; GFX9-NEXT:    s_endpgm
2453;
2454; GFX1064-LABEL: and_i32_varying:
2455; GFX1064:       ; %bb.0: ; %entry
2456; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2457; GFX1064-NEXT:    s_not_b64 exec, exec
2458; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2459; GFX1064-NEXT:    s_not_b64 exec, exec
2460; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2461; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2462; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2463; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2464; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2465; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2466; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2467; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2468; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2469; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2470; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2471; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2472; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2473; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2474; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2475; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2476; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2477; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2478; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2479; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2480; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2481; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2482; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2483; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2484; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2485; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2486; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2487; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2488; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2489; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2490; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2491; GFX1064-NEXT:    s_mov_b32 s2, -1
2492; GFX1064-NEXT:    ; implicit-def: $vgpr0
2493; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2494; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
2495; GFX1064-NEXT:  ; %bb.1:
2496; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2497; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2498; GFX1064-NEXT:    s_mov_b32 s3, s7
2499; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2500; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2501; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
2502; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2503; GFX1064-NEXT:    buffer_gl0_inv
2504; GFX1064-NEXT:  .LBB14_2:
2505; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2506; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2507; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2508; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2509; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2510; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2511; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2513; GFX1064-NEXT:    s_endpgm
2514;
2515; GFX1032-LABEL: and_i32_varying:
2516; GFX1032:       ; %bb.0: ; %entry
2517; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2518; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2519; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2520; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2521; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2522; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2523; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2524; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2525; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2526; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2527; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2528; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2529; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2530; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2531; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2532; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2533; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2534; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2535; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2536; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2537; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2538; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2539; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2540; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2541; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2542; GFX1032-NEXT:    s_mov_b32 s2, -1
2543; GFX1032-NEXT:    ; implicit-def: $vgpr0
2544; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2545; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
2546; GFX1032-NEXT:  ; %bb.1:
2547; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2548; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2549; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2550; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2551; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
2552; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX1032-NEXT:    buffer_gl0_inv
2554; GFX1032-NEXT:  .LBB14_2:
2555; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2556; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2557; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2558; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2559; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2560; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2561; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2563; GFX1032-NEXT:    s_endpgm
2564entry:
2565  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2566  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2567  store i32 %old, i32 addrspace(1)* %out
2568  ret void
2569}
2570
2571define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2572;
2573;
2574; GFX7LESS-LABEL: or_i32_varying:
2575; GFX7LESS:       ; %bb.0: ; %entry
2576; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2577; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2578; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2579; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2580; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2581; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2583; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2584; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2585; GFX7LESS-NEXT:    s_endpgm
2586;
2587; GFX8-LABEL: or_i32_varying:
2588; GFX8:       ; %bb.0: ; %entry
2589; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2590; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2591; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2592; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2593; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2594; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2595; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2596; GFX8-NEXT:    s_not_b64 exec, exec
2597; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2598; GFX8-NEXT:    s_not_b64 exec, exec
2599; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2600; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2601; GFX8-NEXT:    s_nop 1
2602; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2603; GFX8-NEXT:    s_nop 1
2604; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2605; GFX8-NEXT:    s_nop 1
2606; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2607; GFX8-NEXT:    s_nop 1
2608; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2609; GFX8-NEXT:    s_nop 1
2610; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2611; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2612; GFX8-NEXT:    s_nop 0
2613; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2614; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2615; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2616; GFX8-NEXT:    ; implicit-def: $vgpr0
2617; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2618; GFX8-NEXT:    s_cbranch_execz .LBB15_2
2619; GFX8-NEXT:  ; %bb.1:
2620; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2621; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2622; GFX8-NEXT:    s_mov_b32 m0, -1
2623; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2624; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2625; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2626; GFX8-NEXT:  .LBB15_2:
2627; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2628; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2629; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2630; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2631; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2632; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2633; GFX8-NEXT:    s_mov_b32 s2, -1
2634; GFX8-NEXT:    s_nop 0
2635; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2636; GFX8-NEXT:    s_endpgm
2637;
2638; GFX9-LABEL: or_i32_varying:
2639; GFX9:       ; %bb.0: ; %entry
2640; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2641; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2642; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2643; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2644; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2645; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2646; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2647; GFX9-NEXT:    s_not_b64 exec, exec
2648; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2649; GFX9-NEXT:    s_not_b64 exec, exec
2650; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2651; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2652; GFX9-NEXT:    s_nop 1
2653; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2654; GFX9-NEXT:    s_nop 1
2655; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2656; GFX9-NEXT:    s_nop 1
2657; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2658; GFX9-NEXT:    s_nop 1
2659; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2660; GFX9-NEXT:    s_nop 1
2661; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2662; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2663; GFX9-NEXT:    s_nop 0
2664; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2665; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2666; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2667; GFX9-NEXT:    ; implicit-def: $vgpr0
2668; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2669; GFX9-NEXT:    s_cbranch_execz .LBB15_2
2670; GFX9-NEXT:  ; %bb.1:
2671; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2672; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2675; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2676; GFX9-NEXT:  .LBB15_2:
2677; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2678; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2679; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2680; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2681; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2682; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2683; GFX9-NEXT:    s_mov_b32 s2, -1
2684; GFX9-NEXT:    s_nop 0
2685; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2686; GFX9-NEXT:    s_endpgm
2687;
2688; GFX1064-LABEL: or_i32_varying:
2689; GFX1064:       ; %bb.0: ; %entry
2690; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2691; GFX1064-NEXT:    s_not_b64 exec, exec
2692; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2693; GFX1064-NEXT:    s_not_b64 exec, exec
2694; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2695; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2696; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2697; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2698; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2699; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2700; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2701; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2702; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2703; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2704; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2705; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2706; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2707; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2708; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2709; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2710; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2711; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2712; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2713; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2714; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2715; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2716; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2717; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2718; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2719; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2720; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2721; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2722; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2723; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2724; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2725; GFX1064-NEXT:    s_mov_b32 s2, -1
2726; GFX1064-NEXT:    ; implicit-def: $vgpr0
2727; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2728; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
2729; GFX1064-NEXT:  ; %bb.1:
2730; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2731; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2732; GFX1064-NEXT:    s_mov_b32 s3, s7
2733; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2734; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2735; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
2736; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2737; GFX1064-NEXT:    buffer_gl0_inv
2738; GFX1064-NEXT:  .LBB15_2:
2739; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2740; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2741; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2742; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2743; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2744; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2745; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2746; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2747; GFX1064-NEXT:    s_endpgm
2748;
2749; GFX1032-LABEL: or_i32_varying:
2750; GFX1032:       ; %bb.0: ; %entry
2751; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2752; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2753; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2754; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2755; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2756; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2757; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2758; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2759; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2760; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2761; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2762; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2763; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2764; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2765; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2766; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2767; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2768; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2769; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2770; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2771; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2772; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2773; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2774; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2775; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2776; GFX1032-NEXT:    s_mov_b32 s2, -1
2777; GFX1032-NEXT:    ; implicit-def: $vgpr0
2778; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2779; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
2780; GFX1032-NEXT:  ; %bb.1:
2781; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2782; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2783; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2784; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2785; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
2786; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2787; GFX1032-NEXT:    buffer_gl0_inv
2788; GFX1032-NEXT:  .LBB15_2:
2789; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2790; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2791; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2792; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2793; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
2794; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2795; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2796; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2797; GFX1032-NEXT:    s_endpgm
2798entry:
2799  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2800  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2801  store i32 %old, i32 addrspace(1)* %out
2802  ret void
2803}
2804
2805define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
2806;
2807;
2808; GFX7LESS-LABEL: xor_i32_varying:
2809; GFX7LESS:       ; %bb.0: ; %entry
2810; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2811; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2812; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2813; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2814; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
2815; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2816; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2817; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2818; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2819; GFX7LESS-NEXT:    s_endpgm
2820;
2821; GFX8-LABEL: xor_i32_varying:
2822; GFX8:       ; %bb.0: ; %entry
2823; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2824; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2825; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2826; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2827; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2828; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2829; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2830; GFX8-NEXT:    s_not_b64 exec, exec
2831; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2832; GFX8-NEXT:    s_not_b64 exec, exec
2833; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2834; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2835; GFX8-NEXT:    s_nop 1
2836; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2837; GFX8-NEXT:    s_nop 1
2838; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2839; GFX8-NEXT:    s_nop 1
2840; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2841; GFX8-NEXT:    s_nop 1
2842; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2843; GFX8-NEXT:    s_nop 1
2844; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2845; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2846; GFX8-NEXT:    s_nop 0
2847; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2848; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2849; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2850; GFX8-NEXT:    ; implicit-def: $vgpr0
2851; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2852; GFX8-NEXT:    s_cbranch_execz .LBB16_2
2853; GFX8-NEXT:  ; %bb.1:
2854; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2855; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2856; GFX8-NEXT:    s_mov_b32 m0, -1
2857; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2858; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2859; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2860; GFX8-NEXT:  .LBB16_2:
2861; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2862; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2864; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2865; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
2866; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2867; GFX8-NEXT:    s_mov_b32 s2, -1
2868; GFX8-NEXT:    s_nop 0
2869; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2870; GFX8-NEXT:    s_endpgm
2871;
2872; GFX9-LABEL: xor_i32_varying:
2873; GFX9:       ; %bb.0: ; %entry
2874; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2875; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2876; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2877; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2878; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2879; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2880; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2881; GFX9-NEXT:    s_not_b64 exec, exec
2882; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2883; GFX9-NEXT:    s_not_b64 exec, exec
2884; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2885; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2886; GFX9-NEXT:    s_nop 1
2887; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2888; GFX9-NEXT:    s_nop 1
2889; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2890; GFX9-NEXT:    s_nop 1
2891; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2892; GFX9-NEXT:    s_nop 1
2893; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2894; GFX9-NEXT:    s_nop 1
2895; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2896; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2897; GFX9-NEXT:    s_nop 0
2898; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2899; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2900; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2901; GFX9-NEXT:    ; implicit-def: $vgpr0
2902; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2903; GFX9-NEXT:    s_cbranch_execz .LBB16_2
2904; GFX9-NEXT:  ; %bb.1:
2905; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2906; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2908; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2909; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX9-NEXT:  .LBB16_2:
2911; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2912; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2913; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2914; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2915; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2916; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2917; GFX9-NEXT:    s_mov_b32 s2, -1
2918; GFX9-NEXT:    s_nop 0
2919; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2920; GFX9-NEXT:    s_endpgm
2921;
2922; GFX1064-LABEL: xor_i32_varying:
2923; GFX1064:       ; %bb.0: ; %entry
2924; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2925; GFX1064-NEXT:    s_not_b64 exec, exec
2926; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2927; GFX1064-NEXT:    s_not_b64 exec, exec
2928; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2929; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2930; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2931; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2932; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2933; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2934; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2935; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2936; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2937; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2938; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2939; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2940; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2941; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2942; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2943; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2944; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2945; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2946; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2947; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2948; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2949; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2950; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2951; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2952; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2953; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2954; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2955; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2956; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2957; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2958; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2959; GFX1064-NEXT:    s_mov_b32 s2, -1
2960; GFX1064-NEXT:    ; implicit-def: $vgpr0
2961; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2962; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
2963; GFX1064-NEXT:  ; %bb.1:
2964; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2965; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2966; GFX1064-NEXT:    s_mov_b32 s3, s7
2967; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2968; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2969; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
2970; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2971; GFX1064-NEXT:    buffer_gl0_inv
2972; GFX1064-NEXT:  .LBB16_2:
2973; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2974; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2975; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2976; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2977; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
2978; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2979; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2981; GFX1064-NEXT:    s_endpgm
2982;
2983; GFX1032-LABEL: xor_i32_varying:
2984; GFX1032:       ; %bb.0: ; %entry
2985; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2986; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2987; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2988; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2989; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2990; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2991; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2992; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2993; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2994; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2995; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2996; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2997; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2998; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2999; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3000; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3001; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3002; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3003; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3004; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3005; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3006; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3007; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3008; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3009; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3010; GFX1032-NEXT:    s_mov_b32 s2, -1
3011; GFX1032-NEXT:    ; implicit-def: $vgpr0
3012; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3013; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
3014; GFX1032-NEXT:  ; %bb.1:
3015; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3016; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3017; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3018; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3019; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
3020; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3021; GFX1032-NEXT:    buffer_gl0_inv
3022; GFX1032-NEXT:  .LBB16_2:
3023; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3024; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3025; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3026; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3027; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3028; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3029; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3030; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3031; GFX1032-NEXT:    s_endpgm
3032entry:
3033  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3034  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3035  store i32 %old, i32 addrspace(1)* %out
3036  ret void
3037}
3038
3039define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3040;
3041;
3042; GFX7LESS-LABEL: max_i32_varying:
3043; GFX7LESS:       ; %bb.0: ; %entry
3044; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3045; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3046; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3047; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3048; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3049; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3051; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3052; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3053; GFX7LESS-NEXT:    s_endpgm
3054;
3055; GFX8-LABEL: max_i32_varying:
3056; GFX8:       ; %bb.0: ; %entry
3057; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3058; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3059; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3060; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3061; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3062; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3063; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3064; GFX8-NEXT:    s_not_b64 exec, exec
3065; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
3066; GFX8-NEXT:    s_not_b64 exec, exec
3067; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3068; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3069; GFX8-NEXT:    s_nop 1
3070; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3071; GFX8-NEXT:    s_nop 1
3072; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3073; GFX8-NEXT:    s_nop 1
3074; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3075; GFX8-NEXT:    s_nop 1
3076; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3077; GFX8-NEXT:    s_nop 1
3078; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3079; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3080; GFX8-NEXT:    s_nop 0
3081; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3082; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3083; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3084; GFX8-NEXT:    ; implicit-def: $vgpr0
3085; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3086; GFX8-NEXT:    s_cbranch_execz .LBB17_2
3087; GFX8-NEXT:  ; %bb.1:
3088; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3089; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3090; GFX8-NEXT:    s_mov_b32 m0, -1
3091; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3092; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3093; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3094; GFX8-NEXT:  .LBB17_2:
3095; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3096; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3097; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3098; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3099; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3100; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3101; GFX8-NEXT:    s_mov_b32 s2, -1
3102; GFX8-NEXT:    s_nop 0
3103; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3104; GFX8-NEXT:    s_endpgm
3105;
3106; GFX9-LABEL: max_i32_varying:
3107; GFX9:       ; %bb.0: ; %entry
3108; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3109; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3110; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3111; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3112; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3113; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3114; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3115; GFX9-NEXT:    s_not_b64 exec, exec
3116; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
3117; GFX9-NEXT:    s_not_b64 exec, exec
3118; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3119; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3120; GFX9-NEXT:    s_nop 1
3121; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3122; GFX9-NEXT:    s_nop 1
3123; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3124; GFX9-NEXT:    s_nop 1
3125; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3126; GFX9-NEXT:    s_nop 1
3127; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3128; GFX9-NEXT:    s_nop 1
3129; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3130; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3131; GFX9-NEXT:    s_nop 0
3132; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3133; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3134; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3135; GFX9-NEXT:    ; implicit-def: $vgpr0
3136; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3137; GFX9-NEXT:    s_cbranch_execz .LBB17_2
3138; GFX9-NEXT:  ; %bb.1:
3139; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3140; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3141; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3143; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX9-NEXT:  .LBB17_2:
3145; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3146; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3147; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3148; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3149; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3150; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3151; GFX9-NEXT:    s_mov_b32 s2, -1
3152; GFX9-NEXT:    s_nop 0
3153; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3154; GFX9-NEXT:    s_endpgm
3155;
3156; GFX1064-LABEL: max_i32_varying:
3157; GFX1064:       ; %bb.0: ; %entry
3158; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3159; GFX1064-NEXT:    s_not_b64 exec, exec
3160; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3161; GFX1064-NEXT:    s_not_b64 exec, exec
3162; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3163; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3164; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
3165; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3166; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3167; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3168; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3169; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3170; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3171; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3172; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3173; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3174; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3175; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3176; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3177; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3178; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3179; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3180; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3181; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3182; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3183; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3184; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3185; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3186; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3187; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3188; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3189; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3190; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3191; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3192; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3193; GFX1064-NEXT:    s_mov_b32 s2, -1
3194; GFX1064-NEXT:    ; implicit-def: $vgpr0
3195; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3196; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
3197; GFX1064-NEXT:  ; %bb.1:
3198; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3199; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3200; GFX1064-NEXT:    s_mov_b32 s3, s7
3201; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3202; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3203; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
3204; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3205; GFX1064-NEXT:    buffer_gl0_inv
3206; GFX1064-NEXT:  .LBB17_2:
3207; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3208; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3209; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3210; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3211; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3212; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3213; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3214; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3215; GFX1064-NEXT:    s_endpgm
3216;
3217; GFX1032-LABEL: max_i32_varying:
3218; GFX1032:       ; %bb.0: ; %entry
3219; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3220; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3221; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3222; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3223; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3224; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3225; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3226; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3227; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3228; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3229; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3230; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3231; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3232; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3233; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3234; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
3235; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3236; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3237; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3238; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3239; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3240; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3241; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3242; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3243; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3244; GFX1032-NEXT:    s_mov_b32 s2, -1
3245; GFX1032-NEXT:    ; implicit-def: $vgpr0
3246; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3247; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
3248; GFX1032-NEXT:  ; %bb.1:
3249; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3250; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3251; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3252; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3253; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
3254; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3255; GFX1032-NEXT:    buffer_gl0_inv
3256; GFX1032-NEXT:  .LBB17_2:
3257; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3258; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3259; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3260; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3261; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3262; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3263; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3264; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3265; GFX1032-NEXT:    s_endpgm
3266entry:
3267  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3268  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3269  store i32 %old, i32 addrspace(1)* %out
3270  ret void
3271}
3272
3273define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3274;
3275;
3276; GFX7LESS-LABEL: max_i64_constant:
3277; GFX7LESS:       ; %bb.0: ; %entry
3278; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3279; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3280; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3281; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3282; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3283; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3284; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
3285; GFX7LESS-NEXT:  ; %bb.1:
3286; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
3287; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3288; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3289; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3290; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3291; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3292; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX7LESS-NEXT:  .LBB18_2:
3294; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3295; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3296; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3297; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3298; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3299; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3300; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3301; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3302; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3303; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3304; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3305; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3306; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3307; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3308; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3309; GFX7LESS-NEXT:    s_endpgm
3310;
3311; GFX8-LABEL: max_i64_constant:
3312; GFX8:       ; %bb.0: ; %entry
3313; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3314; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3315; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3316; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3317; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3318; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3319; GFX8-NEXT:    s_cbranch_execz .LBB18_2
3320; GFX8-NEXT:  ; %bb.1:
3321; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3322; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3323; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3324; GFX8-NEXT:    s_mov_b32 m0, -1
3325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3326; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3327; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3328; GFX8-NEXT:  .LBB18_2:
3329; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3331; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3332; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3333; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3334; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3335; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3336; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3337; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3338; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3339; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3340; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3341; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3342; GFX8-NEXT:    s_mov_b32 s2, -1
3343; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3344; GFX8-NEXT:    s_endpgm
3345;
3346; GFX9-LABEL: max_i64_constant:
3347; GFX9:       ; %bb.0: ; %entry
3348; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3349; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3350; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3351; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3352; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3353; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3354; GFX9-NEXT:    s_cbranch_execz .LBB18_2
3355; GFX9-NEXT:  ; %bb.1:
3356; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3357; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3358; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3360; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3362; GFX9-NEXT:  .LBB18_2:
3363; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3365; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3366; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3367; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3368; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3369; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3370; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3371; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3372; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3373; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3374; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3375; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3376; GFX9-NEXT:    s_mov_b32 s2, -1
3377; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3378; GFX9-NEXT:    s_endpgm
3379;
3380; GFX1064-LABEL: max_i64_constant:
3381; GFX1064:       ; %bb.0: ; %entry
3382; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3383; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3384; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3385; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3386; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3387; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3388; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
3389; GFX1064-NEXT:  ; %bb.1:
3390; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3391; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3392; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3393; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3394; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3395; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3396; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3397; GFX1064-NEXT:    buffer_gl0_inv
3398; GFX1064-NEXT:  .LBB18_2:
3399; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3400; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3401; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3402; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3403; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3404; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3405; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3406; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3407; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3408; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3409; GFX1064-NEXT:    s_mov_b32 s2, -1
3410; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3411; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3412; GFX1064-NEXT:    s_endpgm
3413;
3414; GFX1032-LABEL: max_i64_constant:
3415; GFX1032:       ; %bb.0: ; %entry
3416; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3417; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3418; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3419; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3420; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3421; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
3422; GFX1032-NEXT:  ; %bb.1:
3423; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3424; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3425; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3426; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3427; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3428; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3429; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3430; GFX1032-NEXT:    buffer_gl0_inv
3431; GFX1032-NEXT:  .LBB18_2:
3432; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3433; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3434; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3435; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3436; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3437; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3438; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3439; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3440; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3441; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3442; GFX1032-NEXT:    s_mov_b32 s2, -1
3443; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3444; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3445; GFX1032-NEXT:    s_endpgm
3446entry:
3447  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3448  store i64 %old, i64 addrspace(1)* %out
3449  ret void
3450}
3451
3452define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3453;
3454;
3455; GFX7LESS-LABEL: min_i32_varying:
3456; GFX7LESS:       ; %bb.0: ; %entry
3457; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3458; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3459; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3460; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3461; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3462; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3464; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3465; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3466; GFX7LESS-NEXT:    s_endpgm
3467;
3468; GFX8-LABEL: min_i32_varying:
3469; GFX8:       ; %bb.0: ; %entry
3470; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3471; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3472; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3473; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3474; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3475; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3476; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3477; GFX8-NEXT:    s_not_b64 exec, exec
3478; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
3479; GFX8-NEXT:    s_not_b64 exec, exec
3480; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3481; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3482; GFX8-NEXT:    s_nop 1
3483; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3484; GFX8-NEXT:    s_nop 1
3485; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3486; GFX8-NEXT:    s_nop 1
3487; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3488; GFX8-NEXT:    s_nop 1
3489; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3490; GFX8-NEXT:    s_nop 1
3491; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3492; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3493; GFX8-NEXT:    s_nop 0
3494; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3495; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3496; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3497; GFX8-NEXT:    ; implicit-def: $vgpr0
3498; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3499; GFX8-NEXT:    s_cbranch_execz .LBB19_2
3500; GFX8-NEXT:  ; %bb.1:
3501; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3502; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3503; GFX8-NEXT:    s_mov_b32 m0, -1
3504; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3505; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3506; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3507; GFX8-NEXT:  .LBB19_2:
3508; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3509; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3510; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3511; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3512; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3513; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3514; GFX8-NEXT:    s_mov_b32 s2, -1
3515; GFX8-NEXT:    s_nop 0
3516; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3517; GFX8-NEXT:    s_endpgm
3518;
3519; GFX9-LABEL: min_i32_varying:
3520; GFX9:       ; %bb.0: ; %entry
3521; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3522; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3523; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3524; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3525; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3526; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3527; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3528; GFX9-NEXT:    s_not_b64 exec, exec
3529; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
3530; GFX9-NEXT:    s_not_b64 exec, exec
3531; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3532; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3533; GFX9-NEXT:    s_nop 1
3534; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3535; GFX9-NEXT:    s_nop 1
3536; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3537; GFX9-NEXT:    s_nop 1
3538; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3539; GFX9-NEXT:    s_nop 1
3540; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3541; GFX9-NEXT:    s_nop 1
3542; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3543; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3544; GFX9-NEXT:    s_nop 0
3545; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3546; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3547; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3548; GFX9-NEXT:    ; implicit-def: $vgpr0
3549; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3550; GFX9-NEXT:    s_cbranch_execz .LBB19_2
3551; GFX9-NEXT:  ; %bb.1:
3552; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3553; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3554; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3555; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3557; GFX9-NEXT:  .LBB19_2:
3558; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3559; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3560; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3561; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3562; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3563; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3564; GFX9-NEXT:    s_mov_b32 s2, -1
3565; GFX9-NEXT:    s_nop 0
3566; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3567; GFX9-NEXT:    s_endpgm
3568;
3569; GFX1064-LABEL: min_i32_varying:
3570; GFX1064:       ; %bb.0: ; %entry
3571; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3572; GFX1064-NEXT:    s_not_b64 exec, exec
3573; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3574; GFX1064-NEXT:    s_not_b64 exec, exec
3575; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3576; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3577; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
3578; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3579; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3580; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3581; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3582; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3583; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3584; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3585; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3586; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3587; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3588; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3589; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3590; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3591; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3592; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3593; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3594; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3595; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3596; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3597; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3598; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3599; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3600; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3601; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3602; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3603; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3604; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3605; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3606; GFX1064-NEXT:    s_mov_b32 s2, -1
3607; GFX1064-NEXT:    ; implicit-def: $vgpr0
3608; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3609; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
3610; GFX1064-NEXT:  ; %bb.1:
3611; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3612; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3613; GFX1064-NEXT:    s_mov_b32 s3, s7
3614; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3615; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3616; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
3617; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3618; GFX1064-NEXT:    buffer_gl0_inv
3619; GFX1064-NEXT:  .LBB19_2:
3620; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3621; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3622; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3623; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3624; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3625; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3626; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3627; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3628; GFX1064-NEXT:    s_endpgm
3629;
3630; GFX1032-LABEL: min_i32_varying:
3631; GFX1032:       ; %bb.0: ; %entry
3632; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3633; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3634; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3635; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3636; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3637; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3638; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3639; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3640; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3641; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3642; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3643; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3644; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3645; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3646; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3647; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
3648; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3649; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3650; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3651; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3652; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3653; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3654; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3655; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3656; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3657; GFX1032-NEXT:    s_mov_b32 s2, -1
3658; GFX1032-NEXT:    ; implicit-def: $vgpr0
3659; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3660; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
3661; GFX1032-NEXT:  ; %bb.1:
3662; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3663; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3664; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3665; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3666; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
3667; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3668; GFX1032-NEXT:    buffer_gl0_inv
3669; GFX1032-NEXT:  .LBB19_2:
3670; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3671; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3672; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3673; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3674; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3675; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3676; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3677; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3678; GFX1032-NEXT:    s_endpgm
3679entry:
3680  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3681  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3682  store i32 %old, i32 addrspace(1)* %out
3683  ret void
3684}
3685
3686define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3687;
3688;
3689; GFX7LESS-LABEL: min_i64_constant:
3690; GFX7LESS:       ; %bb.0: ; %entry
3691; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3692; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3693; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3694; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3695; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3696; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3697; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
3698; GFX7LESS-NEXT:  ; %bb.1:
3699; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
3700; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3701; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3702; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3703; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3704; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3705; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3706; GFX7LESS-NEXT:  .LBB20_2:
3707; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3708; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3709; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3710; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3711; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3712; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3713; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3714; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3715; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3716; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3717; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3718; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3719; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3720; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3721; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3722; GFX7LESS-NEXT:    s_endpgm
3723;
3724; GFX8-LABEL: min_i64_constant:
3725; GFX8:       ; %bb.0: ; %entry
3726; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3727; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3728; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3729; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3730; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3731; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3732; GFX8-NEXT:    s_cbranch_execz .LBB20_2
3733; GFX8-NEXT:  ; %bb.1:
3734; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3735; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3736; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3737; GFX8-NEXT:    s_mov_b32 m0, -1
3738; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3739; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3740; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3741; GFX8-NEXT:  .LBB20_2:
3742; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3743; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3744; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3745; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
3746; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3747; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3748; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3749; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3750; GFX8-NEXT:    v_mov_b32_e32 v2, s5
3751; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3752; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3753; GFX8-NEXT:    s_mov_b32 s2, -1
3754; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3755; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3756; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3757; GFX8-NEXT:    s_endpgm
3758;
3759; GFX9-LABEL: min_i64_constant:
3760; GFX9:       ; %bb.0: ; %entry
3761; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3762; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3763; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3764; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3765; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3766; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3767; GFX9-NEXT:    s_cbranch_execz .LBB20_2
3768; GFX9-NEXT:  ; %bb.1:
3769; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3770; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3771; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3772; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3773; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3774; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3775; GFX9-NEXT:  .LBB20_2:
3776; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3777; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3778; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
3779; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
3780; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
3781; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3782; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3783; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3784; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3785; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3786; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3787; GFX9-NEXT:    s_mov_b32 s2, -1
3788; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3789; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3790; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3791; GFX9-NEXT:    s_endpgm
3792;
3793; GFX1064-LABEL: min_i64_constant:
3794; GFX1064:       ; %bb.0: ; %entry
3795; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3796; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3797; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3798; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3799; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3800; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3801; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
3802; GFX1064-NEXT:  ; %bb.1:
3803; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3804; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3805; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3806; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3807; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3808; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3809; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3810; GFX1064-NEXT:    buffer_gl0_inv
3811; GFX1064-NEXT:  .LBB20_2:
3812; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3813; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3814; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3815; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3816; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
3817; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3818; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3819; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3820; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3821; GFX1064-NEXT:    s_mov_b32 s2, -1
3822; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3823; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3824; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3825; GFX1064-NEXT:    s_endpgm
3826;
3827; GFX1032-LABEL: min_i64_constant:
3828; GFX1032:       ; %bb.0: ; %entry
3829; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3830; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3831; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3832; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3833; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3834; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
3835; GFX1032-NEXT:  ; %bb.1:
3836; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3837; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3838; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3839; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3840; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3841; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3842; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3843; GFX1032-NEXT:    buffer_gl0_inv
3844; GFX1032-NEXT:  .LBB20_2:
3845; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3846; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3847; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3848; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3849; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
3850; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
3851; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
3852; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3853; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3854; GFX1032-NEXT:    s_mov_b32 s2, -1
3855; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3856; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3857; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3858; GFX1032-NEXT:    s_endpgm
3859entry:
3860  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
3861  store i64 %old, i64 addrspace(1)* %out
3862  ret void
3863}
3864
3865define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
3866;
3867;
3868; GFX7LESS-LABEL: umax_i32_varying:
3869; GFX7LESS:       ; %bb.0: ; %entry
3870; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3871; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3872; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3873; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3874; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
3875; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3876; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3877; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3878; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3879; GFX7LESS-NEXT:    s_endpgm
3880;
3881; GFX8-LABEL: umax_i32_varying:
3882; GFX8:       ; %bb.0: ; %entry
3883; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3884; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3885; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3886; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3887; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3888; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3889; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3890; GFX8-NEXT:    s_not_b64 exec, exec
3891; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3892; GFX8-NEXT:    s_not_b64 exec, exec
3893; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3894; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3895; GFX8-NEXT:    s_nop 1
3896; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3897; GFX8-NEXT:    s_nop 1
3898; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3899; GFX8-NEXT:    s_nop 1
3900; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3901; GFX8-NEXT:    s_nop 1
3902; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3903; GFX8-NEXT:    s_nop 1
3904; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3905; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3906; GFX8-NEXT:    s_nop 0
3907; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3908; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3909; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3910; GFX8-NEXT:    ; implicit-def: $vgpr0
3911; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3912; GFX8-NEXT:    s_cbranch_execz .LBB21_2
3913; GFX8-NEXT:  ; %bb.1:
3914; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3915; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3916; GFX8-NEXT:    s_mov_b32 m0, -1
3917; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3918; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
3919; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3920; GFX8-NEXT:  .LBB21_2:
3921; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3922; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3923; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3924; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3925; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
3926; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3927; GFX8-NEXT:    s_mov_b32 s2, -1
3928; GFX8-NEXT:    s_nop 0
3929; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3930; GFX8-NEXT:    s_endpgm
3931;
3932; GFX9-LABEL: umax_i32_varying:
3933; GFX9:       ; %bb.0: ; %entry
3934; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3935; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3936; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3937; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3938; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3939; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3940; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3941; GFX9-NEXT:    s_not_b64 exec, exec
3942; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3943; GFX9-NEXT:    s_not_b64 exec, exec
3944; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3945; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3946; GFX9-NEXT:    s_nop 1
3947; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3948; GFX9-NEXT:    s_nop 1
3949; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3950; GFX9-NEXT:    s_nop 1
3951; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3952; GFX9-NEXT:    s_nop 1
3953; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3954; GFX9-NEXT:    s_nop 1
3955; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3956; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3957; GFX9-NEXT:    s_nop 0
3958; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3959; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3960; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3961; GFX9-NEXT:    ; implicit-def: $vgpr0
3962; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3963; GFX9-NEXT:    s_cbranch_execz .LBB21_2
3964; GFX9-NEXT:  ; %bb.1:
3965; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3966; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3967; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3968; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
3969; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3970; GFX9-NEXT:  .LBB21_2:
3971; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3972; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3973; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3974; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3975; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
3976; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3977; GFX9-NEXT:    s_mov_b32 s2, -1
3978; GFX9-NEXT:    s_nop 0
3979; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3980; GFX9-NEXT:    s_endpgm
3981;
3982; GFX1064-LABEL: umax_i32_varying:
3983; GFX1064:       ; %bb.0: ; %entry
3984; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3985; GFX1064-NEXT:    s_not_b64 exec, exec
3986; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3987; GFX1064-NEXT:    s_not_b64 exec, exec
3988; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3989; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3990; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3991; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3992; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3993; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3994; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3995; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3996; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3997; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3998; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3999; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4000; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4001; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4002; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4003; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4004; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4005; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4006; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4007; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4008; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4009; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4010; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4011; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4012; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4013; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4014; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4015; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4016; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4017; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4018; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4019; GFX1064-NEXT:    s_mov_b32 s2, -1
4020; GFX1064-NEXT:    ; implicit-def: $vgpr0
4021; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4022; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
4023; GFX1064-NEXT:  ; %bb.1:
4024; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4025; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4026; GFX1064-NEXT:    s_mov_b32 s3, s7
4027; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4028; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4029; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
4030; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4031; GFX1064-NEXT:    buffer_gl0_inv
4032; GFX1064-NEXT:  .LBB21_2:
4033; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4034; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4035; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4036; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4037; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4038; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4039; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4040; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4041; GFX1064-NEXT:    s_endpgm
4042;
4043; GFX1032-LABEL: umax_i32_varying:
4044; GFX1032:       ; %bb.0: ; %entry
4045; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4046; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4047; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4048; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4049; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4050; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4051; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4052; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4053; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4054; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4055; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4056; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4057; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4058; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4059; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4060; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4061; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4062; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4063; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4064; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4065; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4066; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4067; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4068; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4069; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4070; GFX1032-NEXT:    s_mov_b32 s2, -1
4071; GFX1032-NEXT:    ; implicit-def: $vgpr0
4072; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4073; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
4074; GFX1032-NEXT:  ; %bb.1:
4075; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4076; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4077; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4078; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4079; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
4080; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4081; GFX1032-NEXT:    buffer_gl0_inv
4082; GFX1032-NEXT:  .LBB21_2:
4083; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4084; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4085; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4086; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4087; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4088; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4089; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4090; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4091; GFX1032-NEXT:    s_endpgm
4092entry:
4093  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4094  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4095  store i32 %old, i32 addrspace(1)* %out
4096  ret void
4097}
4098
4099define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4100;
4101;
4102; GFX7LESS-LABEL: umax_i64_constant:
4103; GFX7LESS:       ; %bb.0: ; %entry
4104; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4105; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4106; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4107; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4108; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4109; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4110; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
4111; GFX7LESS-NEXT:  ; %bb.1:
4112; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4113; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4114; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4115; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4116; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4117; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4118; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4119; GFX7LESS-NEXT:  .LBB22_2:
4120; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4121; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4122; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4123; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4124; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4125; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4126; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4127; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4128; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4129; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4130; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4131; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4132; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4133; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4134; GFX7LESS-NEXT:    s_endpgm
4135;
4136; GFX8-LABEL: umax_i64_constant:
4137; GFX8:       ; %bb.0: ; %entry
4138; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4139; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4140; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4141; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4142; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4143; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4144; GFX8-NEXT:    s_cbranch_execz .LBB22_2
4145; GFX8-NEXT:  ; %bb.1:
4146; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4147; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4148; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4149; GFX8-NEXT:    s_mov_b32 m0, -1
4150; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4151; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4152; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4153; GFX8-NEXT:  .LBB22_2:
4154; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4155; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4156; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4157; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4158; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4159; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4160; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4161; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4162; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4163; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4164; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4165; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4166; GFX8-NEXT:    s_mov_b32 s2, -1
4167; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4168; GFX8-NEXT:    s_endpgm
4169;
4170; GFX9-LABEL: umax_i64_constant:
4171; GFX9:       ; %bb.0: ; %entry
4172; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4173; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4174; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4175; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4176; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4177; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4178; GFX9-NEXT:    s_cbranch_execz .LBB22_2
4179; GFX9-NEXT:  ; %bb.1:
4180; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4181; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4182; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4184; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4186; GFX9-NEXT:  .LBB22_2:
4187; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4188; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4189; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4190; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4191; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4192; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4193; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4194; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4195; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4196; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4197; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4198; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4199; GFX9-NEXT:    s_mov_b32 s2, -1
4200; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4201; GFX9-NEXT:    s_endpgm
4202;
4203; GFX1064-LABEL: umax_i64_constant:
4204; GFX1064:       ; %bb.0: ; %entry
4205; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4206; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4207; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4208; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4209; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4210; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4211; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
4212; GFX1064-NEXT:  ; %bb.1:
4213; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4214; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4215; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4216; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4217; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4218; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4219; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4220; GFX1064-NEXT:    buffer_gl0_inv
4221; GFX1064-NEXT:  .LBB22_2:
4222; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4223; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4224; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4225; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4226; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4227; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4228; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4229; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4230; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4231; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4232; GFX1064-NEXT:    s_mov_b32 s2, -1
4233; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4234; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4235; GFX1064-NEXT:    s_endpgm
4236;
4237; GFX1032-LABEL: umax_i64_constant:
4238; GFX1032:       ; %bb.0: ; %entry
4239; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4240; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4241; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4242; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4243; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4244; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
4245; GFX1032-NEXT:  ; %bb.1:
4246; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4247; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4248; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4249; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4250; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4251; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4252; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4253; GFX1032-NEXT:    buffer_gl0_inv
4254; GFX1032-NEXT:  .LBB22_2:
4255; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4256; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4257; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4258; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4259; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4260; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4261; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4262; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4263; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4264; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4265; GFX1032-NEXT:    s_mov_b32 s2, -1
4266; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4267; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4268; GFX1032-NEXT:    s_endpgm
4269entry:
4270  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4271  store i64 %old, i64 addrspace(1)* %out
4272  ret void
4273}
4274
4275define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4276;
4277;
4278; GFX7LESS-LABEL: umin_i32_varying:
4279; GFX7LESS:       ; %bb.0: ; %entry
4280; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4281; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4282; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4284; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4285; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4286; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4287; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4288; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4289; GFX7LESS-NEXT:    s_endpgm
4290;
4291; GFX8-LABEL: umin_i32_varying:
4292; GFX8:       ; %bb.0: ; %entry
4293; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4294; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4295; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4296; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4297; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4298; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4299; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4300; GFX8-NEXT:    s_not_b64 exec, exec
4301; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4302; GFX8-NEXT:    s_not_b64 exec, exec
4303; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4304; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4305; GFX8-NEXT:    s_nop 1
4306; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4307; GFX8-NEXT:    s_nop 1
4308; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4309; GFX8-NEXT:    s_nop 1
4310; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4311; GFX8-NEXT:    s_nop 1
4312; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4313; GFX8-NEXT:    s_nop 1
4314; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4315; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4316; GFX8-NEXT:    s_nop 0
4317; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4318; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4319; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4320; GFX8-NEXT:    ; implicit-def: $vgpr0
4321; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4322; GFX8-NEXT:    s_cbranch_execz .LBB23_2
4323; GFX8-NEXT:  ; %bb.1:
4324; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4325; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4326; GFX8-NEXT:    s_mov_b32 m0, -1
4327; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4328; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4329; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4330; GFX8-NEXT:  .LBB23_2:
4331; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4332; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4334; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4335; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4336; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4337; GFX8-NEXT:    s_mov_b32 s2, -1
4338; GFX8-NEXT:    s_nop 0
4339; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4340; GFX8-NEXT:    s_endpgm
4341;
4342; GFX9-LABEL: umin_i32_varying:
4343; GFX9:       ; %bb.0: ; %entry
4344; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4345; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4346; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4347; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4348; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4349; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4350; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4351; GFX9-NEXT:    s_not_b64 exec, exec
4352; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4353; GFX9-NEXT:    s_not_b64 exec, exec
4354; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4355; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4356; GFX9-NEXT:    s_nop 1
4357; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4358; GFX9-NEXT:    s_nop 1
4359; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4360; GFX9-NEXT:    s_nop 1
4361; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4362; GFX9-NEXT:    s_nop 1
4363; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4364; GFX9-NEXT:    s_nop 1
4365; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4366; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4367; GFX9-NEXT:    s_nop 0
4368; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4369; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4370; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4371; GFX9-NEXT:    ; implicit-def: $vgpr0
4372; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4373; GFX9-NEXT:    s_cbranch_execz .LBB23_2
4374; GFX9-NEXT:  ; %bb.1:
4375; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4376; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4377; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4378; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4379; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4380; GFX9-NEXT:  .LBB23_2:
4381; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4383; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4384; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4385; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4386; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4387; GFX9-NEXT:    s_mov_b32 s2, -1
4388; GFX9-NEXT:    s_nop 0
4389; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4390; GFX9-NEXT:    s_endpgm
4391;
4392; GFX1064-LABEL: umin_i32_varying:
4393; GFX1064:       ; %bb.0: ; %entry
4394; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4395; GFX1064-NEXT:    s_not_b64 exec, exec
4396; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4397; GFX1064-NEXT:    s_not_b64 exec, exec
4398; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4399; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4400; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4401; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4402; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4403; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4404; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4405; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4406; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4407; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4408; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4409; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4410; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4411; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4412; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4413; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4414; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4415; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4416; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4417; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4418; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4419; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4420; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4421; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4422; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4423; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4424; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4425; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4426; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4427; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4428; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4429; GFX1064-NEXT:    s_mov_b32 s2, -1
4430; GFX1064-NEXT:    ; implicit-def: $vgpr0
4431; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4432; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
4433; GFX1064-NEXT:  ; %bb.1:
4434; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4435; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4436; GFX1064-NEXT:    s_mov_b32 s3, s7
4437; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4438; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4439; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
4440; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4441; GFX1064-NEXT:    buffer_gl0_inv
4442; GFX1064-NEXT:  .LBB23_2:
4443; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4444; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4445; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4446; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4447; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4448; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4449; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4450; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4451; GFX1064-NEXT:    s_endpgm
4452;
4453; GFX1032-LABEL: umin_i32_varying:
4454; GFX1032:       ; %bb.0: ; %entry
4455; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4456; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4457; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4458; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4459; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4460; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4461; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4462; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4463; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4464; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4465; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4466; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4467; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4468; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4469; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4470; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4471; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4472; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4473; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4474; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4475; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4476; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4477; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4478; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4479; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4480; GFX1032-NEXT:    s_mov_b32 s2, -1
4481; GFX1032-NEXT:    ; implicit-def: $vgpr0
4482; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4483; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
4484; GFX1032-NEXT:  ; %bb.1:
4485; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4486; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4487; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4488; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4489; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
4490; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4491; GFX1032-NEXT:    buffer_gl0_inv
4492; GFX1032-NEXT:  .LBB23_2:
4493; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4494; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4495; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4496; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4497; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4498; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4499; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4500; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4501; GFX1032-NEXT:    s_endpgm
4502entry:
4503  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4504  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4505  store i32 %old, i32 addrspace(1)* %out
4506  ret void
4507}
4508
4509define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4510;
4511;
4512; GFX7LESS-LABEL: umin_i64_constant:
4513; GFX7LESS:       ; %bb.0: ; %entry
4514; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4515; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4516; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4517; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4518; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4519; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4520; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
4521; GFX7LESS-NEXT:  ; %bb.1:
4522; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4523; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4524; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4525; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4526; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4527; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4528; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4529; GFX7LESS-NEXT:  .LBB24_2:
4530; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4531; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4532; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4533; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4534; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4535; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4536; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4537; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4538; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4539; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4540; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4541; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4542; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4543; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4544; GFX7LESS-NEXT:    s_endpgm
4545;
4546; GFX8-LABEL: umin_i64_constant:
4547; GFX8:       ; %bb.0: ; %entry
4548; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4549; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4550; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4551; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4552; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4553; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4554; GFX8-NEXT:    s_cbranch_execz .LBB24_2
4555; GFX8-NEXT:  ; %bb.1:
4556; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4557; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4558; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4559; GFX8-NEXT:    s_mov_b32 m0, -1
4560; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4561; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4562; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4563; GFX8-NEXT:  .LBB24_2:
4564; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4565; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4566; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4567; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4568; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4569; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4570; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4571; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4572; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4573; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4574; GFX8-NEXT:    s_mov_b32 s2, -1
4575; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4576; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4577; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4578; GFX8-NEXT:    s_endpgm
4579;
4580; GFX9-LABEL: umin_i64_constant:
4581; GFX9:       ; %bb.0: ; %entry
4582; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4583; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4584; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4585; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4586; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4587; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4588; GFX9-NEXT:    s_cbranch_execz .LBB24_2
4589; GFX9-NEXT:  ; %bb.1:
4590; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4591; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4592; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4593; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4594; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4595; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4596; GFX9-NEXT:  .LBB24_2:
4597; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4599; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4600; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4601; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4602; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4603; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4604; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4605; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4606; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4607; GFX9-NEXT:    s_mov_b32 s2, -1
4608; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4609; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4610; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4611; GFX9-NEXT:    s_endpgm
4612;
4613; GFX1064-LABEL: umin_i64_constant:
4614; GFX1064:       ; %bb.0: ; %entry
4615; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4616; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4617; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4618; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4619; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4620; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4621; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
4622; GFX1064-NEXT:  ; %bb.1:
4623; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4624; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4625; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4626; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4627; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4628; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4629; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4630; GFX1064-NEXT:    buffer_gl0_inv
4631; GFX1064-NEXT:  .LBB24_2:
4632; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4633; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4634; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4635; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4636; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4637; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4638; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4639; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4640; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4641; GFX1064-NEXT:    s_mov_b32 s2, -1
4642; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4643; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4644; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4645; GFX1064-NEXT:    s_endpgm
4646;
4647; GFX1032-LABEL: umin_i64_constant:
4648; GFX1032:       ; %bb.0: ; %entry
4649; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4650; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4651; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4652; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4653; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4654; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
4655; GFX1032-NEXT:  ; %bb.1:
4656; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4657; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4658; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4659; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4660; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4661; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4662; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4663; GFX1032-NEXT:    buffer_gl0_inv
4664; GFX1032-NEXT:  .LBB24_2:
4665; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4666; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4667; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4668; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4669; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4670; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4671; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4672; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4673; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4674; GFX1032-NEXT:    s_mov_b32 s2, -1
4675; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4676; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4677; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4678; GFX1032-NEXT:    s_endpgm
4679entry:
4680  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
4681  store i64 %old, i64 addrspace(1)* %out
4682  ret void
4683}
4684