1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x()
9
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; add_i32_constant: atomicrmw add of the constant 5 to @local_var32 (an
; addrspace(3) global), with the returned old value stored to %out.
; In every run configuration below, the expected code has only the lane whose
; mbcnt result is 0 issue a single ds_add_rtn_u32, with an addend of
; s_bcnt1(saved exec) * 5. After the branch rejoins, v_readfirstlane picks up
; the returned value and v_mad_u32_u24 computes (mbcnt * 5 + that value) as the
; per-lane result that is written with buffer_store_dword.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
32; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
61; GFX8-NEXT:    v_mov_b32_e32 v2, s2
62; GFX8-NEXT:    s_mov_b32 m0, -1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
71; GFX8-NEXT:    s_mov_b32 s3, 0xf000
72; GFX8-NEXT:    s_mov_b32 s2, -1
73; GFX8-NEXT:    s_nop 1
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
91; GFX9-NEXT:    v_mov_b32_e32 v2, s2
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_nop 1
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
119; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
120; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
121; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
122; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
124; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX1064-NEXT:    buffer_gl0_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    s_nop 0
135; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
136; GFX1064-NEXT:    s_endpgm
137;
138; GFX1032-LABEL: add_i32_constant:
139; GFX1032:       ; %bb.0: ; %entry
140; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
141; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
142; GFX1032-NEXT:    ; implicit-def: $vgpr1
143; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
144; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
145; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
146; GFX1032-NEXT:    s_cbranch_execz BB0_2
147; GFX1032-NEXT:  ; %bb.1:
148; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
149; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
150; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
151; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
152; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
153; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
155; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX1032-NEXT:    buffer_gl0_inv
157; GFX1032-NEXT:  BB0_2:
158; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
159; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
160; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
161; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
162; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
163; GFX1032-NEXT:    s_mov_b32 s2, -1
164; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX1032-NEXT:    s_nop 0
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168entry:
169  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
170  store i32 %old, i32 addrspace(1)* %out
171  ret void
172}
173
; add_i32_uniform: atomicrmw add of the kernel argument %additive to
; @local_var32 (an addrspace(3) global), with the returned old value stored to
; %out.
; The expected code has the same shape as the constant-addend case, except that
; the addend is loaded with s_load_dword. The single ds_add_rtn_u32 issued by
; the lane whose mbcnt result is 0 adds (loaded addend * s_bcnt1(saved exec)).
; Each lane's result is then rebuilt with v_mul_lo_u32 (addend * mbcnt) plus
; the v_readfirstlane of the value returned by the atomic.
174define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
175;
176;
177; GFX7LESS-LABEL: add_i32_uniform:
178; GFX7LESS:       ; %bb.0: ; %entry
179; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
180; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
181; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
182; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
183; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
184; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
185; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
186; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
187; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
188; GFX7LESS-NEXT:  ; %bb.1:
189; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
190; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
192; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
193; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
194; GFX7LESS-NEXT:    s_mov_b32 m0, -1
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
197; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX7LESS-NEXT:  BB1_2:
199; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
200; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
202; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
203; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
204; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
205; GFX7LESS-NEXT:    s_mov_b32 s6, -1
206; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
207; GFX7LESS-NEXT:    s_endpgm
208;
209; GFX8-LABEL: add_i32_uniform:
210; GFX8:       ; %bb.0: ; %entry
211; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
212; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
213; GFX8-NEXT:    s_mov_b64 s[2:3], exec
214; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
215; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
216; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
217; GFX8-NEXT:    ; implicit-def: $vgpr1
218; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
219; GFX8-NEXT:    s_cbranch_execz BB1_2
220; GFX8-NEXT:  ; %bb.1:
221; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
222; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX8-NEXT:    s_mul_i32 s1, s0, s1
224; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
225; GFX8-NEXT:    v_mov_b32_e32 v2, s1
226; GFX8-NEXT:    s_mov_b32 m0, -1
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
229; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
230; GFX8-NEXT:  BB1_2:
231; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
232; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
234; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
235; GFX8-NEXT:    s_mov_b32 s7, 0xf000
236; GFX8-NEXT:    s_mov_b32 s6, -1
237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
238; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
239; GFX8-NEXT:    s_endpgm
240;
241; GFX9-LABEL: add_i32_uniform:
242; GFX9:       ; %bb.0: ; %entry
243; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
244; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
245; GFX9-NEXT:    s_mov_b64 s[6:7], exec
246; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
247; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
248; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
249; GFX9-NEXT:    ; implicit-def: $vgpr1
250; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
251; GFX9-NEXT:    s_cbranch_execz BB1_2
252; GFX9-NEXT:  ; %bb.1:
253; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
254; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX9-NEXT:    s_mul_i32 s3, s2, s3
256; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
257; GFX9-NEXT:    v_mov_b32_e32 v2, s3
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX9-NEXT:  BB1_2:
262; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
263; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
265; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
266; GFX9-NEXT:    s_mov_b32 s7, 0xf000
267; GFX9-NEXT:    s_mov_b32 s6, -1
268; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
269; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
270; GFX9-NEXT:    s_endpgm
271;
272; GFX1064-LABEL: add_i32_uniform:
273; GFX1064:       ; %bb.0: ; %entry
274; GFX1064-NEXT:    s_clause 0x1
275; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
276; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
277; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
278; GFX1064-NEXT:    ; implicit-def: $vgpr1
279; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
280; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
281; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
282; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
283; GFX1064-NEXT:    s_cbranch_execz BB1_2
284; GFX1064-NEXT:  ; %bb.1:
285; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
286; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
287; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
289; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
290; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
291; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
292; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
293; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX1064-NEXT:    buffer_gl0_inv
295; GFX1064-NEXT:  BB1_2:
296; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
297; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
298; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
300; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
301; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
302; GFX1064-NEXT:    s_mov_b32 s6, -1
303; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
304; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
305; GFX1064-NEXT:    s_endpgm
306;
307; GFX1032-LABEL: add_i32_uniform:
308; GFX1032:       ; %bb.0: ; %entry
309; GFX1032-NEXT:    s_clause 0x1
310; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
311; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
312; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
313; GFX1032-NEXT:    ; implicit-def: $vgpr1
314; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
315; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
316; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
317; GFX1032-NEXT:    s_cbranch_execz BB1_2
318; GFX1032-NEXT:  ; %bb.1:
319; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
320; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
321; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
323; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
324; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
325; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
326; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
327; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX1032-NEXT:    buffer_gl0_inv
329; GFX1032-NEXT:  BB1_2:
330; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
331; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
332; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
334; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
335; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
336; GFX1032-NEXT:    s_mov_b32 s6, -1
337; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
338; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
339; GFX1032-NEXT:    s_endpgm
340entry:
341  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
342  store i32 %old, i32 addrspace(1)* %out
343  ret void
344}
345
; add_i32_varying: atomicrmw add of a per-lane value (the result of
; llvm.amdgcn.workitem.id.x) to @local_var32, with the returned old value
; stored to %out.
; For the first run line (no -mcpu), the expected code is an unoptimized
; ds_add_rtn_u32 taking v0 directly; no cross-lane reduction is emitted
; (presumably because DPP is unavailable on that target; confirm against the
; atomic optimizer pass).
; For the tonga and gfx900 runs, the expected code accumulates the addend
; across lanes with v_add_u32_dpp (row_shr and row_bcast steps), reads lane 63
; as the total, and has the lane whose mbcnt result is 0 issue one
; ds_add_rtn_u32. Each lane then adds its wave_shr:1-shifted partial sum to the
; v_readfirstlane of the returned value.
; For the gfx1010 runs, the row_bcast steps are replaced by v_permlanex16_b32
; together with v_readlane/v_writelane fix-ups. The wave32 run takes the total
; from lane 31 and the wave64 run from lane 63.
346define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
347;
348;
349; GFX7LESS-LABEL: add_i32_varying:
350; GFX7LESS:       ; %bb.0: ; %entry
351; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
352; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
353; GFX7LESS-NEXT:    s_mov_b32 m0, -1
354; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
356; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
358; GFX7LESS-NEXT:    s_mov_b32 s2, -1
359; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
360; GFX7LESS-NEXT:    s_endpgm
361;
362; GFX8-LABEL: add_i32_varying:
363; GFX8:       ; %bb.0: ; %entry
364; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
365; GFX8-NEXT:    v_mov_b32_e32 v2, v0
366; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
367; GFX8-NEXT:    v_mov_b32_e32 v1, 0
368; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
369; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
370; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
371; GFX8-NEXT:    s_not_b64 exec, exec
372; GFX8-NEXT:    v_mov_b32_e32 v2, 0
373; GFX8-NEXT:    s_not_b64 exec, exec
374; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
375; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
376; GFX8-NEXT:    s_nop 1
377; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
378; GFX8-NEXT:    s_nop 1
379; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
380; GFX8-NEXT:    s_nop 1
381; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
382; GFX8-NEXT:    s_nop 1
383; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
384; GFX8-NEXT:    s_nop 1
385; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
386; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
387; GFX8-NEXT:    s_nop 0
388; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
389; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
390; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
391; GFX8-NEXT:    ; implicit-def: $vgpr0
392; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
393; GFX8-NEXT:    s_cbranch_execz BB2_2
394; GFX8-NEXT:  ; %bb.1:
395; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
396; GFX8-NEXT:    v_mov_b32_e32 v3, s4
397; GFX8-NEXT:    s_mov_b32 m0, -1
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
400; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX8-NEXT:  BB2_2:
402; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
403; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
405; GFX8-NEXT:    v_mov_b32_e32 v0, v1
406; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
407; GFX8-NEXT:    s_mov_b32 s3, 0xf000
408; GFX8-NEXT:    s_mov_b32 s2, -1
409; GFX8-NEXT:    s_nop 0
410; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
411; GFX8-NEXT:    s_endpgm
412;
413; GFX9-LABEL: add_i32_varying:
414; GFX9:       ; %bb.0: ; %entry
415; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
416; GFX9-NEXT:    v_mov_b32_e32 v2, v0
417; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
418; GFX9-NEXT:    v_mov_b32_e32 v1, 0
419; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
420; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
421; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
422; GFX9-NEXT:    s_not_b64 exec, exec
423; GFX9-NEXT:    v_mov_b32_e32 v2, 0
424; GFX9-NEXT:    s_not_b64 exec, exec
425; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
426; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
427; GFX9-NEXT:    s_nop 1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
435; GFX9-NEXT:    s_nop 1
436; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
437; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
438; GFX9-NEXT:    s_nop 0
439; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
440; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
441; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
442; GFX9-NEXT:    ; implicit-def: $vgpr0
443; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
444; GFX9-NEXT:    s_cbranch_execz BB2_2
445; GFX9-NEXT:  ; %bb.1:
446; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
447; GFX9-NEXT:    v_mov_b32_e32 v3, s4
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
450; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX9-NEXT:  BB2_2:
452; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
455; GFX9-NEXT:    v_mov_b32_e32 v0, v1
456; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
457; GFX9-NEXT:    s_mov_b32 s3, 0xf000
458; GFX9-NEXT:    s_mov_b32 s2, -1
459; GFX9-NEXT:    s_nop 0
460; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
461; GFX9-NEXT:    s_endpgm
462;
463; GFX1064-LABEL: add_i32_varying:
464; GFX1064:       ; %bb.0: ; %entry
465; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
466; GFX1064-NEXT:    s_not_b64 exec, exec
467; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
468; GFX1064-NEXT:    s_not_b64 exec, exec
469; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
470; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
472; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
473; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
474; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
475; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
476; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
477; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
478; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
479; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
480; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
481; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
482; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
483; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
484; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
485; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
486; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
487; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
488; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
489; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
490; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
491; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
492; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
493; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
494; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
495; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
496; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
497; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
498; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
499; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
500; GFX1064-NEXT:    s_mov_b32 s2, -1
501; GFX1064-NEXT:    ; implicit-def: $vgpr0
502; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
503; GFX1064-NEXT:    s_cbranch_execz BB2_2
504; GFX1064-NEXT:  ; %bb.1:
505; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
506; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
507; GFX1064-NEXT:    s_mov_b32 s3, s7
508; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
509; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
510; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
511; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX1064-NEXT:    buffer_gl0_inv
513; GFX1064-NEXT:  BB2_2:
514; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
515; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
516; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
517; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
518; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
519; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
520; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX1064-NEXT:    s_nop 0
522; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
523; GFX1064-NEXT:    s_endpgm
524;
525; GFX1032-LABEL: add_i32_varying:
526; GFX1032:       ; %bb.0: ; %entry
527; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
528; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
529; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
530; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
531; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
532; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
533; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
534; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
535; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
536; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
537; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
538; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
539; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
540; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
541; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
542; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
543; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
544; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
545; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
546; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
547; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
548; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
549; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
550; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
551; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
552; GFX1032-NEXT:    s_mov_b32 s2, -1
553; GFX1032-NEXT:    ; implicit-def: $vgpr0
554; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
555; GFX1032-NEXT:    s_cbranch_execz BB2_2
556; GFX1032-NEXT:  ; %bb.1:
557; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
558; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
559; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
560; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
561; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
562; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
563; GFX1032-NEXT:    buffer_gl0_inv
564; GFX1032-NEXT:  BB2_2:
565; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
566; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
567; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
568; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
569; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
570; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
571; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX1032-NEXT:    s_nop 0
573; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
574; GFX1032-NEXT:    s_endpgm
575entry:
576  %lane = call i32 @llvm.amdgcn.workitem.id.x()
577  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
578  store i32 %old, i32 addrspace(1)* %out
579  ret void
580}
581
582define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
583;
584;
585; GFX7LESS-LABEL: add_i32_varying_gfx1032:
586; GFX7LESS:       ; %bb.0: ; %entry
587; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
588; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
589; GFX7LESS-NEXT:    s_mov_b32 m0, -1
590; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
592; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
594; GFX7LESS-NEXT:    s_mov_b32 s2, -1
595; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
596; GFX7LESS-NEXT:    s_endpgm
597;
598; GFX8-LABEL: add_i32_varying_gfx1032:
599; GFX8:       ; %bb.0: ; %entry
600; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
601; GFX8-NEXT:    v_mov_b32_e32 v2, v0
602; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
603; GFX8-NEXT:    v_mov_b32_e32 v1, 0
604; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
605; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
606; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
607; GFX8-NEXT:    s_not_b64 exec, exec
608; GFX8-NEXT:    v_mov_b32_e32 v2, 0
609; GFX8-NEXT:    s_not_b64 exec, exec
610; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
611; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
612; GFX8-NEXT:    s_nop 1
613; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
614; GFX8-NEXT:    s_nop 1
615; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
616; GFX8-NEXT:    s_nop 1
617; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
618; GFX8-NEXT:    s_nop 1
619; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
620; GFX8-NEXT:    s_nop 1
621; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
622; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
623; GFX8-NEXT:    s_nop 0
624; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
625; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
626; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
627; GFX8-NEXT:    ; implicit-def: $vgpr0
628; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
629; GFX8-NEXT:    s_cbranch_execz BB3_2
630; GFX8-NEXT:  ; %bb.1:
631; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
632; GFX8-NEXT:    v_mov_b32_e32 v3, s4
633; GFX8-NEXT:    s_mov_b32 m0, -1
634; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
636; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX8-NEXT:  BB3_2:
638; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
639; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
640; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
641; GFX8-NEXT:    v_mov_b32_e32 v0, v1
642; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
643; GFX8-NEXT:    s_mov_b32 s3, 0xf000
644; GFX8-NEXT:    s_mov_b32 s2, -1
645; GFX8-NEXT:    s_nop 0
646; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
647; GFX8-NEXT:    s_endpgm
648;
649; GFX9-LABEL: add_i32_varying_gfx1032:
650; GFX9:       ; %bb.0: ; %entry
651; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
652; GFX9-NEXT:    v_mov_b32_e32 v2, v0
653; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
654; GFX9-NEXT:    v_mov_b32_e32 v1, 0
655; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
656; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
657; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
658; GFX9-NEXT:    s_not_b64 exec, exec
659; GFX9-NEXT:    v_mov_b32_e32 v2, 0
660; GFX9-NEXT:    s_not_b64 exec, exec
661; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
662; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
663; GFX9-NEXT:    s_nop 1
664; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
665; GFX9-NEXT:    s_nop 1
666; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX9-NEXT:    s_nop 1
668; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX9-NEXT:    s_nop 1
670; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
671; GFX9-NEXT:    s_nop 1
672; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
673; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
674; GFX9-NEXT:    s_nop 0
675; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
676; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
677; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
678; GFX9-NEXT:    ; implicit-def: $vgpr0
679; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
680; GFX9-NEXT:    s_cbranch_execz BB3_2
681; GFX9-NEXT:  ; %bb.1:
682; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
683; GFX9-NEXT:    v_mov_b32_e32 v3, s4
684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
685; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
686; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX9-NEXT:  BB3_2:
688; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
689; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
690; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
691; GFX9-NEXT:    v_mov_b32_e32 v0, v1
692; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
693; GFX9-NEXT:    s_mov_b32 s3, 0xf000
694; GFX9-NEXT:    s_mov_b32 s2, -1
695; GFX9-NEXT:    s_nop 0
696; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
697; GFX9-NEXT:    s_endpgm
698;
699; GFX1064-LABEL: add_i32_varying_gfx1032:
700; GFX1064:       ; %bb.0: ; %entry
701; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
702; GFX1064-NEXT:    s_not_b64 exec, exec
703; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
704; GFX1064-NEXT:    s_not_b64 exec, exec
705; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
706; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
707; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
708; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
709; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
710; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
711; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
712; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
713; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
714; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
715; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
716; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
717; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
718; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
719; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
720; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
721; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
722; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
723; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
724; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
725; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
726; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
727; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
728; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
729; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
730; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
731; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
732; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
733; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
734; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
735; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
736; GFX1064-NEXT:    s_mov_b32 s2, -1
737; GFX1064-NEXT:    ; implicit-def: $vgpr0
738; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
739; GFX1064-NEXT:    s_cbranch_execz BB3_2
740; GFX1064-NEXT:  ; %bb.1:
741; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
742; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
743; GFX1064-NEXT:    s_mov_b32 s3, s7
744; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
745; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
746; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
747; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX1064-NEXT:    buffer_gl0_inv
749; GFX1064-NEXT:  BB3_2:
750; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
751; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
752; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
753; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
754; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
755; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
756; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX1064-NEXT:    s_nop 0
758; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
759; GFX1064-NEXT:    s_endpgm
760;
761; GFX1032-LABEL: add_i32_varying_gfx1032:
762; GFX1032:       ; %bb.0: ; %entry
763; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
764; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
765; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
766; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
767; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
768; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
769; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
770; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
771; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
772; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
773; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
774; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
775; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
776; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
777; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
778; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
779; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
780; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
781; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
782; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
783; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
784; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
785; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
786; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
787; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
788; GFX1032-NEXT:    s_mov_b32 s2, -1
789; GFX1032-NEXT:    ; implicit-def: $vgpr0
790; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
791; GFX1032-NEXT:    s_cbranch_execz BB3_2
792; GFX1032-NEXT:  ; %bb.1:
793; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
794; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
795; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
796; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
797; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
798; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX1032-NEXT:    buffer_gl0_inv
800; GFX1032-NEXT:  BB3_2:
801; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
802; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
803; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
804; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
805; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
806; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
807; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
808; GFX1032-NEXT:    s_nop 0
809; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
810; GFX1032-NEXT:    s_endpgm
811entry:
812  %lane = call i32 @llvm.amdgcn.workitem.id.x()
813  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
814  store i32 %old, i32 addrspace(1)* %out
815  ret void
816}
817
; Test kernel @add_i32_varying_gfx1064
;
; Each work-item reads llvm.amdgcn.workitem.id.x and atomically adds that
; per-lane (varying) value to the addrspace(3) global @local_var32 with
; acq_rel ordering; the value returned by the atomic is stored to %out.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file). Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expected output shows per run line:
;  * GFX7LESS run - a plain per-lane ds_add_rtn_u32; no mbcnt/DPP sequence
;    appears in the expected code.
;  * GFX8 and GFX9 runs - inactive lanes are set to 0 (s_not_b64 exec around
;    a v_mov 0), a chain of DPP adds (row_shr 1/2/4/8, row_bcast 15/31)
;    combines the lane values, lane 63 is read with v_readlane_b32 as the
;    operand of a single ds_add_rtn_u32 executed only where the mbcnt result
;    is 0, and afterwards every lane adds its wave_shr:1 shifted partial
;    value to the v_readfirstlane_b32 of the atomic result.
;  * GFX1064 run - same shape, but v_permlanex16_b32 together with
;    v_readlane_b32 / v_writelane_b32 is used in place of the row_bcast and
;    wave_shr DPP steps; lane 63 (s7) feeds the ds_add_rtn_u32.
;  * GFX1032 run - 32-bit exec handling (exec_lo, s_or_saveexec_b32), only
;    v_mbcnt_lo is used, and lane 31 (s4) feeds the ds_add_rtn_u32.
;
; NOTE(review): the IR body is the same as the one in
; @add_i32_varying_gfx1032; presumably the two kernels are kept separate for
; wavefront-size specific checks - confirm before merging them.
818define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
819;
820;
821; GFX7LESS-LABEL: add_i32_varying_gfx1064:
822; GFX7LESS:       ; %bb.0: ; %entry
823; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
824; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
825; GFX7LESS-NEXT:    s_mov_b32 m0, -1
826; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
828; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
830; GFX7LESS-NEXT:    s_mov_b32 s2, -1
831; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
832; GFX7LESS-NEXT:    s_endpgm
833;
834; GFX8-LABEL: add_i32_varying_gfx1064:
835; GFX8:       ; %bb.0: ; %entry
836; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
837; GFX8-NEXT:    v_mov_b32_e32 v2, v0
838; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
839; GFX8-NEXT:    v_mov_b32_e32 v1, 0
840; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
841; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
842; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
843; GFX8-NEXT:    s_not_b64 exec, exec
844; GFX8-NEXT:    v_mov_b32_e32 v2, 0
845; GFX8-NEXT:    s_not_b64 exec, exec
846; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
847; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
848; GFX8-NEXT:    s_nop 1
849; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
850; GFX8-NEXT:    s_nop 1
851; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
852; GFX8-NEXT:    s_nop 1
853; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
854; GFX8-NEXT:    s_nop 1
855; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
856; GFX8-NEXT:    s_nop 1
857; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
858; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
859; GFX8-NEXT:    s_nop 0
860; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
861; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
862; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
863; GFX8-NEXT:    ; implicit-def: $vgpr0
864; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
865; GFX8-NEXT:    s_cbranch_execz BB4_2
866; GFX8-NEXT:  ; %bb.1:
867; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
868; GFX8-NEXT:    v_mov_b32_e32 v3, s4
869; GFX8-NEXT:    s_mov_b32 m0, -1
870; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
872; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
873; GFX8-NEXT:  BB4_2:
874; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
875; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
877; GFX8-NEXT:    v_mov_b32_e32 v0, v1
878; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
879; GFX8-NEXT:    s_mov_b32 s3, 0xf000
880; GFX8-NEXT:    s_mov_b32 s2, -1
881; GFX8-NEXT:    s_nop 0
882; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
883; GFX8-NEXT:    s_endpgm
884;
885; GFX9-LABEL: add_i32_varying_gfx1064:
886; GFX9:       ; %bb.0: ; %entry
887; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
888; GFX9-NEXT:    v_mov_b32_e32 v2, v0
889; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
890; GFX9-NEXT:    v_mov_b32_e32 v1, 0
891; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
892; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
893; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
894; GFX9-NEXT:    s_not_b64 exec, exec
895; GFX9-NEXT:    v_mov_b32_e32 v2, 0
896; GFX9-NEXT:    s_not_b64 exec, exec
897; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
898; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
899; GFX9-NEXT:    s_nop 1
900; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX9-NEXT:    s_nop 1
902; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX9-NEXT:    s_nop 1
904; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
905; GFX9-NEXT:    s_nop 1
906; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
907; GFX9-NEXT:    s_nop 1
908; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
909; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
910; GFX9-NEXT:    s_nop 0
911; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
912; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
913; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
914; GFX9-NEXT:    ; implicit-def: $vgpr0
915; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
916; GFX9-NEXT:    s_cbranch_execz BB4_2
917; GFX9-NEXT:  ; %bb.1:
918; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
919; GFX9-NEXT:    v_mov_b32_e32 v3, s4
920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX9-NEXT:  BB4_2:
924; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
925; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
927; GFX9-NEXT:    v_mov_b32_e32 v0, v1
928; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
929; GFX9-NEXT:    s_mov_b32 s3, 0xf000
930; GFX9-NEXT:    s_mov_b32 s2, -1
931; GFX9-NEXT:    s_nop 0
932; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
933; GFX9-NEXT:    s_endpgm
934;
935; GFX1064-LABEL: add_i32_varying_gfx1064:
936; GFX1064:       ; %bb.0: ; %entry
937; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
938; GFX1064-NEXT:    s_not_b64 exec, exec
939; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
940; GFX1064-NEXT:    s_not_b64 exec, exec
941; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
942; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
943; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
944; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
945; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
946; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
947; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
948; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
949; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
950; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
951; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
952; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
953; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
954; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
955; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
956; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
957; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
958; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
959; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
960; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
961; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
962; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
963; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
964; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
965; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
966; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
967; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
968; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
969; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
970; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
971; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
972; GFX1064-NEXT:    s_mov_b32 s2, -1
973; GFX1064-NEXT:    ; implicit-def: $vgpr0
974; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
975; GFX1064-NEXT:    s_cbranch_execz BB4_2
976; GFX1064-NEXT:  ; %bb.1:
977; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
978; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
979; GFX1064-NEXT:    s_mov_b32 s3, s7
980; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
981; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
982; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
983; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX1064-NEXT:    buffer_gl0_inv
985; GFX1064-NEXT:  BB4_2:
986; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
987; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
988; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
989; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
990; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
991; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
992; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX1064-NEXT:    s_nop 0
994; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
995; GFX1064-NEXT:    s_endpgm
996;
997; GFX1032-LABEL: add_i32_varying_gfx1064:
998; GFX1032:       ; %bb.0: ; %entry
999; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1000; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1001; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1002; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1003; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1004; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1005; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1006; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1007; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1008; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1009; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1010; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1011; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1012; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1013; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1014; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1015; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1016; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1017; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1018; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1019; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1020; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1021; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1022; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1023; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1024; GFX1032-NEXT:    s_mov_b32 s2, -1
1025; GFX1032-NEXT:    ; implicit-def: $vgpr0
1026; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1027; GFX1032-NEXT:    s_cbranch_execz BB4_2
1028; GFX1032-NEXT:  ; %bb.1:
1029; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1030; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1031; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1032; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1033; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
1034; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX1032-NEXT:    buffer_gl0_inv
1036; GFX1032-NEXT:  BB4_2:
1037; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1038; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1039; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1040; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1041; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1042; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1043; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX1032-NEXT:    s_nop 0
1045; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1046; GFX1032-NEXT:    s_endpgm
; IR under test: the atomicrmw operand is the work-item id, so it differs
; from lane to lane; the returned old value is what gets stored to %out.
1047entry:
1048  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1049  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1050  store i32 %old, i32 addrspace(1)* %out
1051  ret void
1052}
1053
; Test kernel @add_i64_constant
;
; Atomically adds the constant i64 5 to the addrspace(3) global @local_var64
; with acq_rel ordering and stores the value returned by the atomic to %out.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file). Regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expected output shows (all run lines share this shape):
;  * exec is copied and v_mbcnt_lo (plus v_mbcnt_hi where exec is 64 bits
;    wide) is computed from the copy; only lanes whose mbcnt result equals 0
;    enter the block that performs the atomic (v_cmp_eq_u32 0 followed by
;    s_and_saveexec).
;  * Inside that block s_bcnt1_i32 counts the bits of the saved exec mask and
;    the count is multiplied by 5 (s_mul_i32 for the low half,
;    v_mul_hi_u32_u24 for the high half) to form the 64-bit operand of one
;    ds_add_rtn_u64.
;  * After exec is restored, v_readfirstlane_b32 copies both halves of the
;    atomic result to SGPRs and each lane adds 5 times its own mbcnt value:
;    v_mul_u32_u24 / v_mul_hi_u32_u24 with add + addc in the GFX7LESS run,
;    a single v_mad_u64_u32 in the other runs.
;  * The GFX1032 run uses the 32-bit forms (s_bcnt1_i32_b32, exec_lo,
;    v_mbcnt_lo only).
1054define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1055;
1056;
1057; GFX7LESS-LABEL: add_i64_constant:
1058; GFX7LESS:       ; %bb.0: ; %entry
1059; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1060; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1061; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1062; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1063; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1064; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1065; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1066; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1067; GFX7LESS-NEXT:  ; %bb.1:
1068; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1069; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1070; GFX7LESS-NEXT:    s_mul_i32 s5, s4, 5
1071; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1072; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
1073; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1074; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1076; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX7LESS-NEXT:  BB5_2:
1078; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1079; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1080; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1081; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1082; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1083; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1084; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1085; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1086; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1087; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1088; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1089; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1090; GFX7LESS-NEXT:    s_endpgm
1091;
1092; GFX8-LABEL: add_i64_constant:
1093; GFX8:       ; %bb.0: ; %entry
1094; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1095; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1096; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1097; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1098; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1099; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1100; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1101; GFX8-NEXT:    s_cbranch_execz BB5_2
1102; GFX8-NEXT:  ; %bb.1:
1103; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1104; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1105; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1106; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1107; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1108; GFX8-NEXT:    s_mov_b32 m0, -1
1109; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1111; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX8-NEXT:  BB5_2:
1113; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1114; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1116; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1117; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1118; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1119; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1120; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1121; GFX8-NEXT:    s_mov_b32 s2, -1
1122; GFX8-NEXT:    s_nop 2
1123; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1124; GFX8-NEXT:    s_endpgm
1125;
1126; GFX9-LABEL: add_i64_constant:
1127; GFX9:       ; %bb.0: ; %entry
1128; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1129; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1130; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1131; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1132; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1133; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1134; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1135; GFX9-NEXT:    s_cbranch_execz BB5_2
1136; GFX9-NEXT:  ; %bb.1:
1137; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1138; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1139; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1140; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1141; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1142; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1143; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX9-NEXT:  BB5_2:
1146; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1147; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1148; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1149; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1150; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1151; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1152; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1153; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1154; GFX9-NEXT:    s_mov_b32 s2, -1
1155; GFX9-NEXT:    s_nop 2
1156; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1157; GFX9-NEXT:    s_endpgm
1158;
1159; GFX1064-LABEL: add_i64_constant:
1160; GFX1064:       ; %bb.0: ; %entry
1161; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1162; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1163; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1164; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1165; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1166; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1167; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1168; GFX1064-NEXT:    s_cbranch_execz BB5_2
1169; GFX1064-NEXT:  ; %bb.1:
1170; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1171; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1172; GFX1064-NEXT:    s_mul_i32 s5, s4, 5
1173; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1174; GFX1064-NEXT:    v_mov_b32_e32 v1, s5
1175; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1176; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1177; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1178; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX1064-NEXT:    buffer_gl0_inv
1180; GFX1064-NEXT:  BB5_2:
1181; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1182; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1183; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1184; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1185; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1186; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1187; GFX1064-NEXT:    s_mov_b32 s2, -1
1188; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX1064-NEXT:    s_nop 1
1190; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1191; GFX1064-NEXT:    s_endpgm
1192;
1193; GFX1032-LABEL: add_i64_constant:
1194; GFX1032:       ; %bb.0: ; %entry
1195; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1196; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1197; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1198; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1199; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1200; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1201; GFX1032-NEXT:    s_cbranch_execz BB5_2
1202; GFX1032-NEXT:  ; %bb.1:
1203; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1204; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1205; GFX1032-NEXT:    s_mul_i32 s4, s3, 5
1206; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1207; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
1208; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1209; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1210; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1211; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX1032-NEXT:    buffer_gl0_inv
1213; GFX1032-NEXT:  BB5_2:
1214; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1215; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1216; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1217; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1218; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1219; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1220; GFX1032-NEXT:    s_mov_b32 s2, -1
1221; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX1032-NEXT:    s_nop 1
1223; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1224; GFX1032-NEXT:    s_endpgm
; IR under test: the atomicrmw operand is the literal 5, the same for every
; lane; the returned old value is what gets stored to %out.
1225entry:
1226  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1227  store i64 %old, i64 addrspace(1)* %out
1228  ret void
1229}
1230
1231define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1232;
1233;
1234; GFX7LESS-LABEL: add_i64_uniform:
1235; GFX7LESS:       ; %bb.0: ; %entry
1236; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1237; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1238; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1239; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1240; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1241; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1242; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1243; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1244; GFX7LESS-NEXT:  ; %bb.1:
1245; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1246; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1247; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1248; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1249; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1250; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1251; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1252; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1253; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1254; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1255; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1257; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX7LESS-NEXT:  BB6_2:
1259; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1260; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1261; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1262; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1264; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1265; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1266; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1267; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1268; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1269; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1270; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1271; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1272; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1273; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1274; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1275; GFX7LESS-NEXT:    s_endpgm
1276;
1277; GFX8-LABEL: add_i64_uniform:
1278; GFX8:       ; %bb.0: ; %entry
1279; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1280; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1281; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1282; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1283; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1284; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1285; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1286; GFX8-NEXT:    s_cbranch_execz BB6_2
1287; GFX8-NEXT:  ; %bb.1:
1288; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1289; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1290; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1292; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1293; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1294; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1295; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1296; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1297; GFX8-NEXT:    s_mov_b32 m0, -1
1298; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1299; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1300; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX8-NEXT:  BB6_2:
1302; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1303; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX8-NEXT:    s_mov_b32 s4, s0
1305; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1306; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1307; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1308; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1309; GFX8-NEXT:    s_mov_b32 s5, s1
1310; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1311; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1312; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1313; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1314; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1315; GFX8-NEXT:    s_mov_b32 s6, -1
1316; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1317; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1318; GFX8-NEXT:    s_endpgm
1319;
1320; GFX9-LABEL: add_i64_uniform:
1321; GFX9:       ; %bb.0: ; %entry
1322; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1323; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1324; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1325; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1326; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1327; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1328; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1329; GFX9-NEXT:    s_cbranch_execz BB6_2
1330; GFX9-NEXT:  ; %bb.1:
1331; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1334; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1335; GFX9-NEXT:    s_add_i32 s8, s8, s7
1336; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1337; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1338; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1339; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1340; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1342; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX9-NEXT:  BB6_2:
1344; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1345; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1346; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1347; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1348; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1349; GFX9-NEXT:    s_mov_b32 s4, s0
1350; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1351; GFX9-NEXT:    s_mov_b32 s5, s1
1352; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1353; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1354; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1355; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1356; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1357; GFX9-NEXT:    s_mov_b32 s6, -1
1358; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1359; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1360; GFX9-NEXT:    s_endpgm
1361;
1362; GFX1064-LABEL: add_i64_uniform:
1363; GFX1064:       ; %bb.0: ; %entry
1364; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1365; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1366; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1367; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1368; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1369; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1370; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1371; GFX1064-NEXT:    s_cbranch_execz BB6_2
1372; GFX1064-NEXT:  ; %bb.1:
1373; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1374; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1375; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1377; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1378; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1379; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1380; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1381; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1382; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1383; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1384; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1385; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1386; GFX1064-NEXT:    buffer_gl0_inv
1387; GFX1064-NEXT:  BB6_2:
1388; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1389; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1390; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1391; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1392; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1393; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1394; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1395; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1396; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1397; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1398; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
1399; GFX1064-NEXT:    s_mov_b32 s2, -1
1400; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1401; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1402; GFX1064-NEXT:    s_endpgm
1403;
1404; GFX1032-LABEL: add_i64_uniform:
1405; GFX1032:       ; %bb.0: ; %entry
1406; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1407; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1408; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1409; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1410; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1411; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1412; GFX1032-NEXT:    s_cbranch_execz BB6_2
1413; GFX1032-NEXT:  ; %bb.1:
1414; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1415; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1416; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1418; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1419; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1420; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1421; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1422; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1423; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1424; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1425; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1426; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX1032-NEXT:    buffer_gl0_inv
1428; GFX1032-NEXT:  BB6_2:
1429; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1430; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1431; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1432; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1433; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1434; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1435; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1436; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1437; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1438; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1439; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
1440; GFX1032-NEXT:    s_mov_b32 s2, -1
1441; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1442; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1443; GFX1032-NEXT:    s_endpgm
1444entry:
1445  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1446  store i64 %old, i64 addrspace(1)* %out
1447  ret void
1448}
1449
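; NOTE(review): none of the RUN lines at the top of this file passes a GCN
; check prefix to FileCheck, so the three GCN-NOT directives below are not
; evaluated. Confirm whether they should be dropped or moved to an enabled
; prefix.
1450; GCN-NOT: v_mbcnt_lo_u32_b32
1451; GCN-NOT: v_mbcnt_hi_u32_b32
1452; GCN-NOT: s_bcnt1_i32_b64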
; add_i64_varying: atomicrmw add on @local_var64 whose addend is the
; zero-extended workitem id, i.e. a per-workitem value. The value returned by
; the atomic is stored to %out.
; Every check block below expects one direct ds_add_rtn_u64 with no
; v_mbcnt / s_bcnt1 / saveexec sequence around it.
1453define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1454;
1455;
1456; GFX7LESS-LABEL: add_i64_varying:
1457; GFX7LESS:       ; %bb.0: ; %entry
1458; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1459; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1460; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1461; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1462; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1463; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1464; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1465; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1466; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1467; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1468; GFX7LESS-NEXT:    s_endpgm
1469;
1470; GFX8-LABEL: add_i64_varying:
1471; GFX8:       ; %bb.0: ; %entry
1472; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1473; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1474; GFX8-NEXT:    s_mov_b32 m0, -1
1475; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1476; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1478; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1479; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1480; GFX8-NEXT:    s_mov_b32 s2, -1
1481; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1482; GFX8-NEXT:    s_endpgm
1483;
1484; GFX9-LABEL: add_i64_varying:
1485; GFX9:       ; %bb.0: ; %entry
1486; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1487; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1488; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1491; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1492; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1493; GFX9-NEXT:    s_mov_b32 s2, -1
1494; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1495; GFX9-NEXT:    s_endpgm
1496;
1497; GFX1064-LABEL: add_i64_varying:
1498; GFX1064:       ; %bb.0: ; %entry
1499; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1500; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1501; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1502; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1503; GFX1064-NEXT:    s_mov_b32 s2, -1
1504; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1505; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1506; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1507; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX1064-NEXT:    buffer_gl0_inv
1509; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1510; GFX1064-NEXT:    s_endpgm
1511;
1512; GFX1032-LABEL: add_i64_varying:
1513; GFX1032:       ; %bb.0: ; %entry
1514; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1515; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1516; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1517; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1518; GFX1032-NEXT:    s_mov_b32 s2, -1
1519; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1520; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1521; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1522; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1523; GFX1032-NEXT:    buffer_gl0_inv
1524; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1525; GFX1032-NEXT:    s_endpgm
1526entry:
  ; The workitem id is the per-workitem operand; widen it to the i64 operand type.
1527  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1528  %zext = zext i32 %lane to i64
  ; %old is the value the atomic returns; it is written to %out below.
1529  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1530  store i64 %old, i64 addrspace(1)* %out
1531  ret void
1532}
1533
; sub_i32_constant: atomicrmw sub of the constant 5 from @local_var32; the
; value returned by the atomic is stored to %out.
; Each check block below expects the same shape: v_mbcnt produces a per-lane
; count in v0, only the lane whose count is 0 issues one ds_sub_rtn_u32 whose
; operand is s_bcnt1 of the saved exec mask times 5, and afterwards every lane
; subtracts 5 times its own count from the v_readfirstlane'd result.
1534define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1535;
1536;
1537; GFX7LESS-LABEL: sub_i32_constant:
1538; GFX7LESS:       ; %bb.0: ; %entry
1539; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1540; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1541; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1542; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1543; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1544; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1545; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1546; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1547; GFX7LESS-NEXT:  ; %bb.1:
1548; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1549; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1550; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1551; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1552; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1553; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1555; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1556; GFX7LESS-NEXT:  BB8_2:
1557; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1558; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1559; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1560; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1561; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1562; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1563; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1564; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1565; GFX7LESS-NEXT:    s_endpgm
1566;
1567; GFX8-LABEL: sub_i32_constant:
1568; GFX8:       ; %bb.0: ; %entry
1569; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1570; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1571; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1572; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1573; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1574; GFX8-NEXT:    ; implicit-def: $vgpr1
1575; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1576; GFX8-NEXT:    s_cbranch_execz BB8_2
1577; GFX8-NEXT:  ; %bb.1:
1578; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1579; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1580; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1581; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1582; GFX8-NEXT:    s_mov_b32 m0, -1
1583; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1585; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX8-NEXT:  BB8_2:
1587; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1588; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1589; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1590; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1591; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1592; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1593; GFX8-NEXT:    s_mov_b32 s2, -1
1594; GFX8-NEXT:    s_nop 0
1595; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1596; GFX8-NEXT:    s_endpgm
1597;
1598; GFX9-LABEL: sub_i32_constant:
1599; GFX9:       ; %bb.0: ; %entry
1600; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1601; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1602; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1603; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1604; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1605; GFX9-NEXT:    ; implicit-def: $vgpr1
1606; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1607; GFX9-NEXT:    s_cbranch_execz BB8_2
1608; GFX9-NEXT:  ; %bb.1:
1609; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1610; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1611; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1612; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1613; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1615; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1616; GFX9-NEXT:  BB8_2:
1617; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1620; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1621; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1622; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1623; GFX9-NEXT:    s_mov_b32 s2, -1
1624; GFX9-NEXT:    s_nop 0
1625; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1626; GFX9-NEXT:    s_endpgm
1627;
1628; GFX1064-LABEL: sub_i32_constant:
1629; GFX1064:       ; %bb.0: ; %entry
1630; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1631; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1632; GFX1064-NEXT:    ; implicit-def: $vgpr1
1633; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1634; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1635; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1636; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1637; GFX1064-NEXT:    s_cbranch_execz BB8_2
1638; GFX1064-NEXT:  ; %bb.1:
1639; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1640; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1641; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1642; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1643; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1644; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1645; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1646; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1647; GFX1064-NEXT:    buffer_gl0_inv
1648; GFX1064-NEXT:  BB8_2:
1649; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1650; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1651; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1652; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1653; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1654; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1655; GFX1064-NEXT:    s_mov_b32 s2, -1
1656; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1658; GFX1064-NEXT:    s_endpgm
1659;
1660; GFX1032-LABEL: sub_i32_constant:
1661; GFX1032:       ; %bb.0: ; %entry
1662; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1663; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1664; GFX1032-NEXT:    ; implicit-def: $vgpr1
1665; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1666; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1667; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1668; GFX1032-NEXT:    s_cbranch_execz BB8_2
1669; GFX1032-NEXT:  ; %bb.1:
1670; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1671; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1672; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1673; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1674; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1675; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1676; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1677; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1678; GFX1032-NEXT:    buffer_gl0_inv
1679; GFX1032-NEXT:  BB8_2:
1680; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1681; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1682; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1683; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1684; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1685; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1686; GFX1032-NEXT:    s_mov_b32 s2, -1
1687; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1688; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1689; GFX1032-NEXT:    s_endpgm
1690entry:
  ; The operand is the literal 5; %old is the value the atomic returns.
1691  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1692  store i32 %old, i32 addrspace(1)* %out
1693  ret void
1694}
1695
; sub_i32_uniform: atomicrmw sub on @local_var32 whose operand is the kernel
; argument %subitive; the value returned by the atomic is stored to %out.
; Each check block below expects the argument to be fetched with s_load_dword,
; multiplied by s_bcnt1 of the saved exec mask (s_mul_i32) to form the operand
; of one ds_sub_rtn_u32 issued by the lane whose v_mbcnt count is 0, and then
; multiplied by each lane's own count (v_mul_lo_u32) before being subtracted
; from the v_readfirstlane'd result.
1696define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1697;
1698;
1699; GFX7LESS-LABEL: sub_i32_uniform:
1700; GFX7LESS:       ; %bb.0: ; %entry
1701; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1702; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1703; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
1704; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1705; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1706; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1707; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1708; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1709; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1710; GFX7LESS-NEXT:  ; %bb.1:
1711; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1712; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1713; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1714; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1715; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1716; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1717; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1718; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1719; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1720; GFX7LESS-NEXT:  BB9_2:
1721; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1722; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1723; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1724; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1725; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1726; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1727; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1728; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1729; GFX7LESS-NEXT:    s_endpgm
1730;
1731; GFX8-LABEL: sub_i32_uniform:
1732; GFX8:       ; %bb.0: ; %entry
1733; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1734; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1735; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1736; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1737; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1738; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1739; GFX8-NEXT:    ; implicit-def: $vgpr1
1740; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1741; GFX8-NEXT:    s_cbranch_execz BB9_2
1742; GFX8-NEXT:  ; %bb.1:
1743; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1744; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1745; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1746; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1747; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1748; GFX8-NEXT:    s_mov_b32 m0, -1
1749; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1750; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1751; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX8-NEXT:  BB9_2:
1753; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1754; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1756; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1757; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1758; GFX8-NEXT:    s_mov_b32 s6, -1
1759; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1760; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1761; GFX8-NEXT:    s_endpgm
1762;
1763; GFX9-LABEL: sub_i32_uniform:
1764; GFX9:       ; %bb.0: ; %entry
1765; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1766; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1767; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1768; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1769; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1770; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1771; GFX9-NEXT:    ; implicit-def: $vgpr1
1772; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1773; GFX9-NEXT:    s_cbranch_execz BB9_2
1774; GFX9-NEXT:  ; %bb.1:
1775; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1776; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1777; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1778; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1779; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1780; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1781; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1783; GFX9-NEXT:  BB9_2:
1784; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1785; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1786; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1787; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1788; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1789; GFX9-NEXT:    s_mov_b32 s6, -1
1790; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1791; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1792; GFX9-NEXT:    s_endpgm
1793;
1794; GFX1064-LABEL: sub_i32_uniform:
1795; GFX1064:       ; %bb.0: ; %entry
1796; GFX1064-NEXT:    s_clause 0x1
1797; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1798; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
1799; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1800; GFX1064-NEXT:    ; implicit-def: $vgpr1
1801; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1802; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1803; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1804; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1805; GFX1064-NEXT:    s_cbranch_execz BB9_2
1806; GFX1064-NEXT:  ; %bb.1:
1807; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1808; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1809; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
1811; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
1812; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1813; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1814; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1815; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1816; GFX1064-NEXT:    buffer_gl0_inv
1817; GFX1064-NEXT:  BB9_2:
1818; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1819; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1820; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1822; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1823; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1824; GFX1064-NEXT:    s_mov_b32 s6, -1
1825; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1826; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1827; GFX1064-NEXT:    s_endpgm
1828;
1829; GFX1032-LABEL: sub_i32_uniform:
1830; GFX1032:       ; %bb.0: ; %entry
1831; GFX1032-NEXT:    s_clause 0x1
1832; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1833; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1834; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1835; GFX1032-NEXT:    ; implicit-def: $vgpr1
1836; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1837; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1838; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1839; GFX1032-NEXT:    s_cbranch_execz BB9_2
1840; GFX1032-NEXT:  ; %bb.1:
1841; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1842; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1843; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1845; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1846; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1847; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1848; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1849; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1850; GFX1032-NEXT:    buffer_gl0_inv
1851; GFX1032-NEXT:  BB9_2:
1852; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1853; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1854; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1856; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1857; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1858; GFX1032-NEXT:    s_mov_b32 s6, -1
1859; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1860; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1861; GFX1032-NEXT:    s_endpgm
1862entry:
  ; The operand is the kernel argument %subitive; %old is the value the atomic returns.
1863  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1864  store i32 %old, i32 addrspace(1)* %out
1865  ret void
1866}
1867
; sub_i32_varying: atomicrmw sub on @local_var32 whose operand is the workitem
; id, i.e. a per-workitem value; the value returned by the atomic is stored to
; %out.
; The GFX7LESS check block expects one direct ds_sub_rtn_u32 with no reduction
; code around it. The other four check blocks expect a chain of DPP adds over
; the operand, a v_readlane of the last lane (63, or 31 for GFX1032) as the
; operand of a single ds_sub_rtn_u32 issued by the lane whose v_mbcnt count is
; 0, and a final subtraction of the lane-shifted scan value from the
; v_readfirstlane'd result.
1868define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1869;
1870;
1871; GFX7LESS-LABEL: sub_i32_varying:
1872; GFX7LESS:       ; %bb.0: ; %entry
1873; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1874; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1875; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1876; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1878; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1879; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1880; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1881; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1882; GFX7LESS-NEXT:    s_endpgm
1883;
1884; GFX8-LABEL: sub_i32_varying:
1885; GFX8:       ; %bb.0: ; %entry
1886; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1887; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1888; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1889; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1890; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1891; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1892; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1893; GFX8-NEXT:    s_not_b64 exec, exec
1894; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1895; GFX8-NEXT:    s_not_b64 exec, exec
1896; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1897; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1898; GFX8-NEXT:    s_nop 1
1899; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1900; GFX8-NEXT:    s_nop 1
1901; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1902; GFX8-NEXT:    s_nop 1
1903; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1904; GFX8-NEXT:    s_nop 1
1905; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1906; GFX8-NEXT:    s_nop 1
1907; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1908; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1909; GFX8-NEXT:    s_nop 0
1910; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1911; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1912; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1913; GFX8-NEXT:    ; implicit-def: $vgpr0
1914; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1915; GFX8-NEXT:    s_cbranch_execz BB10_2
1916; GFX8-NEXT:  ; %bb.1:
1917; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1918; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1919; GFX8-NEXT:    s_mov_b32 m0, -1
1920; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1922; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX8-NEXT:  BB10_2:
1924; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1925; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1926; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1927; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1928; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1929; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1930; GFX8-NEXT:    s_mov_b32 s2, -1
1931; GFX8-NEXT:    s_nop 0
1932; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1933; GFX8-NEXT:    s_endpgm
1934;
1935; GFX9-LABEL: sub_i32_varying:
1936; GFX9:       ; %bb.0: ; %entry
1937; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1938; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1939; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1940; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1941; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1942; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1943; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1944; GFX9-NEXT:    s_not_b64 exec, exec
1945; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1946; GFX9-NEXT:    s_not_b64 exec, exec
1947; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1948; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1949; GFX9-NEXT:    s_nop 1
1950; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1951; GFX9-NEXT:    s_nop 1
1952; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1953; GFX9-NEXT:    s_nop 1
1954; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1955; GFX9-NEXT:    s_nop 1
1956; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1957; GFX9-NEXT:    s_nop 1
1958; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1959; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1960; GFX9-NEXT:    s_nop 0
1961; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1962; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1963; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1964; GFX9-NEXT:    ; implicit-def: $vgpr0
1965; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1966; GFX9-NEXT:    s_cbranch_execz BB10_2
1967; GFX9-NEXT:  ; %bb.1:
1968; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1969; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1970; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1972; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX9-NEXT:  BB10_2:
1974; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1976; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1977; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1978; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1979; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1980; GFX9-NEXT:    s_mov_b32 s2, -1
1981; GFX9-NEXT:    s_nop 0
1982; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1983; GFX9-NEXT:    s_endpgm
1984;
1985; GFX1064-LABEL: sub_i32_varying:
1986; GFX1064:       ; %bb.0: ; %entry
1987; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1988; GFX1064-NEXT:    s_not_b64 exec, exec
1989; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1990; GFX1064-NEXT:    s_not_b64 exec, exec
1991; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1992; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1993; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1994; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1995; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1996; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1997; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1998; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1999; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2000; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2001; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2002; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2003; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2004; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2005; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2006; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2007; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2008; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2009; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2010; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2011; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2012; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2013; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2014; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2015; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2016; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2017; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2018; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2019; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2020; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2021; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2022; GFX1064-NEXT:    s_mov_b32 s2, -1
2023; GFX1064-NEXT:    ; implicit-def: $vgpr0
2024; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2025; GFX1064-NEXT:    s_cbranch_execz BB10_2
2026; GFX1064-NEXT:  ; %bb.1:
2027; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2028; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2029; GFX1064-NEXT:    s_mov_b32 s3, s7
2030; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2031; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2032; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2033; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2034; GFX1064-NEXT:    buffer_gl0_inv
2035; GFX1064-NEXT:  BB10_2:
2036; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2037; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2038; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2039; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2040; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2041; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2042; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2043; GFX1064-NEXT:    s_nop 0
2044; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2045; GFX1064-NEXT:    s_endpgm
2046;
2047; GFX1032-LABEL: sub_i32_varying:
2048; GFX1032:       ; %bb.0: ; %entry
2049; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2050; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2051; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2052; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2053; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2054; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2055; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2056; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2057; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2058; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2059; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2060; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2061; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2062; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2063; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2064; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2065; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2066; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2067; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2068; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2069; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2070; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2071; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2072; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2073; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2074; GFX1032-NEXT:    s_mov_b32 s2, -1
2075; GFX1032-NEXT:    ; implicit-def: $vgpr0
2076; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2077; GFX1032-NEXT:    s_cbranch_execz BB10_2
2078; GFX1032-NEXT:  ; %bb.1:
2079; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2080; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2081; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2082; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2083; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2084; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX1032-NEXT:    buffer_gl0_inv
2086; GFX1032-NEXT:  BB10_2:
2087; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2088; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2089; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2090; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2091; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2092; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2093; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2094; GFX1032-NEXT:    s_nop 0
2095; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2096; GFX1032-NEXT:    s_endpgm
2097entry:
  ; The workitem id is used directly as the per-workitem operand of the atomic.
2098  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is the value the atomic returns; it is written to %out below.
2099  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2100  store i32 %old, i32 addrspace(1)* %out
2101  ret void
2102}
2103
2104define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2105; Constant operand: the expected code has only the lane whose mbcnt of exec is 0 issue one ds_sub_rtn_u64, subtracting 5 * popcount(exec) from @local_var64.
2106; After exec is restored, each lane rebuilds its own result as readfirstlane(returned value) - 5 * mbcnt with a 64-bit subtract-with-borrow, then stores it to %out.
2107; GFX7LESS-LABEL: sub_i64_constant:
2108; GFX7LESS:       ; %bb.0: ; %entry
2109; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2110; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2111; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2112; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2113; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2114; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2115; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2116; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2117; GFX7LESS-NEXT:  ; %bb.1:
2118; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2119; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2120; GFX7LESS-NEXT:    s_mul_i32 s5, s4, 5
2121; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2122; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
2123; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2124; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2125; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2126; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2127; GFX7LESS-NEXT:  BB11_2:
2128; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2129; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2130; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2131; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2132; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2133; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2134; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2135; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2136; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2137; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2138; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2139; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2140; GFX7LESS-NEXT:    s_endpgm
2141;
2142; GFX8-LABEL: sub_i64_constant:
2143; GFX8:       ; %bb.0: ; %entry
2144; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2145; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2146; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2147; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2148; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2149; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2150; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2151; GFX8-NEXT:    s_cbranch_execz BB11_2
2152; GFX8-NEXT:  ; %bb.1:
2153; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2154; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2155; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2156; GFX8-NEXT:    v_mov_b32_e32 v1, s4
2157; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2158; GFX8-NEXT:    s_mov_b32 m0, -1
2159; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2161; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2162; GFX8-NEXT:  BB11_2:
2163; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2164; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2166; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2167; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2168; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2169; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2170; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2171; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2172; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2173; GFX8-NEXT:    s_mov_b32 s2, -1
2174; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2175; GFX8-NEXT:    s_endpgm
2176;
2177; GFX9-LABEL: sub_i64_constant:
2178; GFX9:       ; %bb.0: ; %entry
2179; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2180; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2181; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2182; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2183; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2184; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2185; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2186; GFX9-NEXT:    s_cbranch_execz BB11_2
2187; GFX9-NEXT:  ; %bb.1:
2188; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2189; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2190; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2191; GFX9-NEXT:    v_mov_b32_e32 v1, s4
2192; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2193; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2194; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2195; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2196; GFX9-NEXT:  BB11_2:
2197; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2198; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2199; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2200; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2201; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2202; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2203; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2204; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2205; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2206; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2207; GFX9-NEXT:    s_mov_b32 s2, -1
2208; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2209; GFX9-NEXT:    s_endpgm
2210;
2211; GFX1064-LABEL: sub_i64_constant:
2212; GFX1064:       ; %bb.0: ; %entry
2213; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2214; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2215; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2216; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2217; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2218; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2219; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2220; GFX1064-NEXT:    s_cbranch_execz BB11_2
2221; GFX1064-NEXT:  ; %bb.1:
2222; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2223; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2224; GFX1064-NEXT:    s_mul_i32 s5, s4, 5
2225; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2226; GFX1064-NEXT:    v_mov_b32_e32 v1, s5
2227; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2228; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2229; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2230; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2231; GFX1064-NEXT:    buffer_gl0_inv
2232; GFX1064-NEXT:  BB11_2:
2233; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2234; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2235; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2236; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2237; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2238; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2239; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2240; GFX1064-NEXT:    s_mov_b32 s2, -1
2241; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2242; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2243; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2244; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2245; GFX1064-NEXT:    s_endpgm
2246;
2247; GFX1032-LABEL: sub_i64_constant:
2248; GFX1032:       ; %bb.0: ; %entry
2249; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2250; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2251; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2252; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2253; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2254; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2255; GFX1032-NEXT:    s_cbranch_execz BB11_2
2256; GFX1032-NEXT:  ; %bb.1:
2257; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2258; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2259; GFX1032-NEXT:    s_mul_i32 s4, s3, 5
2260; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2261; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
2262; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2263; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2264; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2265; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2266; GFX1032-NEXT:    buffer_gl0_inv
2267; GFX1032-NEXT:  BB11_2:
2268; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2269; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2270; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2271; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2272; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2273; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2274; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2275; GFX1032-NEXT:    s_mov_b32 s2, -1
2276; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2277; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2278; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2279; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2280; GFX1032-NEXT:    s_endpgm
2281entry:
2282  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2283  store i64 %old, i64 addrspace(1)* %out
2284  ret void
2285}
2286
2287define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2288; Uniform operand: %subitive is a kernel argument; the expected code multiplies it (64-bit, via mul_i32/mul_hi/add) by popcount(exec) and has the lane whose mbcnt is 0 issue one ds_sub_rtn_u64.
2289; After exec is restored, each lane subtracts %subitive * mbcnt from the readfirstlane'd return value and stores the 64-bit result to %out.
2290; GFX7LESS-LABEL: sub_i64_uniform:
2291; GFX7LESS:       ; %bb.0: ; %entry
2292; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2293; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2294; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2295; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2296; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2297; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2298; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2299; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2300; GFX7LESS-NEXT:  ; %bb.1:
2301; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2302; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2303; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2304; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2305; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2306; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2307; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2308; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2309; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2310; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2311; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2313; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2314; GFX7LESS-NEXT:  BB12_2:
2315; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2316; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2317; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2318; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2319; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2320; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2321; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2322; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2323; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2324; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2325; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2326; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2327; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2328; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2329; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2330; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2331; GFX7LESS-NEXT:    s_endpgm
2332;
2333; GFX8-LABEL: sub_i64_uniform:
2334; GFX8:       ; %bb.0: ; %entry
2335; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2336; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2337; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2338; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2339; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2340; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2341; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2342; GFX8-NEXT:    s_cbranch_execz BB12_2
2343; GFX8-NEXT:  ; %bb.1:
2344; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2345; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2346; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2347; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2348; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2349; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2350; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2351; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2352; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2353; GFX8-NEXT:    s_mov_b32 m0, -1
2354; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2355; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2356; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2357; GFX8-NEXT:  BB12_2:
2358; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2359; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2360; GFX8-NEXT:    s_mov_b32 s4, s0
2361; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2362; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2363; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2364; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2365; GFX8-NEXT:    s_mov_b32 s5, s1
2366; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2367; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2368; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2369; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2370; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2371; GFX8-NEXT:    s_mov_b32 s6, -1
2372; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2373; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2374; GFX8-NEXT:    s_endpgm
2375;
2376; GFX9-LABEL: sub_i64_uniform:
2377; GFX9:       ; %bb.0: ; %entry
2378; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2379; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2380; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2381; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2382; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2383; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2384; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2385; GFX9-NEXT:    s_cbranch_execz BB12_2
2386; GFX9-NEXT:  ; %bb.1:
2387; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2389; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2390; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2391; GFX9-NEXT:    s_add_i32 s8, s8, s7
2392; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2393; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2394; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2395; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2396; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2397; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2398; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2399; GFX9-NEXT:  BB12_2:
2400; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2401; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2403; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2404; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2405; GFX9-NEXT:    s_mov_b32 s4, s0
2406; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2407; GFX9-NEXT:    s_mov_b32 s5, s1
2408; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2409; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2410; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2411; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2412; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2413; GFX9-NEXT:    s_mov_b32 s6, -1
2414; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2415; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2416; GFX9-NEXT:    s_endpgm
2417;
2418; GFX1064-LABEL: sub_i64_uniform:
2419; GFX1064:       ; %bb.0: ; %entry
2420; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2421; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2422; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2423; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2424; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2425; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2426; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2427; GFX1064-NEXT:    s_cbranch_execz BB12_2
2428; GFX1064-NEXT:  ; %bb.1:
2429; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2430; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2431; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2432; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2433; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2434; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2435; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2436; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2437; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2438; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2439; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2440; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2441; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2442; GFX1064-NEXT:    buffer_gl0_inv
2443; GFX1064-NEXT:  BB12_2:
2444; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2445; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2446; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2447; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2448; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2449; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2450; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2451; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2452; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2453; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2454; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v0
2455; GFX1064-NEXT:    s_mov_b32 s2, -1
2456; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2457; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2458; GFX1064-NEXT:    s_endpgm
2459;
2460; GFX1032-LABEL: sub_i64_uniform:
2461; GFX1032:       ; %bb.0: ; %entry
2462; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2463; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2464; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2465; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2466; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2467; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2468; GFX1032-NEXT:    s_cbranch_execz BB12_2
2469; GFX1032-NEXT:  ; %bb.1:
2470; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2471; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2472; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2473; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2474; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2475; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2476; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2477; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2478; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2479; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2480; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2481; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2482; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2483; GFX1032-NEXT:    buffer_gl0_inv
2484; GFX1032-NEXT:  BB12_2:
2485; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2486; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2487; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2488; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2489; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2490; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2491; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2492; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2493; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2494; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2495; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v0
2496; GFX1032-NEXT:    s_mov_b32 s2, -1
2497; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2498; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2499; GFX1032-NEXT:    s_endpgm
2500entry:
2501  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2502  store i64 %old, i64 addrspace(1)* %out
2503  ret void
2504}
2505
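; NOTE(review): no RUN line in this file enables a "GCN" check prefix, so FileCheck never evaluates the three NOT directives below.
2506; GCN-NOT: v_mbcnt_lo_u32_b32
2507; GCN-NOT: v_mbcnt_hi_u32_b32
2508; GCN-NOT: s_bcnt1_i32_b64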
2509define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2510; Varying operand: the zero-extended workitem id. The expected code for every target is a plain ds_sub_rtn_u64 on v[0:1] (high half set to 0), with no mbcnt/bcnt1 or exec-mask branch.
2511; The value returned by the atomic is stored to %out unchanged.
2512; GFX7LESS-LABEL: sub_i64_varying:
2513; GFX7LESS:       ; %bb.0: ; %entry
2514; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2515; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2516; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2517; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2518; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2519; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2520; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2521; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2522; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2523; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2524; GFX7LESS-NEXT:    s_endpgm
2525;
2526; GFX8-LABEL: sub_i64_varying:
2527; GFX8:       ; %bb.0: ; %entry
2528; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2529; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2530; GFX8-NEXT:    s_mov_b32 m0, -1
2531; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2532; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2533; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2534; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2535; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2536; GFX8-NEXT:    s_mov_b32 s2, -1
2537; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2538; GFX8-NEXT:    s_endpgm
2539;
2540; GFX9-LABEL: sub_i64_varying:
2541; GFX9:       ; %bb.0: ; %entry
2542; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2543; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2544; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2545; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2546; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2547; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2548; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2549; GFX9-NEXT:    s_mov_b32 s2, -1
2550; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2551; GFX9-NEXT:    s_endpgm
2552;
2553; GFX1064-LABEL: sub_i64_varying:
2554; GFX1064:       ; %bb.0: ; %entry
2555; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2556; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2557; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2558; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2559; GFX1064-NEXT:    s_mov_b32 s2, -1
2560; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2561; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2562; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2563; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2564; GFX1064-NEXT:    buffer_gl0_inv
2565; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2566; GFX1064-NEXT:    s_endpgm
2567;
2568; GFX1032-LABEL: sub_i64_varying:
2569; GFX1032:       ; %bb.0: ; %entry
2570; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2571; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2572; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2573; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2574; GFX1032-NEXT:    s_mov_b32 s2, -1
2575; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2576; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2577; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2578; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2579; GFX1032-NEXT:    buffer_gl0_inv
2580; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2581; GFX1032-NEXT:    s_endpgm
2582entry:
2583  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2584  %zext = zext i32 %lane to i64
2585  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2586  store i64 %old, i64 addrspace(1)* %out
2587  ret void
2588}
2589
2590define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2591; Varying 32-bit operand: the workitem id. For the first RUN line (no -mcpu) the expected code is a plain per-lane ds_and_rtn_b32. For the other targets it writes -1 into inactive lanes, combines lanes with v_and_b32_dpp steps,
2592; and reads the wave-wide AND from the last lane (63 or 31) so that only the lane whose mbcnt is 0 issues ds_and_rtn_b32; each lane then ANDs the readfirstlane'd result with its one-lane-shifted partial value and stores it to %out.
2593; GFX7LESS-LABEL: and_i32_varying:
2594; GFX7LESS:       ; %bb.0: ; %entry
2595; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2596; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2597; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2598; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2599; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2600; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2601; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2602; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2603; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2604; GFX7LESS-NEXT:    s_endpgm
2605;
2606; GFX8-LABEL: and_i32_varying:
2607; GFX8:       ; %bb.0: ; %entry
2608; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2609; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2610; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2611; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2612; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2613; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2614; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2615; GFX8-NEXT:    s_not_b64 exec, exec
2616; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2617; GFX8-NEXT:    s_not_b64 exec, exec
2618; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2619; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2620; GFX8-NEXT:    s_nop 1
2621; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2622; GFX8-NEXT:    s_nop 1
2623; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2624; GFX8-NEXT:    s_nop 1
2625; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2626; GFX8-NEXT:    s_nop 1
2627; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2628; GFX8-NEXT:    s_nop 1
2629; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2630; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2631; GFX8-NEXT:    s_nop 0
2632; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2633; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2634; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2635; GFX8-NEXT:    ; implicit-def: $vgpr0
2636; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2637; GFX8-NEXT:    s_cbranch_execz BB14_2
2638; GFX8-NEXT:  ; %bb.1:
2639; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2640; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2641; GFX8-NEXT:    s_mov_b32 m0, -1
2642; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2643; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2644; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2645; GFX8-NEXT:  BB14_2:
2646; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2647; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2648; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2649; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2650; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2651; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2652; GFX8-NEXT:    s_mov_b32 s2, -1
2653; GFX8-NEXT:    s_nop 0
2654; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2655; GFX8-NEXT:    s_endpgm
2656;
2657; GFX9-LABEL: and_i32_varying:
2658; GFX9:       ; %bb.0: ; %entry
2659; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2660; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2661; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2662; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2663; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2664; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2665; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2666; GFX9-NEXT:    s_not_b64 exec, exec
2667; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2668; GFX9-NEXT:    s_not_b64 exec, exec
2669; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2670; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2671; GFX9-NEXT:    s_nop 1
2672; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2673; GFX9-NEXT:    s_nop 1
2674; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2675; GFX9-NEXT:    s_nop 1
2676; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2677; GFX9-NEXT:    s_nop 1
2678; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2679; GFX9-NEXT:    s_nop 1
2680; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2681; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2682; GFX9-NEXT:    s_nop 0
2683; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2684; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2685; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2686; GFX9-NEXT:    ; implicit-def: $vgpr0
2687; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2688; GFX9-NEXT:    s_cbranch_execz BB14_2
2689; GFX9-NEXT:  ; %bb.1:
2690; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2691; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2692; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2693; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2695; GFX9-NEXT:  BB14_2:
2696; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2698; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2699; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2700; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2701; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2702; GFX9-NEXT:    s_mov_b32 s2, -1
2703; GFX9-NEXT:    s_nop 0
2704; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2705; GFX9-NEXT:    s_endpgm
2706;
2707; GFX1064-LABEL: and_i32_varying:
2708; GFX1064:       ; %bb.0: ; %entry
2709; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2710; GFX1064-NEXT:    s_not_b64 exec, exec
2711; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2712; GFX1064-NEXT:    s_not_b64 exec, exec
2713; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2714; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2715; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2716; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2717; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2718; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2719; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2720; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2721; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2722; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2723; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2724; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2725; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2726; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2727; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2728; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2729; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2730; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2731; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2732; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2733; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2734; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2735; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2736; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2737; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2738; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2739; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2740; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2741; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2742; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2743; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2744; GFX1064-NEXT:    s_mov_b32 s2, -1
2745; GFX1064-NEXT:    ; implicit-def: $vgpr0
2746; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2747; GFX1064-NEXT:    s_cbranch_execz BB14_2
2748; GFX1064-NEXT:  ; %bb.1:
2749; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2750; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2751; GFX1064-NEXT:    s_mov_b32 s3, s7
2752; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2753; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2754; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2755; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2756; GFX1064-NEXT:    buffer_gl0_inv
2757; GFX1064-NEXT:  BB14_2:
2758; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2759; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2760; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2761; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2762; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2763; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2764; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2765; GFX1064-NEXT:    s_nop 0
2766; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2767; GFX1064-NEXT:    s_endpgm
2768;
2769; GFX1032-LABEL: and_i32_varying:
2770; GFX1032:       ; %bb.0: ; %entry
2771; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2772; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2773; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2774; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2775; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2776; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2777; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2778; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2779; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2780; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2781; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2782; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2783; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2784; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2785; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2786; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2787; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2788; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2789; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2790; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2791; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2792; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2793; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2794; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2795; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2796; GFX1032-NEXT:    s_mov_b32 s2, -1
2797; GFX1032-NEXT:    ; implicit-def: $vgpr0
2798; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2799; GFX1032-NEXT:    s_cbranch_execz BB14_2
2800; GFX1032-NEXT:  ; %bb.1:
2801; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2802; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2803; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2804; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2805; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2806; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2807; GFX1032-NEXT:    buffer_gl0_inv
2808; GFX1032-NEXT:  BB14_2:
2809; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2810; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2811; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2812; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2813; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2814; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2815; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2816; GFX1032-NEXT:    s_nop 0
2817; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2818; GFX1032-NEXT:    s_endpgm
2819entry:
2820  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2821  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2822  store i32 %old, i32 addrspace(1)* %out
2823  ret void
2824}
2825
; or_i32_varying: atomicrmw 'or' on the addrspace(3) global @local_var32 whose
; operand is the result of llvm.amdgcn.workitem.id.x; the value returned by the
; atomic is stored to %out.
;
; What the autogenerated checks below pin down (regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand):
;  - default target (no -mcpu): one ds_or_rtn_b32 fed directly from v0, with no
;    DPP or mbcnt sequence.
;  - tonga and gfx900: a temporary is set to 0 under an inverted exec mask, a
;    v_or_b32_dpp row_shr/row_bcast scan runs with exec forced to -1, lane 63 of
;    the scan is read into an SGPR, and ds_or_rtn_b32 is issued only where the
;    mbcnt result compares equal to 0. Afterwards v_readfirstlane_b32 plus a
;    final v_or_b32 produce the value that is stored.
;  - gfx1010 (wave64 and wave32 runs): same overall shape, but the cross-row
;    steps use v_permlanex16_b32 / v_readlane_b32 / v_writelane_b32 and no
;    row_bcast.
; NOTE(review): the 0 seed looks like the identity element for 'or' -- confirm
; against the atomic optimizer pass before relying on that reading.
2826define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2827;
2828;
2829; GFX7LESS-LABEL: or_i32_varying:
2830; GFX7LESS:       ; %bb.0: ; %entry
2831; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2832; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2833; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2834; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2835; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2836; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2838; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2839; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2840; GFX7LESS-NEXT:    s_endpgm
2841;
2842; GFX8-LABEL: or_i32_varying:
2843; GFX8:       ; %bb.0: ; %entry
2844; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2845; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2846; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2847; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2848; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2849; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2850; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2851; GFX8-NEXT:    s_not_b64 exec, exec
2852; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2853; GFX8-NEXT:    s_not_b64 exec, exec
2854; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2855; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2856; GFX8-NEXT:    s_nop 1
2857; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2858; GFX8-NEXT:    s_nop 1
2859; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2860; GFX8-NEXT:    s_nop 1
2861; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2862; GFX8-NEXT:    s_nop 1
2863; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2864; GFX8-NEXT:    s_nop 1
2865; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2866; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2867; GFX8-NEXT:    s_nop 0
2868; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2869; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2870; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2871; GFX8-NEXT:    ; implicit-def: $vgpr0
2872; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2873; GFX8-NEXT:    s_cbranch_execz BB15_2
2874; GFX8-NEXT:  ; %bb.1:
2875; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2876; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2877; GFX8-NEXT:    s_mov_b32 m0, -1
2878; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2879; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2880; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2881; GFX8-NEXT:  BB15_2:
2882; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2883; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2884; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2885; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2886; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2887; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2888; GFX8-NEXT:    s_mov_b32 s2, -1
2889; GFX8-NEXT:    s_nop 0
2890; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2891; GFX8-NEXT:    s_endpgm
2892;
2893; GFX9-LABEL: or_i32_varying:
2894; GFX9:       ; %bb.0: ; %entry
2895; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2896; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2897; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2898; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2899; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2900; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2901; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2902; GFX9-NEXT:    s_not_b64 exec, exec
2903; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2904; GFX9-NEXT:    s_not_b64 exec, exec
2905; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2906; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2907; GFX9-NEXT:    s_nop 1
2908; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2909; GFX9-NEXT:    s_nop 1
2910; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2911; GFX9-NEXT:    s_nop 1
2912; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2913; GFX9-NEXT:    s_nop 1
2914; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2915; GFX9-NEXT:    s_nop 1
2916; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2917; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2918; GFX9-NEXT:    s_nop 0
2919; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2920; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2921; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2922; GFX9-NEXT:    ; implicit-def: $vgpr0
2923; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2924; GFX9-NEXT:    s_cbranch_execz BB15_2
2925; GFX9-NEXT:  ; %bb.1:
2926; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2927; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2928; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2929; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2930; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2931; GFX9-NEXT:  BB15_2:
2932; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2933; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2934; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2935; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2936; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2937; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2938; GFX9-NEXT:    s_mov_b32 s2, -1
2939; GFX9-NEXT:    s_nop 0
2940; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2941; GFX9-NEXT:    s_endpgm
2942;
2943; GFX1064-LABEL: or_i32_varying:
2944; GFX1064:       ; %bb.0: ; %entry
2945; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2946; GFX1064-NEXT:    s_not_b64 exec, exec
2947; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2948; GFX1064-NEXT:    s_not_b64 exec, exec
2949; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2950; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2951; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2952; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2953; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2954; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2955; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2956; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2957; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2958; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2959; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2960; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2961; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2962; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2963; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2964; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2965; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2966; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2967; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2968; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2969; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2970; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2971; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2972; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2973; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2974; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2975; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2976; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2977; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2978; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2979; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2980; GFX1064-NEXT:    s_mov_b32 s2, -1
2981; GFX1064-NEXT:    ; implicit-def: $vgpr0
2982; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2983; GFX1064-NEXT:    s_cbranch_execz BB15_2
2984; GFX1064-NEXT:  ; %bb.1:
2985; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2986; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2987; GFX1064-NEXT:    s_mov_b32 s3, s7
2988; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2989; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2990; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
2991; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2992; GFX1064-NEXT:    buffer_gl0_inv
2993; GFX1064-NEXT:  BB15_2:
2994; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2995; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2996; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2997; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2998; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2999; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3000; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3001; GFX1064-NEXT:    s_nop 0
3002; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3003; GFX1064-NEXT:    s_endpgm
3004;
3005; GFX1032-LABEL: or_i32_varying:
3006; GFX1032:       ; %bb.0: ; %entry
3007; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3008; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3009; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3010; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3011; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3012; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3013; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3014; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3015; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3016; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3017; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3018; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3019; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3020; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3021; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3022; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3023; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3024; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3025; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3026; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3027; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3028; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3029; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3030; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3031; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3032; GFX1032-NEXT:    s_mov_b32 s2, -1
3033; GFX1032-NEXT:    ; implicit-def: $vgpr0
3034; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3035; GFX1032-NEXT:    s_cbranch_execz BB15_2
3036; GFX1032-NEXT:  ; %bb.1:
3037; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3038; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3039; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3040; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3041; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
3042; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3043; GFX1032-NEXT:    buffer_gl0_inv
3044; GFX1032-NEXT:  BB15_2:
3045; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3046; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3047; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3048; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3049; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3050; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3051; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3052; GFX1032-NEXT:    s_nop 0
3053; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3054; GFX1032-NEXT:    s_endpgm
3055entry:
3056  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3057  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3058  store i32 %old, i32 addrspace(1)* %out
3059  ret void
3060}
3061
; xor_i32_varying: atomicrmw 'xor' on the addrspace(3) global @local_var32
; whose operand is the result of llvm.amdgcn.workitem.id.x; the value returned
; by the atomic is stored to %out.
;
; What the autogenerated checks below pin down (regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand):
;  - default target (no -mcpu): one ds_xor_rtn_b32 fed directly from v0, with
;    no DPP or mbcnt sequence.
;  - tonga and gfx900: a temporary is set to 0 under an inverted exec mask, a
;    v_xor_b32_dpp row_shr/row_bcast scan runs with exec forced to -1, lane 63
;    of the scan is read into an SGPR, and ds_xor_rtn_b32 is issued only where
;    the mbcnt result compares equal to 0. Afterwards v_readfirstlane_b32 plus
;    a final v_xor_b32 produce the value that is stored.
;  - gfx1010 (wave64 and wave32 runs): same overall shape, but the cross-row
;    steps use v_permlanex16_b32 / v_readlane_b32 / v_writelane_b32 and no
;    row_bcast.
; NOTE(review): the 0 seed looks like the identity element for 'xor' -- confirm
; against the atomic optimizer pass before relying on that reading.
3062define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3063;
3064;
3065; GFX7LESS-LABEL: xor_i32_varying:
3066; GFX7LESS:       ; %bb.0: ; %entry
3067; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3068; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3069; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3070; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3071; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3072; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3073; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3074; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3075; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3076; GFX7LESS-NEXT:    s_endpgm
3077;
3078; GFX8-LABEL: xor_i32_varying:
3079; GFX8:       ; %bb.0: ; %entry
3080; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3081; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3082; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3083; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3084; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3085; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3086; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3087; GFX8-NEXT:    s_not_b64 exec, exec
3088; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3089; GFX8-NEXT:    s_not_b64 exec, exec
3090; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3091; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3092; GFX8-NEXT:    s_nop 1
3093; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3094; GFX8-NEXT:    s_nop 1
3095; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3096; GFX8-NEXT:    s_nop 1
3097; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3098; GFX8-NEXT:    s_nop 1
3099; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3100; GFX8-NEXT:    s_nop 1
3101; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3102; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3103; GFX8-NEXT:    s_nop 0
3104; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3105; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3106; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3107; GFX8-NEXT:    ; implicit-def: $vgpr0
3108; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3109; GFX8-NEXT:    s_cbranch_execz BB16_2
3110; GFX8-NEXT:  ; %bb.1:
3111; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3112; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3113; GFX8-NEXT:    s_mov_b32 m0, -1
3114; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3115; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3116; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX8-NEXT:  BB16_2:
3118; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3119; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3120; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3121; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3122; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3123; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3124; GFX8-NEXT:    s_mov_b32 s2, -1
3125; GFX8-NEXT:    s_nop 0
3126; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3127; GFX8-NEXT:    s_endpgm
3128;
3129; GFX9-LABEL: xor_i32_varying:
3130; GFX9:       ; %bb.0: ; %entry
3131; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3132; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3133; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3134; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3135; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3136; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3137; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3138; GFX9-NEXT:    s_not_b64 exec, exec
3139; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3140; GFX9-NEXT:    s_not_b64 exec, exec
3141; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3142; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3143; GFX9-NEXT:    s_nop 1
3144; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3145; GFX9-NEXT:    s_nop 1
3146; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3147; GFX9-NEXT:    s_nop 1
3148; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3149; GFX9-NEXT:    s_nop 1
3150; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3151; GFX9-NEXT:    s_nop 1
3152; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3153; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3154; GFX9-NEXT:    s_nop 0
3155; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3156; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3157; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3158; GFX9-NEXT:    ; implicit-def: $vgpr0
3159; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3160; GFX9-NEXT:    s_cbranch_execz BB16_2
3161; GFX9-NEXT:  ; %bb.1:
3162; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3163; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3165; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3167; GFX9-NEXT:  BB16_2:
3168; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3170; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3171; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3172; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3173; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3174; GFX9-NEXT:    s_mov_b32 s2, -1
3175; GFX9-NEXT:    s_nop 0
3176; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3177; GFX9-NEXT:    s_endpgm
3178;
3179; GFX1064-LABEL: xor_i32_varying:
3180; GFX1064:       ; %bb.0: ; %entry
3181; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3182; GFX1064-NEXT:    s_not_b64 exec, exec
3183; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3184; GFX1064-NEXT:    s_not_b64 exec, exec
3185; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3186; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3187; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3188; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3189; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3190; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3191; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3192; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3193; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3194; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3195; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3196; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3197; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3198; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3199; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3200; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3201; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3202; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3203; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3204; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3205; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3206; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3207; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3208; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3209; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3210; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3211; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3212; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3213; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3214; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3215; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3216; GFX1064-NEXT:    s_mov_b32 s2, -1
3217; GFX1064-NEXT:    ; implicit-def: $vgpr0
3218; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3219; GFX1064-NEXT:    s_cbranch_execz BB16_2
3220; GFX1064-NEXT:  ; %bb.1:
3221; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3222; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3223; GFX1064-NEXT:    s_mov_b32 s3, s7
3224; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3225; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3226; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3227; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3228; GFX1064-NEXT:    buffer_gl0_inv
3229; GFX1064-NEXT:  BB16_2:
3230; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3231; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3232; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3233; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3234; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3235; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3236; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3237; GFX1064-NEXT:    s_nop 0
3238; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3239; GFX1064-NEXT:    s_endpgm
3240;
3241; GFX1032-LABEL: xor_i32_varying:
3242; GFX1032:       ; %bb.0: ; %entry
3243; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3244; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3245; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3246; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3247; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3248; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3249; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3250; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3251; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3252; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3253; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3254; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3255; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3256; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3257; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3258; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3259; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3260; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3261; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3262; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3263; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3264; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3265; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3266; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3267; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3268; GFX1032-NEXT:    s_mov_b32 s2, -1
3269; GFX1032-NEXT:    ; implicit-def: $vgpr0
3270; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3271; GFX1032-NEXT:    s_cbranch_execz BB16_2
3272; GFX1032-NEXT:  ; %bb.1:
3273; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3274; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3275; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3276; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3277; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3278; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3279; GFX1032-NEXT:    buffer_gl0_inv
3280; GFX1032-NEXT:  BB16_2:
3281; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3282; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3283; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3284; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3285; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3286; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3287; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3288; GFX1032-NEXT:    s_nop 0
3289; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3290; GFX1032-NEXT:    s_endpgm
3291entry:
3292  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3293  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3294  store i32 %old, i32 addrspace(1)* %out
3295  ret void
3296}
3297
3298define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3299;
3300;
3301; GFX7LESS-LABEL: max_i32_varying:
3302; GFX7LESS:       ; %bb.0: ; %entry
3303; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3304; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3305; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3306; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3307; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3308; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3310; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3311; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3312; GFX7LESS-NEXT:    s_endpgm
3313;
3314; GFX8-LABEL: max_i32_varying:
3315; GFX8:       ; %bb.0: ; %entry
3316; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3317; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3318; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3319; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3320; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3321; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3322; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3323; GFX8-NEXT:    s_not_b64 exec, exec
3324; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3325; GFX8-NEXT:    s_not_b64 exec, exec
3326; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3327; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3328; GFX8-NEXT:    s_nop 1
3329; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3330; GFX8-NEXT:    s_nop 1
3331; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3332; GFX8-NEXT:    s_nop 1
3333; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3334; GFX8-NEXT:    s_nop 1
3335; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3336; GFX8-NEXT:    s_nop 1
3337; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3338; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3339; GFX8-NEXT:    s_nop 0
3340; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3341; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3342; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3343; GFX8-NEXT:    ; implicit-def: $vgpr0
3344; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3345; GFX8-NEXT:    s_cbranch_execz BB17_2
3346; GFX8-NEXT:  ; %bb.1:
3347; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3348; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3349; GFX8-NEXT:    s_mov_b32 m0, -1
3350; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3351; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3352; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3353; GFX8-NEXT:  BB17_2:
3354; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3355; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3356; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3357; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3358; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3359; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3360; GFX8-NEXT:    s_mov_b32 s2, -1
3361; GFX8-NEXT:    s_nop 0
3362; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3363; GFX8-NEXT:    s_endpgm
3364;
3365; GFX9-LABEL: max_i32_varying:
3366; GFX9:       ; %bb.0: ; %entry
3367; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3368; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3369; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3370; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3371; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3372; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3373; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3374; GFX9-NEXT:    s_not_b64 exec, exec
3375; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3376; GFX9-NEXT:    s_not_b64 exec, exec
3377; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3378; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3379; GFX9-NEXT:    s_nop 1
3380; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3381; GFX9-NEXT:    s_nop 1
3382; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3383; GFX9-NEXT:    s_nop 1
3384; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3385; GFX9-NEXT:    s_nop 1
3386; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3387; GFX9-NEXT:    s_nop 1
3388; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3389; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3390; GFX9-NEXT:    s_nop 0
3391; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3392; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3393; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3394; GFX9-NEXT:    ; implicit-def: $vgpr0
3395; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3396; GFX9-NEXT:    s_cbranch_execz BB17_2
3397; GFX9-NEXT:  ; %bb.1:
3398; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3399; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3400; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3401; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3403; GFX9-NEXT:  BB17_2:
3404; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3405; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3406; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3407; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3408; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3409; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3410; GFX9-NEXT:    s_mov_b32 s2, -1
3411; GFX9-NEXT:    s_nop 0
3412; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3413; GFX9-NEXT:    s_endpgm
3414;
3415; GFX1064-LABEL: max_i32_varying:
3416; GFX1064:       ; %bb.0: ; %entry
3417; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3418; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3419; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3420; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3421; GFX1064-NEXT:    s_not_b64 exec, exec
3422; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3423; GFX1064-NEXT:    s_not_b64 exec, exec
3424; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3425; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3426; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3427; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3428; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3429; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3430; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3431; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3432; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3433; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3434; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3435; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3436; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3437; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3438; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3439; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3440; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3441; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3442; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3443; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3444; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3445; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3446; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3447; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3448; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3449; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3450; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3451; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3452; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3453; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3454; GFX1064-NEXT:    s_mov_b32 s2, -1
3455; GFX1064-NEXT:    ; implicit-def: $vgpr0
3456; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3457; GFX1064-NEXT:    s_cbranch_execz BB17_2
3458; GFX1064-NEXT:  ; %bb.1:
3459; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3460; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3461; GFX1064-NEXT:    s_mov_b32 s3, s7
3462; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3463; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3464; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3465; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3466; GFX1064-NEXT:    buffer_gl0_inv
3467; GFX1064-NEXT:  BB17_2:
3468; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3469; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3470; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3471; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3472; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3473; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3474; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3475; GFX1064-NEXT:    s_nop 0
3476; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3477; GFX1064-NEXT:    s_endpgm
3478;
3479; GFX1032-LABEL: max_i32_varying:
3480; GFX1032:       ; %bb.0: ; %entry
3481; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3482; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3483; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3484; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3485; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3486; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3487; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3488; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3489; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3490; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3491; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3492; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3493; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3494; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3495; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3496; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3497; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3498; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3499; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3500; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3501; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3502; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3503; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3504; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3505; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3506; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3507; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3508; GFX1032-NEXT:    s_mov_b32 s2, -1
3509; GFX1032-NEXT:    ; implicit-def: $vgpr0
3510; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3511; GFX1032-NEXT:    s_cbranch_execz BB17_2
3512; GFX1032-NEXT:  ; %bb.1:
3513; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3514; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3515; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3516; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3517; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3518; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX1032-NEXT:    buffer_gl0_inv
3520; GFX1032-NEXT:  BB17_2:
3521; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3522; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3523; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3524; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3525; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3526; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3527; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX1032-NEXT:    s_nop 0
3529; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3530; GFX1032-NEXT:    s_endpgm
3531entry:
3532  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3533  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3534  store i32 %old, i32 addrspace(1)* %out
3535  ret void
3536}
3537
; max_i64_constant: signed 64-bit `atomicrmw max` of the uniform constant 5 on
; the addrspace(3) global @local_var64; the value returned by the atomic is
; stored to %out.
;
; What the assertions below pin down (same shape for every RUN line):
;   * v_mbcnt_lo (plus v_mbcnt_hi in the wave64 runs) over exec, then
;     v_cmp_eq 0 and s_and_saveexec, so only the lane whose count is 0 issues
;     ds_max_rtn_i64 with the operand 5.
;   * After exec is restored, v_readfirstlane broadcasts the returned value and
;     every lane computes max(returned, vcc ? 0x8000000000000000 : 5) with
;     v_cmp_gt_i64 and two v_cndmask; vcc there is still the "count == 0" mask.
;   * The high half 0x80000000 is produced by v_bfrev_b32 of 1 in the
;     pre-gfx10 runs and appears as a literal operand in the gfx1010 runs.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
3538define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3539;
3540;
3541; GFX7LESS-LABEL: max_i64_constant:
3542; GFX7LESS:       ; %bb.0: ; %entry
3543; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3544; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3545; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3546; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3547; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3548; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3549; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3550; GFX7LESS-NEXT:  ; %bb.1:
3551; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3552; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3553; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3554; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3555; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3556; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3557; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3558; GFX7LESS-NEXT:  BB18_2:
3559; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3560; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3561; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3562; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3563; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3564; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3565; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3566; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3567; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3568; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3569; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3570; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3571; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3572; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3573; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3574; GFX7LESS-NEXT:    s_endpgm
3575;
3576; GFX8-LABEL: max_i64_constant:
3577; GFX8:       ; %bb.0: ; %entry
3578; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3579; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3580; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3581; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3582; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3583; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3584; GFX8-NEXT:    s_cbranch_execz BB18_2
3585; GFX8-NEXT:  ; %bb.1:
3586; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3587; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3588; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3589; GFX8-NEXT:    s_mov_b32 m0, -1
3590; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3591; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3592; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3593; GFX8-NEXT:  BB18_2:
3594; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3595; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3596; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3597; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3598; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3599; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3600; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3601; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3602; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3603; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3604; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3605; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3606; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3607; GFX8-NEXT:    s_mov_b32 s2, -1
3608; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3609; GFX8-NEXT:    s_endpgm
3610;
3611; GFX9-LABEL: max_i64_constant:
3612; GFX9:       ; %bb.0: ; %entry
3613; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3614; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3615; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3616; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3617; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3618; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3619; GFX9-NEXT:    s_cbranch_execz BB18_2
3620; GFX9-NEXT:  ; %bb.1:
3621; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3622; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3623; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3625; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3626; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3627; GFX9-NEXT:  BB18_2:
3628; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3631; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3632; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3633; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3634; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3635; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3636; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3637; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3638; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3639; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3640; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3641; GFX9-NEXT:    s_mov_b32 s2, -1
3642; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3643; GFX9-NEXT:    s_endpgm
3644;
3645; GFX1064-LABEL: max_i64_constant:
3646; GFX1064:       ; %bb.0: ; %entry
3647; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3648; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3649; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3650; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3651; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3652; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3653; GFX1064-NEXT:    s_cbranch_execz BB18_2
3654; GFX1064-NEXT:  ; %bb.1:
3655; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3656; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3657; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3658; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3659; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3660; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3661; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3662; GFX1064-NEXT:    buffer_gl0_inv
3663; GFX1064-NEXT:  BB18_2:
3664; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3665; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3666; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3667; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3668; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3669; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3670; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3671; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3672; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3673; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3674; GFX1064-NEXT:    s_mov_b32 s2, -1
3675; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3677; GFX1064-NEXT:    s_endpgm
3678;
3679; GFX1032-LABEL: max_i64_constant:
3680; GFX1032:       ; %bb.0: ; %entry
3681; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3682; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3683; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3684; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3685; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3686; GFX1032-NEXT:    s_cbranch_execz BB18_2
3687; GFX1032-NEXT:  ; %bb.1:
3688; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3689; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3690; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3691; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3692; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3693; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3694; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3695; GFX1032-NEXT:    buffer_gl0_inv
3696; GFX1032-NEXT:  BB18_2:
3697; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3698; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3699; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3700; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3701; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3702; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3703; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3704; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3705; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3706; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3707; GFX1032-NEXT:    s_mov_b32 s2, -1
3708; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3709; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3710; GFX1032-NEXT:    s_endpgm
3711entry:
3712  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3713  store i64 %old, i64 addrspace(1)* %out
3714  ret void
3715}
3716
; min_i32_varying: signed 32-bit `atomicrmw min` on the addrspace(3) global
; @local_var32 whose operand is the work-item id (so it differs per lane); the
; value returned by the atomic is stored to %out.
;
; What the assertions below pin down:
;   * In the run without -mcpu the atomic is left as is: ds_min_rtn_i32 is
;     issued directly with v0 and there is no exec masking.
;   * In the tonga and gfx900 runs, lanes that were inactive are first filled
;     with the value built by v_bfrev_b32 of -2 (0x7fffffff), then a chain of
;     v_min_i32_dpp steps (row_shr 1/2/4/8, row_bcast 15/31) combines the
;     wave. Lane 63 is read with v_readlane as the atomic's operand, and a
;     wave_shr:1 v_mov_b32_dpp writes the shifted scan into v1.
;   * The gfx1010 runs use v_permlanex16_b32 plus v_readlane/v_writelane in
;     place of the row_bcast steps; the operand comes from lane 63 in the
;     wave64 run and lane 31 in the wave32 run.
;   * In the rewritten forms only the lane with mbcnt result 0 issues
;     ds_min_rtn_i32; afterwards every lane computes
;     v_min_i32(v_readfirstlane(result), v1) and stores it.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
3717define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3718;
3719;
3720; GFX7LESS-LABEL: min_i32_varying:
3721; GFX7LESS:       ; %bb.0: ; %entry
3722; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3723; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3724; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3725; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3726; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3727; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3728; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3729; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3730; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3731; GFX7LESS-NEXT:    s_endpgm
3732;
3733; GFX8-LABEL: min_i32_varying:
3734; GFX8:       ; %bb.0: ; %entry
3735; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3736; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3737; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3738; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3739; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3740; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3741; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3742; GFX8-NEXT:    s_not_b64 exec, exec
3743; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3744; GFX8-NEXT:    s_not_b64 exec, exec
3745; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3746; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3747; GFX8-NEXT:    s_nop 1
3748; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3749; GFX8-NEXT:    s_nop 1
3750; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3751; GFX8-NEXT:    s_nop 1
3752; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3753; GFX8-NEXT:    s_nop 1
3754; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3755; GFX8-NEXT:    s_nop 1
3756; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3757; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3758; GFX8-NEXT:    s_nop 0
3759; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3760; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3761; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3762; GFX8-NEXT:    ; implicit-def: $vgpr0
3763; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3764; GFX8-NEXT:    s_cbranch_execz BB19_2
3765; GFX8-NEXT:  ; %bb.1:
3766; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3767; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3768; GFX8-NEXT:    s_mov_b32 m0, -1
3769; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3771; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3772; GFX8-NEXT:  BB19_2:
3773; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3774; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3775; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3776; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3777; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3778; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3779; GFX8-NEXT:    s_mov_b32 s2, -1
3780; GFX8-NEXT:    s_nop 0
3781; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3782; GFX8-NEXT:    s_endpgm
3783;
3784; GFX9-LABEL: min_i32_varying:
3785; GFX9:       ; %bb.0: ; %entry
3786; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3787; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3788; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3789; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3790; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3791; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3792; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3793; GFX9-NEXT:    s_not_b64 exec, exec
3794; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3795; GFX9-NEXT:    s_not_b64 exec, exec
3796; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3797; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3798; GFX9-NEXT:    s_nop 1
3799; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3800; GFX9-NEXT:    s_nop 1
3801; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3802; GFX9-NEXT:    s_nop 1
3803; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3804; GFX9-NEXT:    s_nop 1
3805; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3806; GFX9-NEXT:    s_nop 1
3807; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3808; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3809; GFX9-NEXT:    s_nop 0
3810; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3811; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3812; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3813; GFX9-NEXT:    ; implicit-def: $vgpr0
3814; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3815; GFX9-NEXT:    s_cbranch_execz BB19_2
3816; GFX9-NEXT:  ; %bb.1:
3817; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3818; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3820; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3821; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3822; GFX9-NEXT:  BB19_2:
3823; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3825; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3826; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3827; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3828; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3829; GFX9-NEXT:    s_mov_b32 s2, -1
3830; GFX9-NEXT:    s_nop 0
3831; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3832; GFX9-NEXT:    s_endpgm
3833;
3834; GFX1064-LABEL: min_i32_varying:
3835; GFX1064:       ; %bb.0: ; %entry
3836; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3837; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3838; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3839; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3840; GFX1064-NEXT:    s_not_b64 exec, exec
3841; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3842; GFX1064-NEXT:    s_not_b64 exec, exec
3843; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3844; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3845; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3846; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3847; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3848; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3849; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3850; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3851; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3852; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3853; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3854; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3855; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3856; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3857; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3858; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3859; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3860; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3861; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3862; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3863; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3864; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3865; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3866; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3867; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3868; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3869; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3870; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3871; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3872; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3873; GFX1064-NEXT:    s_mov_b32 s2, -1
3874; GFX1064-NEXT:    ; implicit-def: $vgpr0
3875; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3876; GFX1064-NEXT:    s_cbranch_execz BB19_2
3877; GFX1064-NEXT:  ; %bb.1:
3878; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3879; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3880; GFX1064-NEXT:    s_mov_b32 s3, s7
3881; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3882; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3883; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3884; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3885; GFX1064-NEXT:    buffer_gl0_inv
3886; GFX1064-NEXT:  BB19_2:
3887; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3888; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3889; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3890; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3891; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3892; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3893; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3894; GFX1064-NEXT:    s_nop 0
3895; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3896; GFX1064-NEXT:    s_endpgm
3897;
3898; GFX1032-LABEL: min_i32_varying:
3899; GFX1032:       ; %bb.0: ; %entry
3900; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3901; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3902; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3903; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3904; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3905; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3906; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3907; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3908; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3909; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3910; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3911; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3912; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3913; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3914; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3915; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3916; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3917; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3918; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3919; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3920; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3921; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3922; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3923; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3924; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3925; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3926; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3927; GFX1032-NEXT:    s_mov_b32 s2, -1
3928; GFX1032-NEXT:    ; implicit-def: $vgpr0
3929; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3930; GFX1032-NEXT:    s_cbranch_execz BB19_2
3931; GFX1032-NEXT:  ; %bb.1:
3932; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3933; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3934; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3935; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3936; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
3937; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3938; GFX1032-NEXT:    buffer_gl0_inv
3939; GFX1032-NEXT:  BB19_2:
3940; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3941; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3942; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3943; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3944; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3945; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3946; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3947; GFX1032-NEXT:    s_nop 0
3948; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3949; GFX1032-NEXT:    s_endpgm
3950entry:
3951  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3952  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3953  store i32 %old, i32 addrspace(1)* %out
3954  ret void
3955}
3956
; min_i64_constant: signed 64-bit `atomicrmw min` of the uniform constant 5 on
; the addrspace(3) global @local_var64; the value returned by the atomic is
; stored to %out.
;
; What the assertions below pin down (same shape for every RUN line):
;   * v_mbcnt_lo (plus v_mbcnt_hi in the wave64 runs) over exec, then
;     v_cmp_eq 0 and s_and_saveexec, so only the lane whose count is 0 issues
;     ds_min_rtn_i64 with the operand 5.
;   * After exec is restored, v_readfirstlane broadcasts the returned value and
;     every lane computes min(returned, vcc ? 0x7fffffffffffffff : 5) with
;     v_cmp_lt_i64 and two v_cndmask; vcc there is still the "count == 0" mask.
;   * The high half 0x7fffffff is produced by v_bfrev_b32 of -2 in the
;     pre-gfx10 runs and appears as a literal operand in the gfx1010 runs; the
;     low half is the inline constant -1.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
3957define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3958;
3959;
3960; GFX7LESS-LABEL: min_i64_constant:
3961; GFX7LESS:       ; %bb.0: ; %entry
3962; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3963; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3964; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3965; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3966; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3967; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3968; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
3969; GFX7LESS-NEXT:  ; %bb.1:
3970; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3971; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3972; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3973; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3974; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3975; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3976; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3977; GFX7LESS-NEXT:  BB20_2:
3978; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3979; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3980; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3981; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3982; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3983; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3984; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3985; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3986; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3987; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3988; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3989; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3990; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3991; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3992; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3993; GFX7LESS-NEXT:    s_endpgm
3994;
3995; GFX8-LABEL: min_i64_constant:
3996; GFX8:       ; %bb.0: ; %entry
3997; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3998; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3999; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4000; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4001; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4002; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4003; GFX8-NEXT:    s_cbranch_execz BB20_2
4004; GFX8-NEXT:  ; %bb.1:
4005; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4006; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4007; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4008; GFX8-NEXT:    s_mov_b32 m0, -1
4009; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4010; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4011; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4012; GFX8-NEXT:  BB20_2:
4013; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4014; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4015; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4016; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
4017; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4018; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4019; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4020; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4021; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4022; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4023; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4024; GFX8-NEXT:    s_mov_b32 s2, -1
4025; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4026; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4027; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4028; GFX8-NEXT:    s_endpgm
4029;
4030; GFX9-LABEL: min_i64_constant:
4031; GFX9:       ; %bb.0: ; %entry
4032; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4033; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4034; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4035; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4036; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4037; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4038; GFX9-NEXT:    s_cbranch_execz BB20_2
4039; GFX9-NEXT:  ; %bb.1:
4040; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4041; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4042; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4043; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4044; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4045; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4046; GFX9-NEXT:  BB20_2:
4047; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4048; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4049; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4050; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4051; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4052; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4053; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4054; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4055; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4056; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4057; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4058; GFX9-NEXT:    s_mov_b32 s2, -1
4059; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4060; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4061; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4062; GFX9-NEXT:    s_endpgm
4063;
4064; GFX1064-LABEL: min_i64_constant:
4065; GFX1064:       ; %bb.0: ; %entry
4066; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4067; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4068; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4069; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4070; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4071; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4072; GFX1064-NEXT:    s_cbranch_execz BB20_2
4073; GFX1064-NEXT:  ; %bb.1:
4074; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4075; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4076; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4077; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4078; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4079; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4080; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4081; GFX1064-NEXT:    buffer_gl0_inv
4082; GFX1064-NEXT:  BB20_2:
4083; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4084; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4085; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4086; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4087; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4088; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4089; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
4090; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4091; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4092; GFX1064-NEXT:    s_mov_b32 s2, -1
4093; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4094; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4095; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4096; GFX1064-NEXT:    s_endpgm
4097;
4098; GFX1032-LABEL: min_i64_constant:
4099; GFX1032:       ; %bb.0: ; %entry
4100; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4101; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4102; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4103; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4104; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4105; GFX1032-NEXT:    s_cbranch_execz BB20_2
4106; GFX1032-NEXT:  ; %bb.1:
4107; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4108; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4109; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4110; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4111; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4112; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4113; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4114; GFX1032-NEXT:    buffer_gl0_inv
4115; GFX1032-NEXT:  BB20_2:
4116; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4117; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4118; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4119; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4120; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4121; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4122; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
4123; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4124; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4125; GFX1032-NEXT:    s_mov_b32 s2, -1
4126; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4127; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4128; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4129; GFX1032-NEXT:    s_endpgm
4130entry:
4131  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4132  store i64 %old, i64 addrspace(1)* %out
4133  ret void
4134}
4135
; umax_i32_varying: performs an acq_rel `atomicrmw umax` on the addrspace(3)
; global @local_var32 with the per-lane workitem id as the operand, then stores
; the returned old value to %out.
; The autogenerated checks below show the GFX7LESS run issuing ds_max_rtn_u32
; directly, while the GFX8, GFX9, GFX1064 and GFX1032 runs first combine the
; lanes with v_max_u32_dpp and only execute a single ds_max_rtn_u32 under the
; `v_cmp_eq_u32 ... 0, v0` (mbcnt result) exec mask, followed by
; v_readfirstlane_b32 and a final v_max_u32 per lane.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
4136define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
4137;
4138;
4139; GFX7LESS-LABEL: umax_i32_varying:
4140; GFX7LESS:       ; %bb.0: ; %entry
4141; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4142; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4143; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4144; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4145; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4146; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4147; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4148; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4149; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4150; GFX7LESS-NEXT:    s_endpgm
4151;
4152; GFX8-LABEL: umax_i32_varying:
4153; GFX8:       ; %bb.0: ; %entry
4154; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4155; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4156; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4157; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4158; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4159; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4160; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4161; GFX8-NEXT:    s_not_b64 exec, exec
4162; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4163; GFX8-NEXT:    s_not_b64 exec, exec
4164; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4165; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4166; GFX8-NEXT:    s_nop 1
4167; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4168; GFX8-NEXT:    s_nop 1
4169; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4170; GFX8-NEXT:    s_nop 1
4171; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4172; GFX8-NEXT:    s_nop 1
4173; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4174; GFX8-NEXT:    s_nop 1
4175; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4176; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4177; GFX8-NEXT:    s_nop 0
4178; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4179; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4180; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4181; GFX8-NEXT:    ; implicit-def: $vgpr0
4182; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4183; GFX8-NEXT:    s_cbranch_execz BB21_2
4184; GFX8-NEXT:  ; %bb.1:
4185; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4186; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4187; GFX8-NEXT:    s_mov_b32 m0, -1
4188; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4189; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4190; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4191; GFX8-NEXT:  BB21_2:
4192; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4193; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4194; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4195; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4196; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4197; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4198; GFX8-NEXT:    s_mov_b32 s2, -1
4199; GFX8-NEXT:    s_nop 0
4200; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4201; GFX8-NEXT:    s_endpgm
4202;
4203; GFX9-LABEL: umax_i32_varying:
4204; GFX9:       ; %bb.0: ; %entry
4205; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4206; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4207; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4208; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4209; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4210; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4211; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4212; GFX9-NEXT:    s_not_b64 exec, exec
4213; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4214; GFX9-NEXT:    s_not_b64 exec, exec
4215; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4216; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4217; GFX9-NEXT:    s_nop 1
4218; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4219; GFX9-NEXT:    s_nop 1
4220; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4221; GFX9-NEXT:    s_nop 1
4222; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4223; GFX9-NEXT:    s_nop 1
4224; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4225; GFX9-NEXT:    s_nop 1
4226; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4227; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4228; GFX9-NEXT:    s_nop 0
4229; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4230; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4231; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4232; GFX9-NEXT:    ; implicit-def: $vgpr0
4233; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4234; GFX9-NEXT:    s_cbranch_execz BB21_2
4235; GFX9-NEXT:  ; %bb.1:
4236; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4237; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4238; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4239; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4241; GFX9-NEXT:  BB21_2:
4242; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4243; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4244; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4245; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4246; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4247; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4248; GFX9-NEXT:    s_mov_b32 s2, -1
4249; GFX9-NEXT:    s_nop 0
4250; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4251; GFX9-NEXT:    s_endpgm
4252;
4253; GFX1064-LABEL: umax_i32_varying:
4254; GFX1064:       ; %bb.0: ; %entry
4255; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4256; GFX1064-NEXT:    s_not_b64 exec, exec
4257; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4258; GFX1064-NEXT:    s_not_b64 exec, exec
4259; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4260; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4261; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4262; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4263; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4264; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4265; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4266; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4267; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4268; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4269; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4270; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4271; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4272; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4273; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4274; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4275; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4276; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4277; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4278; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4279; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4280; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4281; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4282; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4283; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4284; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4285; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4286; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4287; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4288; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4289; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4290; GFX1064-NEXT:    s_mov_b32 s2, -1
4291; GFX1064-NEXT:    ; implicit-def: $vgpr0
4292; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4293; GFX1064-NEXT:    s_cbranch_execz BB21_2
4294; GFX1064-NEXT:  ; %bb.1:
4295; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4296; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4297; GFX1064-NEXT:    s_mov_b32 s3, s7
4298; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4299; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4300; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4301; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4302; GFX1064-NEXT:    buffer_gl0_inv
4303; GFX1064-NEXT:  BB21_2:
4304; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4305; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4306; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4307; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4308; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4309; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4310; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4311; GFX1064-NEXT:    s_nop 0
4312; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4313; GFX1064-NEXT:    s_endpgm
4314;
4315; GFX1032-LABEL: umax_i32_varying:
4316; GFX1032:       ; %bb.0: ; %entry
4317; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4318; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4319; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4320; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4321; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4322; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4323; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4324; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4325; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4326; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4327; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4328; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4329; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4330; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4331; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4332; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4333; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4334; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4335; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4336; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4337; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4338; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4339; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4340; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4341; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4342; GFX1032-NEXT:    s_mov_b32 s2, -1
4343; GFX1032-NEXT:    ; implicit-def: $vgpr0
4344; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4345; GFX1032-NEXT:    s_cbranch_execz BB21_2
4346; GFX1032-NEXT:  ; %bb.1:
4347; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4348; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4349; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4350; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4351; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4352; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4353; GFX1032-NEXT:    buffer_gl0_inv
4354; GFX1032-NEXT:  BB21_2:
4355; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4356; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4357; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4358; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4359; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4360; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4361; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4362; GFX1032-NEXT:    s_nop 0
4363; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4364; GFX1032-NEXT:    s_endpgm
4365entry:
4366  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4367  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4368  store i32 %old, i32 addrspace(1)* %out
4369  ret void
4370}
4371
; umax_i64_constant: performs an acq_rel `atomicrmw umax` of the constant
; i64 5 on the addrspace(3) global @local_var64 and stores the returned old
; value to %out.
; In every run below the ds_max_rtn_u64 is executed only under the
; `v_cmp_eq_u32 ... 0, v0` (mbcnt result) exec mask; afterwards the result is
; read back with v_readfirstlane_b32 and combined per lane through
; v_cndmask_b32 / v_cmp_gt_u64. The GFX1032 run uses only v_mbcnt_lo and
; 32-bit exec operations.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
4372define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4373;
4374;
4375; GFX7LESS-LABEL: umax_i64_constant:
4376; GFX7LESS:       ; %bb.0: ; %entry
4377; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4378; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4379; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4380; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4381; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4382; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4383; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4384; GFX7LESS-NEXT:  ; %bb.1:
4385; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4386; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4387; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4388; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4389; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4390; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4391; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4392; GFX7LESS-NEXT:  BB22_2:
4393; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4394; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4395; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4396; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4397; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4398; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4399; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4400; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4401; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4402; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4403; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4404; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4405; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4406; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4407; GFX7LESS-NEXT:    s_endpgm
4408;
4409; GFX8-LABEL: umax_i64_constant:
4410; GFX8:       ; %bb.0: ; %entry
4411; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4412; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4413; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4414; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4415; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4416; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4417; GFX8-NEXT:    s_cbranch_execz BB22_2
4418; GFX8-NEXT:  ; %bb.1:
4419; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4420; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4421; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4422; GFX8-NEXT:    s_mov_b32 m0, -1
4423; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4424; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4425; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4426; GFX8-NEXT:  BB22_2:
4427; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4429; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4430; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4431; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4432; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4433; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4434; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4435; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4436; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4437; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4438; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4439; GFX8-NEXT:    s_mov_b32 s2, -1
4440; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4441; GFX8-NEXT:    s_endpgm
4442;
4443; GFX9-LABEL: umax_i64_constant:
4444; GFX9:       ; %bb.0: ; %entry
4445; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4446; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4447; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4448; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4449; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4450; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4451; GFX9-NEXT:    s_cbranch_execz BB22_2
4452; GFX9-NEXT:  ; %bb.1:
4453; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4454; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4455; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4456; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4457; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4458; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4459; GFX9-NEXT:  BB22_2:
4460; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4461; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4462; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4463; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4464; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4465; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4466; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4467; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4468; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4469; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4470; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4471; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4472; GFX9-NEXT:    s_mov_b32 s2, -1
4473; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4474; GFX9-NEXT:    s_endpgm
4475;
4476; GFX1064-LABEL: umax_i64_constant:
4477; GFX1064:       ; %bb.0: ; %entry
4478; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4479; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4480; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4481; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4482; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4483; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4484; GFX1064-NEXT:    s_cbranch_execz BB22_2
4485; GFX1064-NEXT:  ; %bb.1:
4486; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4487; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4488; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4489; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4490; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4491; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4492; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4493; GFX1064-NEXT:    buffer_gl0_inv
4494; GFX1064-NEXT:  BB22_2:
4495; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4496; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4497; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4498; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4499; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4500; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4501; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4502; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4503; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4504; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4505; GFX1064-NEXT:    s_mov_b32 s2, -1
4506; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4507; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4508; GFX1064-NEXT:    s_endpgm
4509;
4510; GFX1032-LABEL: umax_i64_constant:
4511; GFX1032:       ; %bb.0: ; %entry
4512; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4513; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4514; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4515; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4516; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4517; GFX1032-NEXT:    s_cbranch_execz BB22_2
4518; GFX1032-NEXT:  ; %bb.1:
4519; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4520; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4521; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4522; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4523; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4524; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4525; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4526; GFX1032-NEXT:    buffer_gl0_inv
4527; GFX1032-NEXT:  BB22_2:
4528; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4529; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4530; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4531; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4532; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4533; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4534; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4535; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4536; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4537; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4538; GFX1032-NEXT:    s_mov_b32 s2, -1
4539; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4540; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4541; GFX1032-NEXT:    s_endpgm
4542entry:
4543  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4544  store i64 %old, i64 addrspace(1)* %out
4545  ret void
4546}
4547
; umin_i32_varying: performs an acq_rel `atomicrmw umin` on the addrspace(3)
; global @local_var32 with the per-lane workitem id as the operand, then stores
; the returned old value to %out.
; The autogenerated checks below show the GFX7LESS run issuing ds_min_rtn_u32
; directly, while the GFX8, GFX9, GFX1064 and GFX1032 runs first combine the
; lanes with v_min_u32_dpp (the scan registers are seeded with -1) and only
; execute a single ds_min_rtn_u32 under the `v_cmp_eq_u32 ... 0, v0` (mbcnt
; result) exec mask, followed by v_readfirstlane_b32 and a final v_min_u32
; per lane.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
4548define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4549;
4550;
4551; GFX7LESS-LABEL: umin_i32_varying:
4552; GFX7LESS:       ; %bb.0: ; %entry
4553; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4554; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4555; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4556; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4557; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4558; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4559; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4560; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4561; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4562; GFX7LESS-NEXT:    s_endpgm
4563;
4564; GFX8-LABEL: umin_i32_varying:
4565; GFX8:       ; %bb.0: ; %entry
4566; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4567; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4568; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4569; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4570; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4571; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4572; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4573; GFX8-NEXT:    s_not_b64 exec, exec
4574; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4575; GFX8-NEXT:    s_not_b64 exec, exec
4576; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4577; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4578; GFX8-NEXT:    s_nop 1
4579; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4580; GFX8-NEXT:    s_nop 1
4581; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4582; GFX8-NEXT:    s_nop 1
4583; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4584; GFX8-NEXT:    s_nop 1
4585; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4586; GFX8-NEXT:    s_nop 1
4587; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4588; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4589; GFX8-NEXT:    s_nop 0
4590; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4591; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4592; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4593; GFX8-NEXT:    ; implicit-def: $vgpr0
4594; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4595; GFX8-NEXT:    s_cbranch_execz BB23_2
4596; GFX8-NEXT:  ; %bb.1:
4597; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4598; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4599; GFX8-NEXT:    s_mov_b32 m0, -1
4600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4601; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4602; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4603; GFX8-NEXT:  BB23_2:
4604; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4605; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4606; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4607; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4608; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4609; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4610; GFX8-NEXT:    s_mov_b32 s2, -1
4611; GFX8-NEXT:    s_nop 0
4612; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4613; GFX8-NEXT:    s_endpgm
4614;
4615; GFX9-LABEL: umin_i32_varying:
4616; GFX9:       ; %bb.0: ; %entry
4617; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4618; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4619; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4620; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4621; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4622; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4623; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4624; GFX9-NEXT:    s_not_b64 exec, exec
4625; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4626; GFX9-NEXT:    s_not_b64 exec, exec
4627; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4628; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4629; GFX9-NEXT:    s_nop 1
4630; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4631; GFX9-NEXT:    s_nop 1
4632; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4633; GFX9-NEXT:    s_nop 1
4634; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4635; GFX9-NEXT:    s_nop 1
4636; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4637; GFX9-NEXT:    s_nop 1
4638; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4639; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4640; GFX9-NEXT:    s_nop 0
4641; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4642; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4643; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4644; GFX9-NEXT:    ; implicit-def: $vgpr0
4645; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4646; GFX9-NEXT:    s_cbranch_execz BB23_2
4647; GFX9-NEXT:  ; %bb.1:
4648; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4649; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4650; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4651; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4652; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4653; GFX9-NEXT:  BB23_2:
4654; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4656; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4657; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4658; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4659; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4660; GFX9-NEXT:    s_mov_b32 s2, -1
4661; GFX9-NEXT:    s_nop 0
4662; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4663; GFX9-NEXT:    s_endpgm
4664;
4665; GFX1064-LABEL: umin_i32_varying:
4666; GFX1064:       ; %bb.0: ; %entry
4667; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4668; GFX1064-NEXT:    s_not_b64 exec, exec
4669; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4670; GFX1064-NEXT:    s_not_b64 exec, exec
4671; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4672; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4673; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4674; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4675; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4676; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4677; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4678; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4679; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4680; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4681; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4682; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4683; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4684; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4685; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4686; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4687; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4688; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4689; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4690; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4691; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4692; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4693; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4694; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4695; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4696; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4697; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4698; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4699; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4700; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4701; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4702; GFX1064-NEXT:    s_mov_b32 s2, -1
4703; GFX1064-NEXT:    ; implicit-def: $vgpr0
4704; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4705; GFX1064-NEXT:    s_cbranch_execz BB23_2
4706; GFX1064-NEXT:  ; %bb.1:
4707; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4708; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4709; GFX1064-NEXT:    s_mov_b32 s3, s7
4710; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4711; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4712; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4713; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4714; GFX1064-NEXT:    buffer_gl0_inv
4715; GFX1064-NEXT:  BB23_2:
4716; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4717; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4718; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4719; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4720; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4721; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4722; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4723; GFX1064-NEXT:    s_nop 0
4724; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4725; GFX1064-NEXT:    s_endpgm
4726;
4727; GFX1032-LABEL: umin_i32_varying:
4728; GFX1032:       ; %bb.0: ; %entry
4729; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4730; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4731; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4732; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4733; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4734; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4735; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4736; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4737; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4738; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4739; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4740; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4741; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4742; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4743; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4744; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4745; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4746; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4747; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4748; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4749; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4750; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4751; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4752; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4753; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4754; GFX1032-NEXT:    s_mov_b32 s2, -1
4755; GFX1032-NEXT:    ; implicit-def: $vgpr0
4756; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4757; GFX1032-NEXT:    s_cbranch_execz BB23_2
4758; GFX1032-NEXT:  ; %bb.1:
4759; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4760; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4761; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4762; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4763; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4764; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4765; GFX1032-NEXT:    buffer_gl0_inv
4766; GFX1032-NEXT:  BB23_2:
4767; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4768; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4769; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4770; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4771; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4772; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4773; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4774; GFX1032-NEXT:    s_nop 0
4775; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4776; GFX1032-NEXT:    s_endpgm
4777entry:
4778  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4779  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4780  store i32 %old, i32 addrspace(1)* %out
4781  ret void
4782}
4783
; umin_i64_constant: unsigned-min atomicrmw of the constant 5 on the 64-bit
; addrspace(3) variable @local_var64; the returned old value is stored to %out.
;
; Shape of the expected code in the autogenerated checks below:
;   * only the lane whose mbcnt result is 0 executes ds_min_rtn_u64;
;   * the value it returns is broadcast with v_readfirstlane_b32;
;   * each lane then selects -1 (the lane that issued the atomic) or 5 (every
;     other lane) and keeps the unsigned minimum of that and the broadcast value.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing them by hand.
4784define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4785;
4786;
4787; GFX7LESS-LABEL: umin_i64_constant:
4788; GFX7LESS:       ; %bb.0: ; %entry
4789; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4790; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4791; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4792; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4793; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4794; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4795; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4796; GFX7LESS-NEXT:  ; %bb.1:
4797; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4798; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4799; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4800; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4801; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4802; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4803; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4804; GFX7LESS-NEXT:  BB24_2:
4805; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4806; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4807; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4808; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4809; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4810; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4811; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4812; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4813; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4814; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4815; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4816; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4817; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4818; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4819; GFX7LESS-NEXT:    s_endpgm
4820;
4821; GFX8-LABEL: umin_i64_constant:
4822; GFX8:       ; %bb.0: ; %entry
4823; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4824; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4825; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4826; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4827; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4828; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4829; GFX8-NEXT:    s_cbranch_execz BB24_2
4830; GFX8-NEXT:  ; %bb.1:
4831; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4832; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4833; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4834; GFX8-NEXT:    s_mov_b32 m0, -1
4835; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4836; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4837; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX8-NEXT:  BB24_2:
4839; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4840; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4841; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4842; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4843; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4844; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4845; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4846; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4847; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4848; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4849; GFX8-NEXT:    s_mov_b32 s2, -1
4850; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4851; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4852; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4853; GFX8-NEXT:    s_endpgm
4854;
4855; GFX9-LABEL: umin_i64_constant:
4856; GFX9:       ; %bb.0: ; %entry
4857; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4858; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4859; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4860; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4861; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4862; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4863; GFX9-NEXT:    s_cbranch_execz BB24_2
4864; GFX9-NEXT:  ; %bb.1:
4865; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4866; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4867; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4868; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4869; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4870; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4871; GFX9-NEXT:  BB24_2:
4872; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4873; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4874; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4875; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4876; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4877; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4878; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4879; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4880; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4881; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4882; GFX9-NEXT:    s_mov_b32 s2, -1
4883; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4884; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4885; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4886; GFX9-NEXT:    s_endpgm
4887;
4888; GFX1064-LABEL: umin_i64_constant:
4889; GFX1064:       ; %bb.0: ; %entry
4890; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4891; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4892; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4893; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4894; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4895; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4896; GFX1064-NEXT:    s_cbranch_execz BB24_2
4897; GFX1064-NEXT:  ; %bb.1:
4898; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4899; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4900; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4901; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4902; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4903; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4904; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4905; GFX1064-NEXT:    buffer_gl0_inv
4906; GFX1064-NEXT:  BB24_2:
4907; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4908; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4909; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4910; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4911; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4912; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4913; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4914; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4915; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4916; GFX1064-NEXT:    s_mov_b32 s2, -1
4917; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4918; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4919; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4920; GFX1064-NEXT:    s_endpgm
4921;
4922; GFX1032-LABEL: umin_i64_constant:
4923; GFX1032:       ; %bb.0: ; %entry
4924; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4925; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4926; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4927; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4928; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4929; GFX1032-NEXT:    s_cbranch_execz BB24_2
4930; GFX1032-NEXT:  ; %bb.1:
4931; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4932; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4933; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4934; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4935; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4936; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4937; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4938; GFX1032-NEXT:    buffer_gl0_inv
4939; GFX1032-NEXT:  BB24_2:
4940; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4941; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4942; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4943; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4944; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4945; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4946; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4947; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4948; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4949; GFX1032-NEXT:    s_mov_b32 s2, -1
4950; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4951; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4952; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4953; GFX1032-NEXT:    s_endpgm
4954entry:
  ; %old receives the value @local_var64 held before the unsigned-min with 5;
  ; the operand is a constant rather than a per-lane value.
4955  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
4956  store i64 %old, i64 addrspace(1)* %out
4957  ret void
4958}
4959