1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsic that supplies the addend in @add_i32_varying (the value is named
; %lane there).
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; Atomic targets living in addrspace(3). The initializers are undef, so the
; tests only inspect the generated instruction sequence, not a stored value.
; @local_var32 is the operand of the i32 atomicrmw tests that follow;
; @local_var64 is not referenced in the functions immediately below
; (presumably it is used by i64 variants later in the file; verify).
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; Constant addend: atomicrmw add of the literal 5 to @local_var32 with acq_rel
; ordering; the value returned by the atomic is stored to %out.
;
; What the autogenerated check lines below pin down, for every run line:
;  * v_mbcnt_* over the saved exec mask followed by a compare against 0 and
;    s_and_saveexec, so the DS atomic sits in a block that only the lane whose
;    mbcnt result is 0 enters;
;  * a single ds_add_rtn_u32 whose data operand is s_bcnt1(saved exec) * 5
;    (v_mul_u32_u24);
;  * after exec is restored, v_readfirstlane_b32 of the atomic's result and a
;    v_mad_u32_u24 computing mbcnt * 5 + that value as the per-lane result that
;    is stored.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing by hand.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
31; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
32; GFX7LESS-NEXT:    s_mov_b32 m0, -1
33; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
34; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
35; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
36; GFX7LESS-NEXT:    buffer_wbinvl1
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
40; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
41; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
42; GFX7LESS-NEXT:    s_mov_b32 s2, -1
43; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
61; GFX8-NEXT:    s_mov_b32 m0, -1
62; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
63; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
64; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX8-NEXT:    buffer_wbinvl1_vol
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
69; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
70; GFX8-NEXT:    s_mov_b32 s3, 0xf000
71; GFX8-NEXT:    s_mov_b32 s2, -1
72; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX8-NEXT:    s_nop 0
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
91; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
92; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
93; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
94; GFX9-NEXT:    buffer_wbinvl1_vol
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
98; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
99; GFX9-NEXT:    s_mov_b32 s3, 0xf000
100; GFX9-NEXT:    s_mov_b32 s2, -1
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    s_nop 0
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
119; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
120; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
121; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    buffer_gl0_inv
125; GFX1064-NEXT:    buffer_gl1_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    s_nop 0
135; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
136; GFX1064-NEXT:    s_endpgm
137;
138; GFX1032-LABEL: add_i32_constant:
139; GFX1032:       ; %bb.0: ; %entry
140; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
141; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
142; GFX1032-NEXT:    ; implicit-def: $vcc_hi
143; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
144; GFX1032-NEXT:    ; implicit-def: $vgpr1
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz BB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
150; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
151; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
152; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
153; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
155; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
156; GFX1032-NEXT:    buffer_gl0_inv
157; GFX1032-NEXT:    buffer_gl1_inv
158; GFX1032-NEXT:  BB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    s_nop 0
167; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
168; GFX1032-NEXT:    s_endpgm
169entry:
170  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
171  store i32 %old, i32 addrspace(1)* %out
172  ret void
173}
174
; Kernel-argument addend: atomicrmw add of %additive (an i32 kernel argument)
; to @local_var32 with acq_rel ordering; the value returned by the atomic is
; stored to %out.
;
; What the autogenerated check lines below pin down, for every run line:
;  * the same v_mbcnt_* / compare-with-0 / s_and_saveexec guard as in
;    @add_i32_constant, so only one lane issues the DS atomic;
;  * a single ds_add_rtn_u32 whose data operand is the loaded argument
;    multiplied by s_bcnt1(saved exec) using a scalar s_mul_i32;
;  * after exec is restored, v_mul_lo_u32 of the argument by the lane's mbcnt
;    value, added to the v_readfirstlane_b32 of the atomic's result, giving
;    the per-lane value that is stored.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing by hand.
175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
176;
177;
178; GFX7LESS-LABEL: add_i32_uniform:
179; GFX7LESS:       ; %bb.0: ; %entry
180; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
181; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
182; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
183; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
184; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
185; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
186; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
187; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
188; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
189; GFX7LESS-NEXT:  ; %bb.1:
190; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
191; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
193; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
194; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
195; GFX7LESS-NEXT:    s_mov_b32 m0, -1
196; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
197; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
198; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
199; GFX7LESS-NEXT:    buffer_wbinvl1
200; GFX7LESS-NEXT:  BB1_2:
201; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
202; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
203; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
205; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
206; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
207; GFX7LESS-NEXT:    s_mov_b32 s6, -1
208; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
209; GFX7LESS-NEXT:    s_endpgm
210;
211; GFX8-LABEL: add_i32_uniform:
212; GFX8:       ; %bb.0: ; %entry
213; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
214; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
215; GFX8-NEXT:    s_mov_b64 s[2:3], exec
216; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
217; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
218; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
219; GFX8-NEXT:    ; implicit-def: $vgpr1
220; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
221; GFX8-NEXT:    s_cbranch_execz BB1_2
222; GFX8-NEXT:  ; %bb.1:
223; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX8-NEXT:    s_mul_i32 s1, s0, s1
226; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
227; GFX8-NEXT:    v_mov_b32_e32 v2, s1
228; GFX8-NEXT:    s_mov_b32 m0, -1
229; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
230; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; GFX8-NEXT:    buffer_wbinvl1_vol
233; GFX8-NEXT:  BB1_2:
234; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
237; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
238; GFX8-NEXT:    s_mov_b32 s7, 0xf000
239; GFX8-NEXT:    s_mov_b32 s6, -1
240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
241; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
242; GFX8-NEXT:    s_endpgm
243;
244; GFX9-LABEL: add_i32_uniform:
245; GFX9:       ; %bb.0: ; %entry
246; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
247; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
248; GFX9-NEXT:    s_mov_b64 s[2:3], exec
249; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
250; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
252; GFX9-NEXT:    ; implicit-def: $vgpr1
253; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
254; GFX9-NEXT:    s_cbranch_execz BB1_2
255; GFX9-NEXT:  ; %bb.1:
256; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX9-NEXT:    s_mul_i32 s1, s0, s1
259; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
260; GFX9-NEXT:    v_mov_b32_e32 v2, s1
261; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
263; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    buffer_wbinvl1_vol
265; GFX9-NEXT:  BB1_2:
266; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
267; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
269; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
270; GFX9-NEXT:    s_mov_b32 s7, 0xf000
271; GFX9-NEXT:    s_mov_b32 s6, -1
272; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
273; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
274; GFX9-NEXT:    s_endpgm
275;
276; GFX1064-LABEL: add_i32_uniform:
277; GFX1064:       ; %bb.0: ; %entry
278; GFX1064-NEXT:    s_clause 0x1
279; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
280; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
281; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
282; GFX1064-NEXT:    ; implicit-def: $vgpr1
283; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
284; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
285; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
286; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
287; GFX1064-NEXT:    s_cbranch_execz BB1_2
288; GFX1064-NEXT:  ; %bb.1:
289; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
290; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
293; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
294; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
295; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
296; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
297; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX1064-NEXT:    buffer_gl0_inv
299; GFX1064-NEXT:    buffer_gl1_inv
300; GFX1064-NEXT:  BB1_2:
301; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
302; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
303; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
305; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
306; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
307; GFX1064-NEXT:    s_mov_b32 s6, -1
308; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
309; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
310; GFX1064-NEXT:    s_endpgm
311;
312; GFX1032-LABEL: add_i32_uniform:
313; GFX1032:       ; %bb.0: ; %entry
314; GFX1032-NEXT:    s_clause 0x1
315; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
316; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
317; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
318; GFX1032-NEXT:    ; implicit-def: $vcc_hi
319; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
320; GFX1032-NEXT:    ; implicit-def: $vgpr1
321; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
322; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
323; GFX1032-NEXT:    s_cbranch_execz BB1_2
324; GFX1032-NEXT:  ; %bb.1:
325; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
326; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
327; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
329; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
330; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
331; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
332; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
333; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
334; GFX1032-NEXT:    buffer_gl0_inv
335; GFX1032-NEXT:    buffer_gl1_inv
336; GFX1032-NEXT:  BB1_2:
337; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
338; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
339; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
341; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
342; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
343; GFX1032-NEXT:    s_mov_b32 s6, -1
344; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
345; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
346; GFX1032-NEXT:    s_endpgm
347entry:
348  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
349  store i32 %old, i32 addrspace(1)* %out
350  ret void
351}
352
; Per-work-item addend: atomicrmw add of the llvm.amdgcn.workitem.id.x result
; (%lane) to @local_var32 with acq_rel ordering; the value returned by the
; atomic is stored to %out.
;
; What the autogenerated check lines below pin down:
;  * In the GFX7LESS block there is no v_mbcnt_* / s_and_saveexec guard: the
;    ds_add_rtn_u32 takes v0 directly as its data operand and its result is
;    stored as-is.
;  * In the GFX8, GFX9 and both GFX10 blocks the addend is copied to v2, v2 is
;    set to 0 under the inverted exec mask (s_not / v_mov 0 / s_not), and a
;    chain of DPP adds (row_shr:1,2,4,8, then row_bcast:15/31 on GFX8/GFX9 or
;    v_permlanex16_b32 plus readlane/writelane fix-ups on GFX10) combines the
;    lanes. The value read with v_readlane_b32 from the last lane (63, or 31
;    in the +wavefrontsize32 run) feeds a single guarded ds_add_rtn_u32, and
;    each lane's stored result is v_readfirstlane_b32 of the atomic's return
;    plus the shifted scan value held in v1.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing by hand.
353define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
354;
355;
356; GFX7LESS-LABEL: add_i32_varying:
357; GFX7LESS:       ; %bb.0: ; %entry
358; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
359; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
360; GFX7LESS-NEXT:    s_mov_b32 m0, -1
361; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
362; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
363; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
364; GFX7LESS-NEXT:    buffer_wbinvl1
365; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
366; GFX7LESS-NEXT:    s_mov_b32 s2, -1
367; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
368; GFX7LESS-NEXT:    s_endpgm
369;
370; GFX8-LABEL: add_i32_varying:
371; GFX8:       ; %bb.0: ; %entry
372; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
373; GFX8-NEXT:    s_mov_b64 s[2:3], exec
374; GFX8-NEXT:    v_mov_b32_e32 v2, v0
375; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
376; GFX8-NEXT:    v_mov_b32_e32 v1, 0
377; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
378; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
379; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
380; GFX8-NEXT:    s_not_b64 exec, exec
381; GFX8-NEXT:    v_mov_b32_e32 v2, 0
382; GFX8-NEXT:    s_not_b64 exec, exec
383; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
384; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
385; GFX8-NEXT:    s_nop 1
386; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
387; GFX8-NEXT:    s_nop 1
388; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
389; GFX8-NEXT:    s_nop 1
390; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
391; GFX8-NEXT:    s_nop 1
392; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
393; GFX8-NEXT:    s_nop 1
394; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
395; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
396; GFX8-NEXT:    s_nop 0
397; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
398; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
399; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
400; GFX8-NEXT:    ; implicit-def: $vgpr0
401; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
402; GFX8-NEXT:    s_cbranch_execz BB2_2
403; GFX8-NEXT:  ; %bb.1:
404; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
405; GFX8-NEXT:    v_mov_b32_e32 v3, s2
406; GFX8-NEXT:    s_mov_b32 m0, -1
407; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
408; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
409; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
410; GFX8-NEXT:    buffer_wbinvl1_vol
411; GFX8-NEXT:  BB2_2:
412; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
413; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
414; GFX8-NEXT:    v_mov_b32_e32 v0, v1
415; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
416; GFX8-NEXT:    s_mov_b32 s3, 0xf000
417; GFX8-NEXT:    s_mov_b32 s2, -1
418; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
420; GFX8-NEXT:    s_endpgm
421;
422; GFX9-LABEL: add_i32_varying:
423; GFX9:       ; %bb.0: ; %entry
424; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
425; GFX9-NEXT:    s_mov_b64 s[2:3], exec
426; GFX9-NEXT:    v_mov_b32_e32 v2, v0
427; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
428; GFX9-NEXT:    v_mov_b32_e32 v1, 0
429; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
430; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
431; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
432; GFX9-NEXT:    s_not_b64 exec, exec
433; GFX9-NEXT:    v_mov_b32_e32 v2, 0
434; GFX9-NEXT:    s_not_b64 exec, exec
435; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
436; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
437; GFX9-NEXT:    s_nop 1
438; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
439; GFX9-NEXT:    s_nop 1
440; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
441; GFX9-NEXT:    s_nop 1
442; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
443; GFX9-NEXT:    s_nop 1
444; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
445; GFX9-NEXT:    s_nop 1
446; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
447; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
448; GFX9-NEXT:    s_nop 0
449; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
450; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
451; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
452; GFX9-NEXT:    ; implicit-def: $vgpr0
453; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
454; GFX9-NEXT:    s_cbranch_execz BB2_2
455; GFX9-NEXT:  ; %bb.1:
456; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
457; GFX9-NEXT:    v_mov_b32_e32 v3, s2
458; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
459; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
460; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
461; GFX9-NEXT:    buffer_wbinvl1_vol
462; GFX9-NEXT:  BB2_2:
463; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
464; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
465; GFX9-NEXT:    v_mov_b32_e32 v0, v1
466; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
467; GFX9-NEXT:    s_mov_b32 s3, 0xf000
468; GFX9-NEXT:    s_mov_b32 s2, -1
469; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
471; GFX9-NEXT:    s_endpgm
472;
473; GFX1064-LABEL: add_i32_varying:
474; GFX1064:       ; %bb.0: ; %entry
475; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
476; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
477; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
478; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
479; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
480; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
481; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
482; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
483; GFX1064-NEXT:    s_not_b64 exec, exec
484; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
485; GFX1064-NEXT:    s_not_b64 exec, exec
486; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
487; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
488; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
489; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
490; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
491; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
492; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
493; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
494; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
495; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
496; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
497; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
498; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
499; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
500; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
501; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
502; GFX1064-NEXT:    s_mov_b32 s2, -1
503; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
504; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
505; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
506; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
507; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
508; GFX1064-NEXT:    ; implicit-def: $vgpr0
509; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
510; GFX1064-NEXT:    s_cbranch_execz BB2_2
511; GFX1064-NEXT:  ; %bb.1:
512; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
513; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
514; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
515; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
516; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
517; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
518; GFX1064-NEXT:    buffer_gl0_inv
519; GFX1064-NEXT:    buffer_gl1_inv
520; GFX1064-NEXT:  BB2_2:
521; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
522; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
523; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
524; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
525; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
526; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
527; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
528; GFX1064-NEXT:    s_nop 0
529; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
530; GFX1064-NEXT:    s_endpgm
531;
532; GFX1032-LABEL: add_i32_varying:
533; GFX1032:       ; %bb.0: ; %entry
534; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
535; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
536; GFX1032-NEXT:    ; implicit-def: $vcc_hi
537; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
538; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
539; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
540; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
541; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
542; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
543; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
544; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
545; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
546; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
547; GFX1032-NEXT:    s_mov_b32 s2, -1
548; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
549; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
550; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
551; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
552; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
553; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
554; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
555; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
556; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
557; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
558; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
559; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
560; GFX1032-NEXT:    ; implicit-def: $vgpr0
561; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
562; GFX1032-NEXT:    s_cbranch_execz BB2_2
563; GFX1032-NEXT:  ; %bb.1:
564; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
565; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
566; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
567; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
568; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
569; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; GFX1032-NEXT:    buffer_gl0_inv
571; GFX1032-NEXT:    buffer_gl1_inv
572; GFX1032-NEXT:  BB2_2:
573; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
574; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
575; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
576; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
577; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
578; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
579; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX1032-NEXT:    s_nop 0
581; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
582; GFX1032-NEXT:    s_endpgm
583entry:
584  %lane = call i32 @llvm.amdgcn.workitem.id.x()
585  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
586  store i32 %old, i32 addrspace(1)* %out
587  ret void
588}
589
590define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
591;
592;
593; GFX7LESS-LABEL: add_i32_varying_gfx1032:
594; GFX7LESS:       ; %bb.0: ; %entry
595; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
596; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
597; GFX7LESS-NEXT:    s_mov_b32 m0, -1
598; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
599; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
600; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
601; GFX7LESS-NEXT:    buffer_wbinvl1
602; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
603; GFX7LESS-NEXT:    s_mov_b32 s2, -1
604; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
605; GFX7LESS-NEXT:    s_endpgm
606;
607; GFX8-LABEL: add_i32_varying_gfx1032:
608; GFX8:       ; %bb.0: ; %entry
609; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
610; GFX8-NEXT:    s_mov_b64 s[2:3], exec
611; GFX8-NEXT:    v_mov_b32_e32 v2, v0
612; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
613; GFX8-NEXT:    v_mov_b32_e32 v1, 0
614; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
615; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
616; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
617; GFX8-NEXT:    s_not_b64 exec, exec
618; GFX8-NEXT:    v_mov_b32_e32 v2, 0
619; GFX8-NEXT:    s_not_b64 exec, exec
620; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
621; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
622; GFX8-NEXT:    s_nop 1
623; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
624; GFX8-NEXT:    s_nop 1
625; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
626; GFX8-NEXT:    s_nop 1
627; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
628; GFX8-NEXT:    s_nop 1
629; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
630; GFX8-NEXT:    s_nop 1
631; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
632; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
633; GFX8-NEXT:    s_nop 0
634; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
635; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
636; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
637; GFX8-NEXT:    ; implicit-def: $vgpr0
638; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
639; GFX8-NEXT:    s_cbranch_execz BB3_2
640; GFX8-NEXT:  ; %bb.1:
641; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
642; GFX8-NEXT:    v_mov_b32_e32 v3, s2
643; GFX8-NEXT:    s_mov_b32 m0, -1
644; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
645; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
646; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
647; GFX8-NEXT:    buffer_wbinvl1_vol
648; GFX8-NEXT:  BB3_2:
649; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
650; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
651; GFX8-NEXT:    v_mov_b32_e32 v0, v1
652; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
653; GFX8-NEXT:    s_mov_b32 s3, 0xf000
654; GFX8-NEXT:    s_mov_b32 s2, -1
655; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
657; GFX8-NEXT:    s_endpgm
658;
659; GFX9-LABEL: add_i32_varying_gfx1032:
660; GFX9:       ; %bb.0: ; %entry
661; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
662; GFX9-NEXT:    s_mov_b64 s[2:3], exec
663; GFX9-NEXT:    v_mov_b32_e32 v2, v0
664; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
665; GFX9-NEXT:    v_mov_b32_e32 v1, 0
666; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
667; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
668; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
669; GFX9-NEXT:    s_not_b64 exec, exec
670; GFX9-NEXT:    v_mov_b32_e32 v2, 0
671; GFX9-NEXT:    s_not_b64 exec, exec
672; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
673; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
674; GFX9-NEXT:    s_nop 1
675; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
676; GFX9-NEXT:    s_nop 1
677; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
678; GFX9-NEXT:    s_nop 1
679; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
680; GFX9-NEXT:    s_nop 1
681; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
682; GFX9-NEXT:    s_nop 1
683; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
684; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
685; GFX9-NEXT:    s_nop 0
686; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
687; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
688; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
689; GFX9-NEXT:    ; implicit-def: $vgpr0
690; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
691; GFX9-NEXT:    s_cbranch_execz BB3_2
692; GFX9-NEXT:  ; %bb.1:
693; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
694; GFX9-NEXT:    v_mov_b32_e32 v3, s2
695; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
696; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
697; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
698; GFX9-NEXT:    buffer_wbinvl1_vol
699; GFX9-NEXT:  BB3_2:
700; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
701; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
702; GFX9-NEXT:    v_mov_b32_e32 v0, v1
703; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
704; GFX9-NEXT:    s_mov_b32 s3, 0xf000
705; GFX9-NEXT:    s_mov_b32 s2, -1
706; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
708; GFX9-NEXT:    s_endpgm
709;
710; GFX1064-LABEL: add_i32_varying_gfx1032:
711; GFX1064:       ; %bb.0: ; %entry
712; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
713; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
714; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
715; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
716; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
717; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
718; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
719; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
720; GFX1064-NEXT:    s_not_b64 exec, exec
721; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
722; GFX1064-NEXT:    s_not_b64 exec, exec
723; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
724; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
725; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
726; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
727; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
728; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
729; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
730; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
731; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
732; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
733; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
734; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
735; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
736; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
737; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
738; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
739; GFX1064-NEXT:    s_mov_b32 s2, -1
740; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
741; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
742; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
743; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
744; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
745; GFX1064-NEXT:    ; implicit-def: $vgpr0
746; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
747; GFX1064-NEXT:    s_cbranch_execz BB3_2
748; GFX1064-NEXT:  ; %bb.1:
749; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
750; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
751; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
752; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
753; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
754; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
755; GFX1064-NEXT:    buffer_gl0_inv
756; GFX1064-NEXT:    buffer_gl1_inv
757; GFX1064-NEXT:  BB3_2:
758; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
759; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
760; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
761; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
762; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
763; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
764; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX1064-NEXT:    s_nop 0
766; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
767; GFX1064-NEXT:    s_endpgm
768;
769; GFX1032-LABEL: add_i32_varying_gfx1032:
770; GFX1032:       ; %bb.0: ; %entry
771; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
772; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
773; GFX1032-NEXT:    ; implicit-def: $vcc_hi
774; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
775; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
776; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
777; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
778; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
779; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
780; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
781; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
782; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
783; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
784; GFX1032-NEXT:    s_mov_b32 s2, -1
785; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
786; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
787; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
788; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
789; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
790; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
791; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
792; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
793; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
794; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
795; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
796; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
797; GFX1032-NEXT:    ; implicit-def: $vgpr0
798; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
799; GFX1032-NEXT:    s_cbranch_execz BB3_2
800; GFX1032-NEXT:  ; %bb.1:
801; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
802; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
803; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
804; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
805; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
806; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
807; GFX1032-NEXT:    buffer_gl0_inv
808; GFX1032-NEXT:    buffer_gl1_inv
809; GFX1032-NEXT:  BB3_2:
810; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
811; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
812; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
813; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
814; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
815; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
816; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX1032-NEXT:    s_nop 0
818; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
819; GFX1032-NEXT:    s_endpgm
820entry:
821  %lane = call i32 @llvm.amdgcn.workitem.id.x()
822  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
823  store i32 %old, i32 addrspace(1)* %out
824  ret void
825}
826
827define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
828;
829;
; Kernel under test: each work-item passes its workitem.id.x value as the
; operand of an acq_rel `atomicrmw add` on @local_var32 (addrspace(3)) and
; stores the value the atomic returned to %out.
;
; What the autogenerated assertions below pin down, per llc invocation:
;  - GFX7LESS output: the ds_add_rtn_u32 is issued directly on v0, with no
;    mbcnt/DPP sequence around it.
;  - GFX8 and GFX9 output: a chain of DPP adds (row_shr 1/2/4/8, then
;    row_bcast 15/31), a readlane of lane 63, and one ds_add_rtn_u32 guarded
;    by s_and_saveexec on "mbcnt result == 0".
;  - GFX1064 output: same overall shape, but v_permlanex16 plus
;    readlane/writelane take the place of the row_bcast and wave_shr steps.
;  - GFX1032 output: 32-bit exec_lo/vcc_lo forms and only v_mbcnt_lo.
;
; The assertion lines come from utils/update_llc_test_checks.py (see the note
; on the first line of the file); regenerate them rather than editing by hand.
830; GFX7LESS-LABEL: add_i32_varying_gfx1064:
831; GFX7LESS:       ; %bb.0: ; %entry
832; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
833; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
834; GFX7LESS-NEXT:    s_mov_b32 m0, -1
835; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
836; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
837; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
838; GFX7LESS-NEXT:    buffer_wbinvl1
839; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
840; GFX7LESS-NEXT:    s_mov_b32 s2, -1
841; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
842; GFX7LESS-NEXT:    s_endpgm
843;
844; GFX8-LABEL: add_i32_varying_gfx1064:
845; GFX8:       ; %bb.0: ; %entry
846; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
847; GFX8-NEXT:    s_mov_b64 s[2:3], exec
848; GFX8-NEXT:    v_mov_b32_e32 v2, v0
849; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
850; GFX8-NEXT:    v_mov_b32_e32 v1, 0
851; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
852; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
853; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
854; GFX8-NEXT:    s_not_b64 exec, exec
855; GFX8-NEXT:    v_mov_b32_e32 v2, 0
856; GFX8-NEXT:    s_not_b64 exec, exec
857; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
858; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
859; GFX8-NEXT:    s_nop 1
860; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
861; GFX8-NEXT:    s_nop 1
862; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
863; GFX8-NEXT:    s_nop 1
864; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
865; GFX8-NEXT:    s_nop 1
866; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
869; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
870; GFX8-NEXT:    s_nop 0
871; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
872; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
873; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
874; GFX8-NEXT:    ; implicit-def: $vgpr0
875; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
876; GFX8-NEXT:    s_cbranch_execz BB4_2
877; GFX8-NEXT:  ; %bb.1:
878; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
879; GFX8-NEXT:    v_mov_b32_e32 v3, s2
880; GFX8-NEXT:    s_mov_b32 m0, -1
881; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
882; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
883; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
884; GFX8-NEXT:    buffer_wbinvl1_vol
885; GFX8-NEXT:  BB4_2:
886; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
887; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
888; GFX8-NEXT:    v_mov_b32_e32 v0, v1
889; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
890; GFX8-NEXT:    s_mov_b32 s3, 0xf000
891; GFX8-NEXT:    s_mov_b32 s2, -1
892; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
894; GFX8-NEXT:    s_endpgm
895;
896; GFX9-LABEL: add_i32_varying_gfx1064:
897; GFX9:       ; %bb.0: ; %entry
898; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
899; GFX9-NEXT:    s_mov_b64 s[2:3], exec
900; GFX9-NEXT:    v_mov_b32_e32 v2, v0
901; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
902; GFX9-NEXT:    v_mov_b32_e32 v1, 0
903; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
904; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
905; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
906; GFX9-NEXT:    s_not_b64 exec, exec
907; GFX9-NEXT:    v_mov_b32_e32 v2, 0
908; GFX9-NEXT:    s_not_b64 exec, exec
909; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
910; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
911; GFX9-NEXT:    s_nop 1
912; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
913; GFX9-NEXT:    s_nop 1
914; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
915; GFX9-NEXT:    s_nop 1
916; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
917; GFX9-NEXT:    s_nop 1
918; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
919; GFX9-NEXT:    s_nop 1
920; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
921; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
922; GFX9-NEXT:    s_nop 0
923; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
924; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
925; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
926; GFX9-NEXT:    ; implicit-def: $vgpr0
927; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
928; GFX9-NEXT:    s_cbranch_execz BB4_2
929; GFX9-NEXT:  ; %bb.1:
930; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
931; GFX9-NEXT:    v_mov_b32_e32 v3, s2
932; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
933; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
934; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
935; GFX9-NEXT:    buffer_wbinvl1_vol
936; GFX9-NEXT:  BB4_2:
937; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
938; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
939; GFX9-NEXT:    v_mov_b32_e32 v0, v1
940; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
941; GFX9-NEXT:    s_mov_b32 s3, 0xf000
942; GFX9-NEXT:    s_mov_b32 s2, -1
943; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
945; GFX9-NEXT:    s_endpgm
946;
947; GFX1064-LABEL: add_i32_varying_gfx1064:
948; GFX1064:       ; %bb.0: ; %entry
949; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
950; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
951; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
952; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
953; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
954; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
955; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
956; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
957; GFX1064-NEXT:    s_not_b64 exec, exec
958; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
959; GFX1064-NEXT:    s_not_b64 exec, exec
960; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
961; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
962; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
963; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
964; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
965; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
966; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
967; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
968; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
969; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
970; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
971; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
972; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
973; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
974; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
975; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
976; GFX1064-NEXT:    s_mov_b32 s2, -1
977; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
978; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
979; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
980; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
981; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
982; GFX1064-NEXT:    ; implicit-def: $vgpr0
983; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
984; GFX1064-NEXT:    s_cbranch_execz BB4_2
985; GFX1064-NEXT:  ; %bb.1:
986; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
987; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
988; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
989; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
990; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
991; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
992; GFX1064-NEXT:    buffer_gl0_inv
993; GFX1064-NEXT:    buffer_gl1_inv
994; GFX1064-NEXT:  BB4_2:
995; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
996; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
997; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
998; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
999; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1000; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1001; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX1064-NEXT:    s_nop 0
1003; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1004; GFX1064-NEXT:    s_endpgm
1005;
1006; GFX1032-LABEL: add_i32_varying_gfx1064:
1007; GFX1032:       ; %bb.0: ; %entry
1008; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1009; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1010; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1011; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1012; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
1013; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1014; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
1015; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1016; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1017; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1018; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1019; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1020; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1021; GFX1032-NEXT:    s_mov_b32 s2, -1
1022; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1023; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1024; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1025; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
1026; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
1027; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1028; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
1029; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
1030; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
1031; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
1032; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1033; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1034; GFX1032-NEXT:    ; implicit-def: $vgpr0
1035; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1036; GFX1032-NEXT:    s_cbranch_execz BB4_2
1037; GFX1032-NEXT:  ; %bb.1:
1038; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1039; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
1040; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1041; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1042; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
1043; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1044; GFX1032-NEXT:    buffer_gl0_inv
1045; GFX1032-NEXT:    buffer_gl1_inv
1046; GFX1032-NEXT:  BB4_2:
1047; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1048; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1049; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1050; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1051; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1052; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1053; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX1032-NEXT:    s_nop 0
1055; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1056; GFX1032-NEXT:    s_endpgm
1057entry:
1058  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; The added value is the per-work-item id, so it is not uniform across lanes;
; this is the "varying" input that the function name refers to.
1059  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1060  store i32 %old, i32 addrspace(1)* %out
1061  ret void
1062}
1063
1064define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1065;
1066;
; Kernel under test: an acq_rel `atomicrmw add` of the constant 5 to the i64
; @local_var64 (addrspace(3)); the value the atomic returned is stored to %out.
;
; What the autogenerated assertions below pin down (all five llc invocations
; share this shape):
;  - v_mbcnt over the saved exec mask, compared against 0, guards the atomic
;    through s_and_saveexec, so the single ds_add_rtn_u64 sits in the
;    conditionally executed block %bb.1.
;  - Inside that block, s_bcnt1 of the saved exec mask is multiplied by 5
;    (v_mul_u32_u24 / v_mul_hi_u32_u24) to form the 64-bit operand.
;  - After the block, v_readfirstlane reads the returned value and every lane
;    adds 5 * its mbcnt result to it (v_mad_u64_u32 on GFX8 and newer;
;    separate mul, add and addc on GFX7LESS).
;  - The GFX1032 output uses the 32-bit exec_lo forms and s_bcnt1_i32_b32.
;
; The assertion lines come from utils/update_llc_test_checks.py (see the note
; on the first line of the file); regenerate them rather than editing by hand.
1067; GFX7LESS-LABEL: add_i64_constant:
1068; GFX7LESS:       ; %bb.0: ; %entry
1069; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1070; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1071; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1072; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1073; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1074; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1075; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1076; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1077; GFX7LESS-NEXT:  ; %bb.1:
1078; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1079; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1080; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1081; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1082; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1083; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1084; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1085; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1086; GFX7LESS-NEXT:    buffer_wbinvl1
1087; GFX7LESS-NEXT:  BB5_2:
1088; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1089; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1090; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1091; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1092; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1093; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1094; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1095; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1096; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1097; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1098; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1100; GFX7LESS-NEXT:    s_endpgm
1101;
1102; GFX8-LABEL: add_i64_constant:
1103; GFX8:       ; %bb.0: ; %entry
1104; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1105; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1106; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1107; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1108; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1109; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1110; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1111; GFX8-NEXT:    s_cbranch_execz BB5_2
1112; GFX8-NEXT:  ; %bb.1:
1113; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1114; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1115; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1116; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1117; GFX8-NEXT:    s_mov_b32 m0, -1
1118; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1119; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1120; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1121; GFX8-NEXT:    buffer_wbinvl1_vol
1122; GFX8-NEXT:  BB5_2:
1123; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1124; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1125; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1126; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1127; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1128; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1129; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1130; GFX8-NEXT:    s_mov_b32 s2, -1
1131; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX8-NEXT:    s_nop 1
1133; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1134; GFX8-NEXT:    s_endpgm
1135;
1136; GFX9-LABEL: add_i64_constant:
1137; GFX9:       ; %bb.0: ; %entry
1138; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1139; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1140; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1141; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1142; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1143; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1144; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1145; GFX9-NEXT:    s_cbranch_execz BB5_2
1146; GFX9-NEXT:  ; %bb.1:
1147; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1148; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1149; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1150; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1151; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1152; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1153; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1154; GFX9-NEXT:    buffer_wbinvl1_vol
1155; GFX9-NEXT:  BB5_2:
1156; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1157; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1158; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1159; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1160; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1161; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1162; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1163; GFX9-NEXT:    s_mov_b32 s2, -1
1164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX9-NEXT:    s_nop 1
1166; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1167; GFX9-NEXT:    s_endpgm
1168;
1169; GFX1064-LABEL: add_i64_constant:
1170; GFX1064:       ; %bb.0: ; %entry
1171; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1172; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1173; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1174; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1175; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1176; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1177; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1178; GFX1064-NEXT:    s_cbranch_execz BB5_2
1179; GFX1064-NEXT:  ; %bb.1:
1180; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1181; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1182; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1183; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1184; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1185; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1186; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1187; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1188; GFX1064-NEXT:    buffer_gl0_inv
1189; GFX1064-NEXT:    buffer_gl1_inv
1190; GFX1064-NEXT:  BB5_2:
1191; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1192; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1193; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1194; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1195; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1196; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1197; GFX1064-NEXT:    s_mov_b32 s2, -1
1198; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX1064-NEXT:    s_nop 1
1200; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1201; GFX1064-NEXT:    s_endpgm
1202;
1203; GFX1032-LABEL: add_i64_constant:
1204; GFX1032:       ; %bb.0: ; %entry
1205; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1206; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1207; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1208; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1209; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1210; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1211; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1212; GFX1032-NEXT:    s_cbranch_execz BB5_2
1213; GFX1032-NEXT:  ; %bb.1:
1214; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1215; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1216; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1217; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1218; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1219; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1220; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1221; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1222; GFX1032-NEXT:    buffer_gl0_inv
1223; GFX1032-NEXT:    buffer_gl1_inv
1224; GFX1032-NEXT:  BB5_2:
1225; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1226; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1227; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1228; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1229; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1230; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1231; GFX1032-NEXT:    s_mov_b32 s2, -1
1232; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX1032-NEXT:    s_nop 1
1234; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1235; GFX1032-NEXT:    s_endpgm
1236entry:
1237  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1238  store i64 %old, i64 addrspace(1)* %out
1239  ret void
1240}
1241
1242define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1243;
1244;
1245; GFX7LESS-LABEL: add_i64_uniform:
1246; GFX7LESS:       ; %bb.0: ; %entry
1247; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1248; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1249; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1250; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1251; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1252; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1253; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1254; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1255; GFX7LESS-NEXT:  ; %bb.1:
1256; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1257; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1258; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1260; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1261; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1262; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1263; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1264; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1265; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1266; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1267; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1268; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1269; GFX7LESS-NEXT:    buffer_wbinvl1
1270; GFX7LESS-NEXT:  BB6_2:
1271; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1272; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1273; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1274; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1276; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1277; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1278; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1279; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1280; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1281; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1282; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1283; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1284; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1285; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1286; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1287; GFX7LESS-NEXT:    s_endpgm
1288;
1289; GFX8-LABEL: add_i64_uniform:
1290; GFX8:       ; %bb.0: ; %entry
1291; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1292; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1293; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1294; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1295; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1296; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1297; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1298; GFX8-NEXT:    s_cbranch_execz BB6_2
1299; GFX8-NEXT:  ; %bb.1:
1300; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1301; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1302; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1304; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1305; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1306; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1307; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1308; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1309; GFX8-NEXT:    s_mov_b32 m0, -1
1310; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1311; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1312; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1313; GFX8-NEXT:    buffer_wbinvl1_vol
1314; GFX8-NEXT:  BB6_2:
1315; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1316; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX8-NEXT:    s_mov_b32 s4, s0
1318; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1319; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1320; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1321; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1322; GFX8-NEXT:    s_mov_b32 s5, s1
1323; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1324; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1325; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1326; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1327; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1328; GFX8-NEXT:    s_mov_b32 s6, -1
1329; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1330; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1331; GFX8-NEXT:    s_endpgm
1332;
1333; GFX9-LABEL: add_i64_uniform:
1334; GFX9:       ; %bb.0: ; %entry
1335; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1336; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1337; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1338; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1339; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1340; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1341; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1342; GFX9-NEXT:    s_cbranch_execz BB6_2
1343; GFX9-NEXT:  ; %bb.1:
1344; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1345; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1346; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1347; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1348; GFX9-NEXT:    s_add_i32 s8, s8, s7
1349; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1350; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1351; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1352; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1353; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1354; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1355; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1356; GFX9-NEXT:    buffer_wbinvl1_vol
1357; GFX9-NEXT:  BB6_2:
1358; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1361; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1362; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1363; GFX9-NEXT:    s_mov_b32 s4, s0
1364; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1365; GFX9-NEXT:    s_mov_b32 s5, s1
1366; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1367; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1368; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1369; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1370; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1371; GFX9-NEXT:    s_mov_b32 s6, -1
1372; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1373; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1374; GFX9-NEXT:    s_endpgm
1375;
1376; GFX1064-LABEL: add_i64_uniform:
1377; GFX1064:       ; %bb.0: ; %entry
1378; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1379; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1380; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1381; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1382; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1383; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1384; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1385; GFX1064-NEXT:    s_cbranch_execz BB6_2
1386; GFX1064-NEXT:  ; %bb.1:
1387; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1388; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1389; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1391; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1392; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1393; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1394; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1395; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1396; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1397; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1398; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1399; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1400; GFX1064-NEXT:    buffer_gl0_inv
1401; GFX1064-NEXT:    buffer_gl1_inv
1402; GFX1064-NEXT:  BB6_2:
1403; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1404; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1405; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1406; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1407; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1408; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1409; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1410; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1411; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1412; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1413; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
1414; GFX1064-NEXT:    s_mov_b32 s2, -1
1415; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1416; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1417; GFX1064-NEXT:    s_endpgm
1418;
1419; GFX1032-LABEL: add_i64_uniform:
1420; GFX1032:       ; %bb.0: ; %entry
1421; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1422; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1423; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1424; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1425; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1426; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1427; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1428; GFX1032-NEXT:    s_cbranch_execz BB6_2
1429; GFX1032-NEXT:  ; %bb.1:
1430; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1431; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1432; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1434; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1435; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1436; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1437; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1438; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1439; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1440; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1441; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1442; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1443; GFX1032-NEXT:    buffer_gl0_inv
1444; GFX1032-NEXT:    buffer_gl1_inv
1445; GFX1032-NEXT:  BB6_2:
1446; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1447; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1448; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1449; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1450; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1451; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1452; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1453; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1454; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1455; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1456; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
1457; GFX1032-NEXT:    s_mov_b32 s2, -1
1458; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1459; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1460; GFX1032-NEXT:    s_endpgm
1461entry:
1462  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1463  store i64 %old, i64 addrspace(1)* %out
1464  ret void
1465}
1466
; add_i64_varying: every lane adds its own workitem id (zero-extended to i64)
; to @local_var64 with acq_rel ordering and stores the returned old value
; to %out.
; In the expected output below, each checked target issues one plain
; ds_add_rtn_u64 and none of the mbcnt / bcnt / readfirstlane sequence that
; the constant and uniform variants in this file show.
; NOTE(review): none of the RUN lines at the top of this file lists a "GCN"
; check prefix, so the three GCN-NOT directives below appear to be inert.
; Confirm, then either enable that prefix or drop the directives.
1467; GCN-NOT: v_mbcnt_lo_u32_b32
1468; GCN-NOT: v_mbcnt_hi_u32_b32
1469; GCN-NOT: s_bcnt1_i32_b64
1470define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1471;
1472;
1473; GFX7LESS-LABEL: add_i64_varying:
1474; GFX7LESS:       ; %bb.0: ; %entry
1475; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1476; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1477; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1478; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1479; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1480; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1481; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1482; GFX7LESS-NEXT:    buffer_wbinvl1
1483; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1484; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1485; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1486; GFX7LESS-NEXT:    s_endpgm
1487;
1488; GFX8-LABEL: add_i64_varying:
1489; GFX8:       ; %bb.0: ; %entry
1490; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1491; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1492; GFX8-NEXT:    s_mov_b32 m0, -1
1493; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1494; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1495; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1496; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1497; GFX8-NEXT:    buffer_wbinvl1_vol
1498; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1499; GFX8-NEXT:    s_mov_b32 s2, -1
1500; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1501; GFX8-NEXT:    s_endpgm
1502;
1503; GFX9-LABEL: add_i64_varying:
1504; GFX9:       ; %bb.0: ; %entry
1505; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1506; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1507; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1508; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1509; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1510; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1511; GFX9-NEXT:    buffer_wbinvl1_vol
1512; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1513; GFX9-NEXT:    s_mov_b32 s2, -1
1514; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1515; GFX9-NEXT:    s_endpgm
1516;
1517; GFX1064-LABEL: add_i64_varying:
1518; GFX1064:       ; %bb.0: ; %entry
1519; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1520; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1521; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1522; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1523; GFX1064-NEXT:    s_mov_b32 s2, -1
1524; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1525; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1526; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1527; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1528; GFX1064-NEXT:    buffer_gl0_inv
1529; GFX1064-NEXT:    buffer_gl1_inv
1530; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1531; GFX1064-NEXT:    s_endpgm
1532;
1533; GFX1032-LABEL: add_i64_varying:
1534; GFX1032:       ; %bb.0: ; %entry
1535; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1536; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1537; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1538; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1539; GFX1032-NEXT:    s_mov_b32 s2, -1
1540; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1541; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1542; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1543; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1544; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1545; GFX1032-NEXT:    buffer_gl0_inv
1546; GFX1032-NEXT:    buffer_gl1_inv
1547; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1548; GFX1032-NEXT:    s_endpgm
1549entry:
  ; The per-lane operand is the workitem id widened to 64 bits; the
  ; "v_mov_b32_e32 v1, 0" in the expected output presumably supplies the
  ; zero high half of that value (TODO confirm).
1550  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1551  %zext = zext i32 %lane to i64
1552  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1553  store i64 %old, i64 addrspace(1)* %out
1554  ret void
1555}
1556
; sub_i32_constant: atomically subtracts the constant 5 from @local_var32
; with acq_rel ordering and stores the returned old value to %out.
; Shape of the expected output for every checked target:
;   * v_mbcnt_* is applied to a saved copy of exec, and only lanes whose
;     result compares equal to 0 enter %bb.1;
;   * %bb.1 multiplies the bit count of that saved exec mask by 5 and issues
;     a single ds_sub_rtn_u32 with the product;
;   * after exec is restored, the DS result is read with
;     v_readfirstlane_b32 and each lane subtracts 5 * (its mbcnt value).
; The wave32 run uses only v_mbcnt_lo and 32-bit exec operations.
1557define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1558;
1559;
1560; GFX7LESS-LABEL: sub_i32_constant:
1561; GFX7LESS:       ; %bb.0: ; %entry
1562; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1563; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1564; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1565; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1566; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1567; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1568; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1569; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1570; GFX7LESS-NEXT:  ; %bb.1:
1571; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1572; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1573; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
1574; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1575; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1576; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1577; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1578; GFX7LESS-NEXT:    buffer_wbinvl1
1579; GFX7LESS-NEXT:  BB8_2:
1580; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1581; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1582; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1583; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1584; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1585; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1586; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1587; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1588; GFX7LESS-NEXT:    s_endpgm
1589;
1590; GFX8-LABEL: sub_i32_constant:
1591; GFX8:       ; %bb.0: ; %entry
1592; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1593; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1594; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1595; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1596; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1597; GFX8-NEXT:    ; implicit-def: $vgpr1
1598; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1599; GFX8-NEXT:    s_cbranch_execz BB8_2
1600; GFX8-NEXT:  ; %bb.1:
1601; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1602; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1603; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1604; GFX8-NEXT:    s_mov_b32 m0, -1
1605; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1606; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1607; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1608; GFX8-NEXT:    buffer_wbinvl1_vol
1609; GFX8-NEXT:  BB8_2:
1610; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1611; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1612; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1613; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1614; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1615; GFX8-NEXT:    s_mov_b32 s2, -1
1616; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1617; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1618; GFX8-NEXT:    s_endpgm
1619;
1620; GFX9-LABEL: sub_i32_constant:
1621; GFX9:       ; %bb.0: ; %entry
1622; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1623; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1624; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1625; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1626; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1627; GFX9-NEXT:    ; implicit-def: $vgpr1
1628; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1629; GFX9-NEXT:    s_cbranch_execz BB8_2
1630; GFX9-NEXT:  ; %bb.1:
1631; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1632; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1633; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1634; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1635; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1636; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1637; GFX9-NEXT:    buffer_wbinvl1_vol
1638; GFX9-NEXT:  BB8_2:
1639; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1640; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1641; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1642; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1643; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1644; GFX9-NEXT:    s_mov_b32 s2, -1
1645; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1647; GFX9-NEXT:    s_endpgm
1648;
1649; GFX1064-LABEL: sub_i32_constant:
1650; GFX1064:       ; %bb.0: ; %entry
1651; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1652; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1653; GFX1064-NEXT:    ; implicit-def: $vgpr1
1654; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1655; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1656; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1657; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1658; GFX1064-NEXT:    s_cbranch_execz BB8_2
1659; GFX1064-NEXT:  ; %bb.1:
1660; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1661; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1662; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1663; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1664; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1665; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1666; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1667; GFX1064-NEXT:    buffer_gl0_inv
1668; GFX1064-NEXT:    buffer_gl1_inv
1669; GFX1064-NEXT:  BB8_2:
1670; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1671; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1672; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1673; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1674; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1675; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1676; GFX1064-NEXT:    s_mov_b32 s2, -1
1677; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1678; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1679; GFX1064-NEXT:    s_endpgm
1680;
1681; GFX1032-LABEL: sub_i32_constant:
1682; GFX1032:       ; %bb.0: ; %entry
1683; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1684; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1685; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1686; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1687; GFX1032-NEXT:    ; implicit-def: $vgpr1
1688; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1689; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1690; GFX1032-NEXT:    s_cbranch_execz BB8_2
1691; GFX1032-NEXT:  ; %bb.1:
1692; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1693; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1694; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1695; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1696; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1697; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1698; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1699; GFX1032-NEXT:    buffer_gl0_inv
1700; GFX1032-NEXT:    buffer_gl1_inv
1701; GFX1032-NEXT:  BB8_2:
1702; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1703; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1704; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1705; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1706; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1707; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1708; GFX1032-NEXT:    s_mov_b32 s2, -1
1709; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1711; GFX1032-NEXT:    s_endpgm
1712entry:
  ; The operand is the literal 5, which shows up as the immediate of the
  ; v_mul_u32_u24 instructions in the expected output above.
1713  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1714  store i32 %old, i32 addrspace(1)* %out
1715  ret void
1716}
1717
; sub_i32_uniform: atomically subtracts the kernel argument %subitive from
; @local_var32 with acq_rel ordering and stores the returned old value to
; %out.
; The expected output has the same structure as the constant case (mbcnt on
; a saved exec mask, one ds_sub_rtn_u32 issued by the lanes whose mbcnt
; result is 0, then v_readfirstlane_b32 plus a per-lane subtract), except
; that the operand is loaded with s_load_dword and the products are formed
; with s_mul_i32 (operand * bit count of exec) and v_mul_lo_u32
; (operand * mbcnt value) instead of a multiply by an immediate.
1718define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1719;
1720;
1721; GFX7LESS-LABEL: sub_i32_uniform:
1722; GFX7LESS:       ; %bb.0: ; %entry
1723; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1724; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1725; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
1726; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1727; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1728; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1729; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1730; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1731; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1732; GFX7LESS-NEXT:  ; %bb.1:
1733; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1734; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1735; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
1736; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1737; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1738; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1739; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1740; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1741; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1742; GFX7LESS-NEXT:    buffer_wbinvl1
1743; GFX7LESS-NEXT:  BB9_2:
1744; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1745; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1746; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1747; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1748; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1749; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1750; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1751; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1752; GFX7LESS-NEXT:    s_endpgm
1753;
1754; GFX8-LABEL: sub_i32_uniform:
1755; GFX8:       ; %bb.0: ; %entry
1756; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1757; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1758; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1759; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1760; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1761; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1762; GFX8-NEXT:    ; implicit-def: $vgpr1
1763; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1764; GFX8-NEXT:    s_cbranch_execz BB9_2
1765; GFX8-NEXT:  ; %bb.1:
1766; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1767; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1769; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1770; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1771; GFX8-NEXT:    s_mov_b32 m0, -1
1772; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1773; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1774; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1775; GFX8-NEXT:    buffer_wbinvl1_vol
1776; GFX8-NEXT:  BB9_2:
1777; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1778; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1779; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1780; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1781; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1782; GFX8-NEXT:    s_mov_b32 s6, -1
1783; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1784; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1785; GFX8-NEXT:    s_endpgm
1786;
1787; GFX9-LABEL: sub_i32_uniform:
1788; GFX9:       ; %bb.0: ; %entry
1789; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1790; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
1791; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1792; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1793; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1794; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1795; GFX9-NEXT:    ; implicit-def: $vgpr1
1796; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1797; GFX9-NEXT:    s_cbranch_execz BB9_2
1798; GFX9-NEXT:  ; %bb.1:
1799; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1800; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1801; GFX9-NEXT:    s_mul_i32 s1, s0, s1
1802; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1803; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1804; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1805; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1806; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1807; GFX9-NEXT:    buffer_wbinvl1_vol
1808; GFX9-NEXT:  BB9_2:
1809; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1810; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1811; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
1812; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1813; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1814; GFX9-NEXT:    s_mov_b32 s6, -1
1815; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1816; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1817; GFX9-NEXT:    s_endpgm
1818;
1819; GFX1064-LABEL: sub_i32_uniform:
1820; GFX1064:       ; %bb.0: ; %entry
1821; GFX1064-NEXT:    s_clause 0x1
1822; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1823; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
1824; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1825; GFX1064-NEXT:    ; implicit-def: $vgpr1
1826; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1827; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1828; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1829; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1830; GFX1064-NEXT:    s_cbranch_execz BB9_2
1831; GFX1064-NEXT:  ; %bb.1:
1832; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1833; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1834; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
1836; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
1837; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1838; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1839; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1840; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1841; GFX1064-NEXT:    buffer_gl0_inv
1842; GFX1064-NEXT:    buffer_gl1_inv
1843; GFX1064-NEXT:  BB9_2:
1844; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1845; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
1846; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1847; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
1848; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1849; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1850; GFX1064-NEXT:    s_mov_b32 s6, -1
1851; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1852; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1853; GFX1064-NEXT:    s_endpgm
1854;
1855; GFX1032-LABEL: sub_i32_uniform:
1856; GFX1032:       ; %bb.0: ; %entry
1857; GFX1032-NEXT:    s_clause 0x1
1858; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1859; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
1860; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1861; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1862; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1863; GFX1032-NEXT:    ; implicit-def: $vgpr1
1864; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1865; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1866; GFX1032-NEXT:    s_cbranch_execz BB9_2
1867; GFX1032-NEXT:  ; %bb.1:
1868; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1869; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1870; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
1872; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
1873; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1874; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1875; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1876; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1877; GFX1032-NEXT:    buffer_gl0_inv
1878; GFX1032-NEXT:    buffer_gl1_inv
1879; GFX1032-NEXT:  BB9_2:
1880; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1881; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1882; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1883; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
1884; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1885; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1886; GFX1032-NEXT:    s_mov_b32 s6, -1
1887; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1888; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1889; GFX1032-NEXT:    s_endpgm
1890entry:
  ; %subitive is a kernel argument; the expected output above keeps it in a
  ; scalar register loaded by s_load_dword.
1891  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1892  store i32 %old, i32 addrspace(1)* %out
1893  ret void
1894}
1895
1896define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1897;
1898;
1899; GFX7LESS-LABEL: sub_i32_varying:
1900; GFX7LESS:       ; %bb.0: ; %entry
1901; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1902; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1903; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1904; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1905; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1906; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1907; GFX7LESS-NEXT:    buffer_wbinvl1
1908; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1909; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1910; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1911; GFX7LESS-NEXT:    s_endpgm
1912;
1913; GFX8-LABEL: sub_i32_varying:
1914; GFX8:       ; %bb.0: ; %entry
1915; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1916; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1917; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1918; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1919; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1920; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1921; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1922; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1923; GFX8-NEXT:    s_not_b64 exec, exec
1924; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1925; GFX8-NEXT:    s_not_b64 exec, exec
1926; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1927; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1928; GFX8-NEXT:    s_nop 1
1929; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1930; GFX8-NEXT:    s_nop 1
1931; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1932; GFX8-NEXT:    s_nop 1
1933; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1934; GFX8-NEXT:    s_nop 1
1935; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1936; GFX8-NEXT:    s_nop 1
1937; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1938; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
1939; GFX8-NEXT:    s_nop 0
1940; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1941; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1942; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1943; GFX8-NEXT:    ; implicit-def: $vgpr0
1944; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1945; GFX8-NEXT:    s_cbranch_execz BB10_2
1946; GFX8-NEXT:  ; %bb.1:
1947; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1948; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1949; GFX8-NEXT:    s_mov_b32 m0, -1
1950; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1951; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1952; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1953; GFX8-NEXT:    buffer_wbinvl1_vol
1954; GFX8-NEXT:  BB10_2:
1955; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1956; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1957; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1958; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1959; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1960; GFX8-NEXT:    s_mov_b32 s2, -1
1961; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1963; GFX8-NEXT:    s_endpgm
1964;
1965; GFX9-LABEL: sub_i32_varying:
1966; GFX9:       ; %bb.0: ; %entry
1967; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1968; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1969; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1970; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1971; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1972; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1973; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1974; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1975; GFX9-NEXT:    s_not_b64 exec, exec
1976; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1977; GFX9-NEXT:    s_not_b64 exec, exec
1978; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1979; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1980; GFX9-NEXT:    s_nop 1
1981; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1982; GFX9-NEXT:    s_nop 1
1983; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1984; GFX9-NEXT:    s_nop 1
1985; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1986; GFX9-NEXT:    s_nop 1
1987; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1988; GFX9-NEXT:    s_nop 1
1989; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1990; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
1991; GFX9-NEXT:    s_nop 0
1992; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1993; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1994; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1995; GFX9-NEXT:    ; implicit-def: $vgpr0
1996; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1997; GFX9-NEXT:    s_cbranch_execz BB10_2
1998; GFX9-NEXT:  ; %bb.1:
1999; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2000; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2001; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2002; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2003; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2004; GFX9-NEXT:    buffer_wbinvl1_vol
2005; GFX9-NEXT:  BB10_2:
2006; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2007; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2008; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2009; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2010; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2011; GFX9-NEXT:    s_mov_b32 s2, -1
2012; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2013; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2014; GFX9-NEXT:    s_endpgm
2015;
2016; GFX1064-LABEL: sub_i32_varying:
2017; GFX1064:       ; %bb.0: ; %entry
2018; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2019; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2020; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2021; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2022; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2023; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2024; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2025; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2026; GFX1064-NEXT:    s_not_b64 exec, exec
2027; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2028; GFX1064-NEXT:    s_not_b64 exec, exec
2029; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2030; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2031; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2032; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2033; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2034; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2035; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2036; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2037; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2038; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2039; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2040; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2041; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2042; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2043; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2044; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2045; GFX1064-NEXT:    s_mov_b32 s2, -1
2046; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2047; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2048; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2049; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2050; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2051; GFX1064-NEXT:    ; implicit-def: $vgpr0
2052; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2053; GFX1064-NEXT:    s_cbranch_execz BB10_2
2054; GFX1064-NEXT:  ; %bb.1:
2055; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2056; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
2057; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2058; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2059; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2060; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2061; GFX1064-NEXT:    buffer_gl0_inv
2062; GFX1064-NEXT:    buffer_gl1_inv
2063; GFX1064-NEXT:  BB10_2:
2064; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2065; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2066; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2067; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2068; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2069; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2070; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2071; GFX1064-NEXT:    s_nop 0
2072; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2073; GFX1064-NEXT:    s_endpgm
2074;
2075; GFX1032-LABEL: sub_i32_varying:
2076; GFX1032:       ; %bb.0: ; %entry
2077; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2078; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
2079; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2080; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2081; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
2082; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2083; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
2084; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2085; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2086; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2087; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2088; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2089; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2090; GFX1032-NEXT:    s_mov_b32 s2, -1
2091; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2092; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2093; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2094; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2095; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2096; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2097; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2098; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2099; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2100; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2101; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2102; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2103; GFX1032-NEXT:    ; implicit-def: $vgpr0
2104; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2105; GFX1032-NEXT:    s_cbranch_execz BB10_2
2106; GFX1032-NEXT:  ; %bb.1:
2107; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2108; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
2109; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2110; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2111; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2112; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2113; GFX1032-NEXT:    buffer_gl0_inv
2114; GFX1032-NEXT:    buffer_gl1_inv
2115; GFX1032-NEXT:  BB10_2:
2116; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2117; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2118; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2119; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2120; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2121; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2122; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2123; GFX1032-NEXT:    s_nop 0
2124; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2125; GFX1032-NEXT:    s_endpgm
2126entry:
2127  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2128  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2129  store i32 %old, i32 addrspace(1)* %out
2130  ret void
2131}
2132
2133define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2134; Atomically subtracts the constant 5 from the 64-bit LDS variable @local_var64
2135; (acq_rel) and stores the value returned by the atomic to %out.
;
; What the autogenerated checks below pin down for every llc configuration:
;   * v_mbcnt_* on a copy of exec feeds a compare against 0, and s_and_saveexec
;     guards %bb.1, so presumably only the first active lane issues the atomic.
;   * %bb.1 issues a single ds_sub_rtn_u64 whose operand is the s_bcnt1 result
;     of the exec copy multiplied by 5 (v_mul_u32_u24 / v_mul_hi_u32_u24).
;   * After BB11_2, v_readfirstlane reads the returned value and each lane
;     subtracts 5 times its own v_mbcnt result from it before the store.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them instead of editing by hand.
2136; GFX7LESS-LABEL: sub_i64_constant:
2137; GFX7LESS:       ; %bb.0: ; %entry
2138; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2139; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2140; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2141; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2142; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2143; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2144; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2145; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2146; GFX7LESS-NEXT:  ; %bb.1:
2147; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2148; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2149; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2150; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2151; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2152; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2153; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2154; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2155; GFX7LESS-NEXT:    buffer_wbinvl1
2156; GFX7LESS-NEXT:  BB11_2:
2157; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2158; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2159; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2160; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2161; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2162; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2163; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2164; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2165; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2166; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2167; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2169; GFX7LESS-NEXT:    s_endpgm
2170;
2171; GFX8-LABEL: sub_i64_constant:
2172; GFX8:       ; %bb.0: ; %entry
2173; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2174; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2175; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2176; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2177; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2178; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2179; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2180; GFX8-NEXT:    s_cbranch_execz BB11_2
2181; GFX8-NEXT:  ; %bb.1:
2182; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2183; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2184; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2185; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2186; GFX8-NEXT:    s_mov_b32 m0, -1
2187; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2188; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2189; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2190; GFX8-NEXT:    buffer_wbinvl1_vol
2191; GFX8-NEXT:  BB11_2:
2192; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2193; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2194; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2195; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2196; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2197; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2198; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2199; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2200; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2201; GFX8-NEXT:    s_mov_b32 s2, -1
2202; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2204; GFX8-NEXT:    s_endpgm
2205;
2206; GFX9-LABEL: sub_i64_constant:
2207; GFX9:       ; %bb.0: ; %entry
2208; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2209; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2210; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2211; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2212; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2213; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2214; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2215; GFX9-NEXT:    s_cbranch_execz BB11_2
2216; GFX9-NEXT:  ; %bb.1:
2217; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2218; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2219; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2220; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2221; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2222; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2223; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2224; GFX9-NEXT:    buffer_wbinvl1_vol
2225; GFX9-NEXT:  BB11_2:
2226; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2227; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2228; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2229; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2230; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2231; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2232; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2233; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2234; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2235; GFX9-NEXT:    s_mov_b32 s2, -1
2236; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2237; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2238; GFX9-NEXT:    s_endpgm
2239;
2240; GFX1064-LABEL: sub_i64_constant:
2241; GFX1064:       ; %bb.0: ; %entry
2242; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2243; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2244; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2245; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2246; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2247; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2248; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2249; GFX1064-NEXT:    s_cbranch_execz BB11_2
2250; GFX1064-NEXT:  ; %bb.1:
2251; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2252; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2253; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2254; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2255; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2256; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2257; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2258; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2259; GFX1064-NEXT:    buffer_gl0_inv
2260; GFX1064-NEXT:    buffer_gl1_inv
2261; GFX1064-NEXT:  BB11_2:
2262; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2263; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2264; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2265; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2266; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2267; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2268; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2269; GFX1064-NEXT:    s_mov_b32 s2, -1
2270; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2271; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2272; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2273; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2274; GFX1064-NEXT:    s_endpgm
2275;
2276; GFX1032-LABEL: sub_i64_constant:
2277; GFX1032:       ; %bb.0: ; %entry
2278; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2279; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2280; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2281; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2282; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2283; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2284; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2285; GFX1032-NEXT:    s_cbranch_execz BB11_2
2286; GFX1032-NEXT:  ; %bb.1:
2287; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2288; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2289; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2290; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2291; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2292; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2293; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2294; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2295; GFX1032-NEXT:    buffer_gl0_inv
2296; GFX1032-NEXT:    buffer_gl1_inv
2297; GFX1032-NEXT:  BB11_2:
2298; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2299; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2300; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2301; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2302; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2303; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2304; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2305; GFX1032-NEXT:    s_mov_b32 s2, -1
2306; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2307; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2308; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2309; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2310; GFX1032-NEXT:    s_endpgm
2311entry:
2312  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2313  store i64 %old, i64 addrspace(1)* %out
2314  ret void
2315}
2316
2317define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2318; Atomically subtracts the kernel argument %subitive from the 64-bit LDS
2319; variable @local_var64 (acq_rel) and stores the atomic's result to %out.
;
; What the autogenerated checks below pin down for every llc configuration:
;   * v_mbcnt_* on a copy of exec, a compare against 0 and s_and_saveexec guard
;     %bb.1, so presumably only the first active lane issues the atomic.
;   * %bb.1 multiplies s[2:3] (loaded together with s[0:1] by s_load_dwordx4;
;     presumably %subitive) by the s_bcnt1 result of the exec copy, using
;     s_mul_i32 plus v_mul_hi_u32 (GFX7LESS, GFX8) or s_mul_hi_u32 (GFX9 and
;     both gfx1010 runs), then issues a single ds_sub_rtn_u64 with the product.
;   * After BB12_2, each lane multiplies s[2:3] by its own v_mbcnt result and
;     subtracts that from the v_readfirstlane copy of the atomic's result.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them instead of editing by hand.
2320; GFX7LESS-LABEL: sub_i64_uniform:
2321; GFX7LESS:       ; %bb.0: ; %entry
2322; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2323; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2324; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2325; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2326; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2327; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2328; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2329; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2330; GFX7LESS-NEXT:  ; %bb.1:
2331; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2332; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2333; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2334; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2335; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2336; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2337; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2338; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2339; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2340; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2341; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2342; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2343; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2344; GFX7LESS-NEXT:    buffer_wbinvl1
2345; GFX7LESS-NEXT:  BB12_2:
2346; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2347; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2348; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2349; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2350; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2351; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2352; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2353; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2354; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2355; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2356; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2357; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2358; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2359; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2360; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2361; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2362; GFX7LESS-NEXT:    s_endpgm
2363;
2364; GFX8-LABEL: sub_i64_uniform:
2365; GFX8:       ; %bb.0: ; %entry
2366; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2367; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2368; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2369; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2370; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2371; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2372; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2373; GFX8-NEXT:    s_cbranch_execz BB12_2
2374; GFX8-NEXT:  ; %bb.1:
2375; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2376; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2377; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2378; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2379; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2380; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2381; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2382; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2383; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2384; GFX8-NEXT:    s_mov_b32 m0, -1
2385; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2386; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2387; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2388; GFX8-NEXT:    buffer_wbinvl1_vol
2389; GFX8-NEXT:  BB12_2:
2390; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2391; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2392; GFX8-NEXT:    s_mov_b32 s4, s0
2393; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2394; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2395; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2396; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2397; GFX8-NEXT:    s_mov_b32 s5, s1
2398; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2399; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2400; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2401; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2402; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2403; GFX8-NEXT:    s_mov_b32 s6, -1
2404; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2405; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2406; GFX8-NEXT:    s_endpgm
2407;
2408; GFX9-LABEL: sub_i64_uniform:
2409; GFX9:       ; %bb.0: ; %entry
2410; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2411; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2412; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2413; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2414; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2415; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2416; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2417; GFX9-NEXT:    s_cbranch_execz BB12_2
2418; GFX9-NEXT:  ; %bb.1:
2419; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2421; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2422; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2423; GFX9-NEXT:    s_add_i32 s8, s8, s7
2424; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2425; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2426; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2427; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2428; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2429; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2430; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2431; GFX9-NEXT:    buffer_wbinvl1_vol
2432; GFX9-NEXT:  BB12_2:
2433; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2434; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2435; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2436; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2437; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2438; GFX9-NEXT:    s_mov_b32 s4, s0
2439; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2440; GFX9-NEXT:    s_mov_b32 s5, s1
2441; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2442; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2443; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2444; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2445; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2446; GFX9-NEXT:    s_mov_b32 s6, -1
2447; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2448; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2449; GFX9-NEXT:    s_endpgm
2450;
2451; GFX1064-LABEL: sub_i64_uniform:
2452; GFX1064:       ; %bb.0: ; %entry
2453; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2454; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2455; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2456; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2457; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2458; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2459; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2460; GFX1064-NEXT:    s_cbranch_execz BB12_2
2461; GFX1064-NEXT:  ; %bb.1:
2462; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2463; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2464; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2465; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2466; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2467; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2468; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2469; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2470; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2471; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2472; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2473; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2474; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2475; GFX1064-NEXT:    buffer_gl0_inv
2476; GFX1064-NEXT:    buffer_gl1_inv
2477; GFX1064-NEXT:  BB12_2:
2478; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2479; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2480; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2482; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2483; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2484; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2485; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2486; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2487; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2488; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v0
2489; GFX1064-NEXT:    s_mov_b32 s2, -1
2490; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2491; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2492; GFX1064-NEXT:    s_endpgm
2493;
2494; GFX1032-LABEL: sub_i64_uniform:
2495; GFX1032:       ; %bb.0: ; %entry
2496; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2497; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2498; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2499; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2500; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2501; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2502; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2503; GFX1032-NEXT:    s_cbranch_execz BB12_2
2504; GFX1032-NEXT:  ; %bb.1:
2505; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2506; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2507; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2508; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2509; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2510; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2511; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2512; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2513; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2514; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2515; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2516; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2517; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2518; GFX1032-NEXT:    buffer_gl0_inv
2519; GFX1032-NEXT:    buffer_gl1_inv
2520; GFX1032-NEXT:  BB12_2:
2521; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2522; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2523; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2525; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2526; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2527; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2528; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2529; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2530; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2531; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v0
2532; GFX1032-NEXT:    s_mov_b32 s2, -1
2533; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2534; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2535; GFX1032-NEXT:    s_endpgm
2536entry:
2537  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2538  store i64 %old, i64 addrspace(1)* %out
2539  ret void
2540}
2541
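; NOTE(review): none of the RUN lines at the top of this file enables a GCN
; check prefix, so FileCheck does not evaluate the three GCN-NOT directives
; below. Presumably they record that the i64 varying case is expected not to
; use the mbcnt/bcnt sequence; the per-target NEXT chains in the following
; function are what actually pin that down.
2542; GCN-NOT: v_mbcnt_lo_u32_b32
2543; GCN-NOT: v_mbcnt_hi_u32_b32
2544; GCN-NOT: s_bcnt1_i32_b64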
2545define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2546; Atomically subtracts the zero-extended workitem id (a per-lane value) from
2547; the 64-bit LDS variable @local_var64 (acq_rel) and stores the result to %out.
;
; What the autogenerated checks below pin down for every llc configuration:
;   * There is no v_mbcnt_*, s_bcnt1_* or s_and_saveexec sequence and no
;     branch; the expected code is a single straight-line block.
;   * One ds_sub_rtn_u64 is issued on v[0:1] with v1 set to 0 (presumably v0
;     still holds the workitem id, matching the zext in the IR), and its
;     result is stored directly by buffer_store_dwordx2.
; This differs from the checks of the constant and uniform i64 sub tests in
; this file, which expect the atomic to be issued under an exec mask.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them instead of editing by hand.
2548; GFX7LESS-LABEL: sub_i64_varying:
2549; GFX7LESS:       ; %bb.0: ; %entry
2550; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2551; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2552; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2553; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2554; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2555; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2556; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2557; GFX7LESS-NEXT:    buffer_wbinvl1
2558; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2559; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2560; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2561; GFX7LESS-NEXT:    s_endpgm
2562;
2563; GFX8-LABEL: sub_i64_varying:
2564; GFX8:       ; %bb.0: ; %entry
2565; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2566; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2567; GFX8-NEXT:    s_mov_b32 m0, -1
2568; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2569; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2570; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2571; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2572; GFX8-NEXT:    buffer_wbinvl1_vol
2573; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2574; GFX8-NEXT:    s_mov_b32 s2, -1
2575; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2576; GFX8-NEXT:    s_endpgm
2577;
2578; GFX9-LABEL: sub_i64_varying:
2579; GFX9:       ; %bb.0: ; %entry
2580; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2581; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2582; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2583; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2584; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2585; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2586; GFX9-NEXT:    buffer_wbinvl1_vol
2587; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2588; GFX9-NEXT:    s_mov_b32 s2, -1
2589; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2590; GFX9-NEXT:    s_endpgm
2591;
2592; GFX1064-LABEL: sub_i64_varying:
2593; GFX1064:       ; %bb.0: ; %entry
2594; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2595; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2596; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2597; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2598; GFX1064-NEXT:    s_mov_b32 s2, -1
2599; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2600; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2601; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2602; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2603; GFX1064-NEXT:    buffer_gl0_inv
2604; GFX1064-NEXT:    buffer_gl1_inv
2605; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2606; GFX1064-NEXT:    s_endpgm
2607;
2608; GFX1032-LABEL: sub_i64_varying:
2609; GFX1032:       ; %bb.0: ; %entry
2610; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2611; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2612; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2613; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2614; GFX1032-NEXT:    s_mov_b32 s2, -1
2615; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2616; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2617; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2618; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2619; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2620; GFX1032-NEXT:    buffer_gl0_inv
2621; GFX1032-NEXT:    buffer_gl1_inv
2622; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2623; GFX1032-NEXT:    s_endpgm
2624entry:
2625  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2626  %zext = zext i32 %lane to i64
2627  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2628  store i64 %old, i64 addrspace(1)* %out
2629  ret void
2630}
2631
2632define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2633;
2634;
2635; GFX7LESS-LABEL: and_i32_varying:
2636; GFX7LESS:       ; %bb.0: ; %entry
2637; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2638; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2639; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2640; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2641; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2642; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2643; GFX7LESS-NEXT:    buffer_wbinvl1
2644; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2645; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2646; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2647; GFX7LESS-NEXT:    s_endpgm
2648;
2649; GFX8-LABEL: and_i32_varying:
2650; GFX8:       ; %bb.0: ; %entry
2651; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2652; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2653; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2654; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2655; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2656; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2657; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2658; GFX8-NEXT:    s_not_b64 exec, exec
2659; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2660; GFX8-NEXT:    s_not_b64 exec, exec
2661; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2662; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2663; GFX8-NEXT:    s_nop 1
2664; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2665; GFX8-NEXT:    s_nop 1
2666; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2667; GFX8-NEXT:    s_nop 1
2668; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2669; GFX8-NEXT:    s_nop 1
2670; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2671; GFX8-NEXT:    s_nop 1
2672; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2673; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2674; GFX8-NEXT:    s_nop 0
2675; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2676; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2677; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2678; GFX8-NEXT:    ; implicit-def: $vgpr0
2679; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2680; GFX8-NEXT:    s_cbranch_execz BB14_2
2681; GFX8-NEXT:  ; %bb.1:
2682; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2683; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2684; GFX8-NEXT:    s_mov_b32 m0, -1
2685; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2686; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2687; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2688; GFX8-NEXT:    buffer_wbinvl1_vol
2689; GFX8-NEXT:  BB14_2:
2690; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2691; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2692; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2693; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2694; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2695; GFX8-NEXT:    s_mov_b32 s2, -1
2696; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2698; GFX8-NEXT:    s_endpgm
2699;
2700; GFX9-LABEL: and_i32_varying:
2701; GFX9:       ; %bb.0: ; %entry
2702; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2703; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2704; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2705; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2706; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2707; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2708; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2709; GFX9-NEXT:    s_not_b64 exec, exec
2710; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2711; GFX9-NEXT:    s_not_b64 exec, exec
2712; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2713; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2714; GFX9-NEXT:    s_nop 1
2715; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2716; GFX9-NEXT:    s_nop 1
2717; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2718; GFX9-NEXT:    s_nop 1
2719; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2720; GFX9-NEXT:    s_nop 1
2721; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2722; GFX9-NEXT:    s_nop 1
2723; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2724; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2725; GFX9-NEXT:    s_nop 0
2726; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2727; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2728; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2729; GFX9-NEXT:    ; implicit-def: $vgpr0
2730; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2731; GFX9-NEXT:    s_cbranch_execz BB14_2
2732; GFX9-NEXT:  ; %bb.1:
2733; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2734; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2735; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2736; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2737; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2738; GFX9-NEXT:    buffer_wbinvl1_vol
2739; GFX9-NEXT:  BB14_2:
2740; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2741; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2742; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2743; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2744; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2745; GFX9-NEXT:    s_mov_b32 s2, -1
2746; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2747; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2748; GFX9-NEXT:    s_endpgm
2749;
2750; GFX1064-LABEL: and_i32_varying:
2751; GFX1064:       ; %bb.0: ; %entry
2752; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2753; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
2754; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2755; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
2756; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2757; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2758; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2759; GFX1064-NEXT:    s_not_b64 exec, exec
2760; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
2761; GFX1064-NEXT:    s_not_b64 exec, exec
2762; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2763; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2764; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2765; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2766; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2767; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2768; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2769; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2770; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2771; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2772; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2773; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2774; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2775; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2776; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2777; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2778; GFX1064-NEXT:    s_mov_b32 s2, -1
2779; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2780; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2781; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2782; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2783; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
2784; GFX1064-NEXT:    ; implicit-def: $vgpr0
2785; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2786; GFX1064-NEXT:    s_cbranch_execz BB14_2
2787; GFX1064-NEXT:  ; %bb.1:
2788; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2789; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
2790; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2791; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2792; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2793; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2794; GFX1064-NEXT:    buffer_gl0_inv
2795; GFX1064-NEXT:    buffer_gl1_inv
2796; GFX1064-NEXT:  BB14_2:
2797; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2798; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2799; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2800; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2801; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2802; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2803; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2804; GFX1064-NEXT:    s_nop 0
2805; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2806; GFX1064-NEXT:    s_endpgm
2807;
2808; GFX1032-LABEL: and_i32_varying:
2809; GFX1032:       ; %bb.0: ; %entry
2810; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2811; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
2812; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2813; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2814; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2815; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2816; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2817; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2818; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
2819; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2820; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2821; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2822; GFX1032-NEXT:    s_mov_b32 s2, -1
2823; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2824; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2825; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2826; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2827; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2828; GFX1032-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2829; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2830; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2831; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2832; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2833; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2834; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
2835; GFX1032-NEXT:    ; implicit-def: $vgpr0
2836; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2837; GFX1032-NEXT:    s_cbranch_execz BB14_2
2838; GFX1032-NEXT:  ; %bb.1:
2839; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2840; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
2841; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2842; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2843; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2844; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2845; GFX1032-NEXT:    buffer_gl0_inv
2846; GFX1032-NEXT:    buffer_gl1_inv
2847; GFX1032-NEXT:  BB14_2:
2848; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2849; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2850; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2851; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2852; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2853; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2854; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2855; GFX1032-NEXT:    s_nop 0
2856; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2857; GFX1032-NEXT:    s_endpgm
2858entry:
2859  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2860  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2861  store i32 %old, i32 addrspace(1)* %out
2862  ret void
2863}
2864
; or_i32_varying: issues an acq_rel 'atomicrmw or' on @local_var32 (an
; addrspace(3) global declared at the top of this file) using the value returned
; by llvm.amdgcn.workitem.id.x as the operand, then stores the value returned by
; the atomic to %out.
;
; What the expectations below show:
;  * For the RUN line without -mcpu, the per-lane value in v0 is passed straight
;    to a single ds_or_rtn_b32; no DPP sequence appears.
;  * For the tonga, gfx900 and both gfx1010 RUN lines, currently-disabled lanes
;    of the scan register are filled with 0, a chain of v_or_b32_dpp
;    instructions combines lanes, one ds_or_rtn_b32 is executed under a mask
;    derived from comparing the mbcnt result with 0, and afterwards the
;    readfirstlane'd atomic result is OR'ed with the per-lane value in v1.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
2865define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2866;
2867;
2868; GFX7LESS-LABEL: or_i32_varying:
2869; GFX7LESS:       ; %bb.0: ; %entry
2870; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2871; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2872; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2873; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2874; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2875; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2876; GFX7LESS-NEXT:    buffer_wbinvl1
2877; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2878; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2879; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2880; GFX7LESS-NEXT:    s_endpgm
2881;
2882; GFX8-LABEL: or_i32_varying:
2883; GFX8:       ; %bb.0: ; %entry
2884; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2885; GFX8-NEXT:    s_mov_b64 s[2:3], exec
2886; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2887; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2888; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2889; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2890; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2891; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2892; GFX8-NEXT:    s_not_b64 exec, exec
2893; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2894; GFX8-NEXT:    s_not_b64 exec, exec
2895; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2896; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2897; GFX8-NEXT:    s_nop 1
2898; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2899; GFX8-NEXT:    s_nop 1
2900; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2901; GFX8-NEXT:    s_nop 1
2902; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2903; GFX8-NEXT:    s_nop 1
2904; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2905; GFX8-NEXT:    s_nop 1
2906; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2907; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2908; GFX8-NEXT:    s_nop 0
2909; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2910; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2911; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2912; GFX8-NEXT:    ; implicit-def: $vgpr0
2913; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2914; GFX8-NEXT:    s_cbranch_execz BB15_2
2915; GFX8-NEXT:  ; %bb.1:
2916; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2917; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2918; GFX8-NEXT:    s_mov_b32 m0, -1
2919; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2920; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2921; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2922; GFX8-NEXT:    buffer_wbinvl1_vol
2923; GFX8-NEXT:  BB15_2:
2924; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2925; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2926; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2927; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2928; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2929; GFX8-NEXT:    s_mov_b32 s2, -1
2930; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2931; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2932; GFX8-NEXT:    s_endpgm
2933;
2934; GFX9-LABEL: or_i32_varying:
2935; GFX9:       ; %bb.0: ; %entry
2936; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2937; GFX9-NEXT:    s_mov_b64 s[2:3], exec
2938; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2939; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2940; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2941; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2942; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2943; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2944; GFX9-NEXT:    s_not_b64 exec, exec
2945; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2946; GFX9-NEXT:    s_not_b64 exec, exec
2947; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2948; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2949; GFX9-NEXT:    s_nop 1
2950; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2951; GFX9-NEXT:    s_nop 1
2952; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2953; GFX9-NEXT:    s_nop 1
2954; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2955; GFX9-NEXT:    s_nop 1
2956; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2957; GFX9-NEXT:    s_nop 1
2958; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2959; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2960; GFX9-NEXT:    s_nop 0
2961; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2962; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2963; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2964; GFX9-NEXT:    ; implicit-def: $vgpr0
2965; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2966; GFX9-NEXT:    s_cbranch_execz BB15_2
2967; GFX9-NEXT:  ; %bb.1:
2968; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2969; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2970; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2971; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2972; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2973; GFX9-NEXT:    buffer_wbinvl1_vol
2974; GFX9-NEXT:  BB15_2:
2975; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2976; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2977; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2978; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2979; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2980; GFX9-NEXT:    s_mov_b32 s2, -1
2981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2982; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2983; GFX9-NEXT:    s_endpgm
2984;
2985; GFX1064-LABEL: or_i32_varying:
2986; GFX1064:       ; %bb.0: ; %entry
2987; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2988; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2989; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2990; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2991; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2992; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2993; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2994; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2995; GFX1064-NEXT:    s_not_b64 exec, exec
2996; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2997; GFX1064-NEXT:    s_not_b64 exec, exec
2998; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2999; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3000; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3001; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3002; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3003; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3004; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3005; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3006; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3007; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3008; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3009; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3010; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3011; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3012; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3013; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3014; GFX1064-NEXT:    s_mov_b32 s2, -1
3015; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3016; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3017; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3018; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3019; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3020; GFX1064-NEXT:    ; implicit-def: $vgpr0
3021; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3022; GFX1064-NEXT:    s_cbranch_execz BB15_2
3023; GFX1064-NEXT:  ; %bb.1:
3024; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3025; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
3026; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3027; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3028; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
3029; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3030; GFX1064-NEXT:    buffer_gl0_inv
3031; GFX1064-NEXT:    buffer_gl1_inv
3032; GFX1064-NEXT:  BB15_2:
3033; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3034; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3035; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3036; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3037; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3038; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3039; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3040; GFX1064-NEXT:    s_nop 0
3041; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3042; GFX1064-NEXT:    s_endpgm
3043;
3044; GFX1032-LABEL: or_i32_varying:
3045; GFX1032:       ; %bb.0: ; %entry
3046; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3047; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
3048; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3049; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3050; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
3051; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3052; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
3053; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3054; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3055; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3056; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3057; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3058; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3059; GFX1032-NEXT:    s_mov_b32 s2, -1
3060; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3061; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3062; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3063; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3064; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3065; GFX1032-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3066; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3067; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3068; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3069; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3070; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3071; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3072; GFX1032-NEXT:    ; implicit-def: $vgpr0
3073; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3074; GFX1032-NEXT:    s_cbranch_execz BB15_2
3075; GFX1032-NEXT:  ; %bb.1:
3076; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3077; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
3078; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3079; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3080; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
3081; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3082; GFX1032-NEXT:    buffer_gl0_inv
3083; GFX1032-NEXT:    buffer_gl1_inv
3084; GFX1032-NEXT:  BB15_2:
3085; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3086; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3087; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3088; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3089; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3090; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3091; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3092; GFX1032-NEXT:    s_nop 0
3093; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3094; GFX1032-NEXT:    s_endpgm
3095entry:
  ; Operand of the atomic: the workitem id in x, so each lane supplies its own value.
3096  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic OR into @local_var32; %old is the value the atomicrmw returns.
3097  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) pointer argument.
3098  store i32 %old, i32 addrspace(1)* %out
3099  ret void
3100}
3101
; xor_i32_varying: issues an acq_rel 'atomicrmw xor' on @local_var32 (an
; addrspace(3) global declared at the top of this file) using the value returned
; by llvm.amdgcn.workitem.id.x as the operand, then stores the value returned by
; the atomic to %out.
;
; What the expectations below show:
;  * For the RUN line without -mcpu, the per-lane value in v0 is passed straight
;    to a single ds_xor_rtn_b32; no DPP sequence appears.
;  * For the tonga, gfx900 and both gfx1010 RUN lines, currently-disabled lanes
;    of the scan register are filled with 0, a chain of v_xor_b32_dpp
;    instructions combines lanes, one ds_xor_rtn_b32 is executed under a mask
;    derived from comparing the mbcnt result with 0, and afterwards the
;    readfirstlane'd atomic result is XOR'ed with the per-lane value in v1.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
3102define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3103;
3104;
3105; GFX7LESS-LABEL: xor_i32_varying:
3106; GFX7LESS:       ; %bb.0: ; %entry
3107; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3108; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3109; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3110; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3111; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3112; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3113; GFX7LESS-NEXT:    buffer_wbinvl1
3114; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3115; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3116; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3117; GFX7LESS-NEXT:    s_endpgm
3118;
3119; GFX8-LABEL: xor_i32_varying:
3120; GFX8:       ; %bb.0: ; %entry
3121; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3122; GFX8-NEXT:    s_mov_b64 s[2:3], exec
3123; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3124; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3125; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3126; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3127; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3128; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3129; GFX8-NEXT:    s_not_b64 exec, exec
3130; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3131; GFX8-NEXT:    s_not_b64 exec, exec
3132; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3133; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3134; GFX8-NEXT:    s_nop 1
3135; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3136; GFX8-NEXT:    s_nop 1
3137; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3138; GFX8-NEXT:    s_nop 1
3139; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3140; GFX8-NEXT:    s_nop 1
3141; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3142; GFX8-NEXT:    s_nop 1
3143; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3144; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3145; GFX8-NEXT:    s_nop 0
3146; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3147; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3148; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3149; GFX8-NEXT:    ; implicit-def: $vgpr0
3150; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3151; GFX8-NEXT:    s_cbranch_execz BB16_2
3152; GFX8-NEXT:  ; %bb.1:
3153; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3154; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3155; GFX8-NEXT:    s_mov_b32 m0, -1
3156; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3157; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3158; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3159; GFX8-NEXT:    buffer_wbinvl1_vol
3160; GFX8-NEXT:  BB16_2:
3161; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3162; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3163; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3164; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3165; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3166; GFX8-NEXT:    s_mov_b32 s2, -1
3167; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3168; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3169; GFX8-NEXT:    s_endpgm
3170;
3171; GFX9-LABEL: xor_i32_varying:
3172; GFX9:       ; %bb.0: ; %entry
3173; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3174; GFX9-NEXT:    s_mov_b64 s[2:3], exec
3175; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3176; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3177; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3178; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3179; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3180; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3181; GFX9-NEXT:    s_not_b64 exec, exec
3182; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3183; GFX9-NEXT:    s_not_b64 exec, exec
3184; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3185; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3186; GFX9-NEXT:    s_nop 1
3187; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3188; GFX9-NEXT:    s_nop 1
3189; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3190; GFX9-NEXT:    s_nop 1
3191; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3192; GFX9-NEXT:    s_nop 1
3193; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3194; GFX9-NEXT:    s_nop 1
3195; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3196; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3197; GFX9-NEXT:    s_nop 0
3198; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3199; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3200; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3201; GFX9-NEXT:    ; implicit-def: $vgpr0
3202; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3203; GFX9-NEXT:    s_cbranch_execz BB16_2
3204; GFX9-NEXT:  ; %bb.1:
3205; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3206; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3207; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3208; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3209; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3210; GFX9-NEXT:    buffer_wbinvl1_vol
3211; GFX9-NEXT:  BB16_2:
3212; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3213; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3214; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3215; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3216; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3217; GFX9-NEXT:    s_mov_b32 s2, -1
3218; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3219; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3220; GFX9-NEXT:    s_endpgm
3221;
3222; GFX1064-LABEL: xor_i32_varying:
3223; GFX1064:       ; %bb.0: ; %entry
3224; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3225; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
3226; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3227; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3228; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3229; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3230; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3231; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3232; GFX1064-NEXT:    s_not_b64 exec, exec
3233; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3234; GFX1064-NEXT:    s_not_b64 exec, exec
3235; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3236; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3237; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3238; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3239; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3240; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3241; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3242; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3243; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3244; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3245; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3246; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3247; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3248; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3249; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3250; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3251; GFX1064-NEXT:    s_mov_b32 s2, -1
3252; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3253; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3254; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3255; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3256; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3257; GFX1064-NEXT:    ; implicit-def: $vgpr0
3258; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3259; GFX1064-NEXT:    s_cbranch_execz BB16_2
3260; GFX1064-NEXT:  ; %bb.1:
3261; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3262; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
3263; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3264; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3265; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3266; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3267; GFX1064-NEXT:    buffer_gl0_inv
3268; GFX1064-NEXT:    buffer_gl1_inv
3269; GFX1064-NEXT:  BB16_2:
3270; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3271; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3272; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3273; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3274; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3275; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3276; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3277; GFX1064-NEXT:    s_nop 0
3278; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3279; GFX1064-NEXT:    s_endpgm
3280;
3281; GFX1032-LABEL: xor_i32_varying:
3282; GFX1032:       ; %bb.0: ; %entry
3283; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3284; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
3285; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3286; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3287; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
3288; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3289; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
3290; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3291; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3292; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3293; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3294; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3295; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3296; GFX1032-NEXT:    s_mov_b32 s2, -1
3297; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3298; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3299; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3300; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3301; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3302; GFX1032-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3303; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3304; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3305; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3306; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3307; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3308; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3309; GFX1032-NEXT:    ; implicit-def: $vgpr0
3310; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3311; GFX1032-NEXT:    s_cbranch_execz BB16_2
3312; GFX1032-NEXT:  ; %bb.1:
3313; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3314; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
3315; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3316; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3317; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3318; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3319; GFX1032-NEXT:    buffer_gl0_inv
3320; GFX1032-NEXT:    buffer_gl1_inv
3321; GFX1032-NEXT:  BB16_2:
3322; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3323; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3324; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3325; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3326; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3327; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3328; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3329; GFX1032-NEXT:    s_nop 0
3330; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3331; GFX1032-NEXT:    s_endpgm
3332entry:
  ; Operand of the atomic: the workitem id in x, so each lane supplies its own value.
3333  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic XOR into @local_var32; %old is the value the atomicrmw returns.
3334  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) pointer argument.
3335  store i32 %old, i32 addrspace(1)* %out
3336  ret void
3337}
3338
3339define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3340;
3341;
3342; GFX7LESS-LABEL: max_i32_varying:
3343; GFX7LESS:       ; %bb.0: ; %entry
3344; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3345; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3346; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3347; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3348; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3349; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3350; GFX7LESS-NEXT:    buffer_wbinvl1
3351; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3352; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3353; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3354; GFX7LESS-NEXT:    s_endpgm
3355;
3356; GFX8-LABEL: max_i32_varying:
3357; GFX8:       ; %bb.0: ; %entry
3358; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3359; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3360; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3361; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3362; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3363; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3364; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3365; GFX8-NEXT:    s_not_b64 exec, exec
3366; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3367; GFX8-NEXT:    s_not_b64 exec, exec
3368; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3369; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3370; GFX8-NEXT:    s_nop 1
3371; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3372; GFX8-NEXT:    s_nop 1
3373; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3374; GFX8-NEXT:    s_nop 1
3375; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3376; GFX8-NEXT:    s_nop 1
3377; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3378; GFX8-NEXT:    s_nop 1
3379; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3380; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3381; GFX8-NEXT:    s_nop 0
3382; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3383; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3384; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3385; GFX8-NEXT:    ; implicit-def: $vgpr0
3386; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3387; GFX8-NEXT:    s_cbranch_execz BB17_2
3388; GFX8-NEXT:  ; %bb.1:
3389; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3390; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3391; GFX8-NEXT:    s_mov_b32 m0, -1
3392; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3393; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3394; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3395; GFX8-NEXT:    buffer_wbinvl1_vol
3396; GFX8-NEXT:  BB17_2:
3397; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3398; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3399; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3400; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3401; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3402; GFX8-NEXT:    s_mov_b32 s2, -1
3403; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3404; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3405; GFX8-NEXT:    s_endpgm
3406;
3407; GFX9-LABEL: max_i32_varying:
3408; GFX9:       ; %bb.0: ; %entry
3409; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3410; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3411; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3412; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3413; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3414; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3415; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3416; GFX9-NEXT:    s_not_b64 exec, exec
3417; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3418; GFX9-NEXT:    s_not_b64 exec, exec
3419; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3420; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3421; GFX9-NEXT:    s_nop 1
3422; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3423; GFX9-NEXT:    s_nop 1
3424; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3425; GFX9-NEXT:    s_nop 1
3426; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3427; GFX9-NEXT:    s_nop 1
3428; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3429; GFX9-NEXT:    s_nop 1
3430; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3431; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3432; GFX9-NEXT:    s_nop 0
3433; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3434; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3435; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3436; GFX9-NEXT:    ; implicit-def: $vgpr0
3437; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3438; GFX9-NEXT:    s_cbranch_execz BB17_2
3439; GFX9-NEXT:  ; %bb.1:
3440; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3441; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3442; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3443; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3444; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3445; GFX9-NEXT:    buffer_wbinvl1_vol
3446; GFX9-NEXT:  BB17_2:
3447; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3448; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3449; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3450; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3451; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3452; GFX9-NEXT:    s_mov_b32 s2, -1
3453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3454; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3455; GFX9-NEXT:    s_endpgm
3456;
3457; GFX1064-LABEL: max_i32_varying:
3458; GFX1064:       ; %bb.0: ; %entry
3459; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3460; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3461; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3462; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
3463; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3464; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3465; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3466; GFX1064-NEXT:    s_not_b64 exec, exec
3467; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3468; GFX1064-NEXT:    s_not_b64 exec, exec
3469; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3470; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3471; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3472; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3473; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3474; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3475; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3476; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3477; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3478; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3479; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3480; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3481; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3482; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3483; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3484; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3485; GFX1064-NEXT:    s_mov_b32 s2, -1
3486; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3487; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3488; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3489; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3490; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3491; GFX1064-NEXT:    ; implicit-def: $vgpr0
3492; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3493; GFX1064-NEXT:    s_cbranch_execz BB17_2
3494; GFX1064-NEXT:  ; %bb.1:
3495; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3496; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
3497; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3498; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3499; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3500; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3501; GFX1064-NEXT:    buffer_gl0_inv
3502; GFX1064-NEXT:    buffer_gl1_inv
3503; GFX1064-NEXT:  BB17_2:
3504; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3505; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3506; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3507; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3508; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3509; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3510; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3511; GFX1064-NEXT:    s_nop 0
3512; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3513; GFX1064-NEXT:    s_endpgm
3514;
3515; GFX1032-LABEL: max_i32_varying:
3516; GFX1032:       ; %bb.0: ; %entry
3517; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3518; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3519; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3520; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3521; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3522; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3523; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3524; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3525; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3526; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3527; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3528; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3529; GFX1032-NEXT:    s_mov_b32 s2, -1
3530; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3531; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3532; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3533; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3534; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3535; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3536; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3537; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3538; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3539; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3540; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3541; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3542; GFX1032-NEXT:    ; implicit-def: $vgpr0
3543; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3544; GFX1032-NEXT:    s_cbranch_execz BB17_2
3545; GFX1032-NEXT:  ; %bb.1:
3546; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3547; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
3548; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3549; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3550; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3551; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3552; GFX1032-NEXT:    buffer_gl0_inv
3553; GFX1032-NEXT:    buffer_gl1_inv
3554; GFX1032-NEXT:  BB17_2:
3555; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3556; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3557; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3558; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3559; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3560; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3561; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3562; GFX1032-NEXT:    s_nop 0
3563; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3564; GFX1032-NEXT:    s_endpgm
3565entry:
3566  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3567  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3568  store i32 %old, i32 addrspace(1)* %out
3569  ret void
3570}
3571
; max_i64_constant: signed 64-bit `atomicrmw max` of the constant 5 on the LDS
; (addrspace(3)) variable @local_var64 with acq_rel ordering; the value returned
; by the atomic is stored to %out.
; In every checked configuration only the lane whose mbcnt result is 0 issues
; ds_max_rtn_i64; the result is then read back with v_readfirstlane and combined
; per lane through v_cndmask / v_cmp_gt_i64 before the final store.
3572define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3573;
3574;
3575; GFX7LESS-LABEL: max_i64_constant:
3576; GFX7LESS:       ; %bb.0: ; %entry
3577; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3578; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3579; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3580; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3581; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3582; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3583; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3584; GFX7LESS-NEXT:  ; %bb.1:
3585; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3586; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3587; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3588; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3589; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3590; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3591; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3592; GFX7LESS-NEXT:    buffer_wbinvl1
3593; GFX7LESS-NEXT:  BB18_2:
3594; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3595; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3596; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3597; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3598; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3599; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3600; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3601; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3602; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3603; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3604; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3605; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3606; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3607; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3608; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3609; GFX7LESS-NEXT:    s_endpgm
3610;
3611; GFX8-LABEL: max_i64_constant:
3612; GFX8:       ; %bb.0: ; %entry
3613; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3614; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3615; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3616; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3617; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3618; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3619; GFX8-NEXT:    s_cbranch_execz BB18_2
3620; GFX8-NEXT:  ; %bb.1:
3621; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3622; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3623; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3624; GFX8-NEXT:    s_mov_b32 m0, -1
3625; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3626; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3627; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3628; GFX8-NEXT:    buffer_wbinvl1_vol
3629; GFX8-NEXT:  BB18_2:
3630; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3631; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3632; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3633; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3634; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3635; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3636; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3637; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3638; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3639; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3640; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3641; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3642; GFX8-NEXT:    s_mov_b32 s2, -1
3643; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3644; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3645; GFX8-NEXT:    s_endpgm
3646;
3647; GFX9-LABEL: max_i64_constant:
3648; GFX9:       ; %bb.0: ; %entry
3649; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3650; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3651; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3652; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3653; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3654; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3655; GFX9-NEXT:    s_cbranch_execz BB18_2
3656; GFX9-NEXT:  ; %bb.1:
3657; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3658; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3659; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3660; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3661; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3662; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3663; GFX9-NEXT:    buffer_wbinvl1_vol
3664; GFX9-NEXT:  BB18_2:
3665; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3666; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3667; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3668; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3669; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3670; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3671; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3672; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3673; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3674; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3675; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3676; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3677; GFX9-NEXT:    s_mov_b32 s2, -1
3678; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3679; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3680; GFX9-NEXT:    s_endpgm
3681;
3682; GFX1064-LABEL: max_i64_constant:
3683; GFX1064:       ; %bb.0: ; %entry
3684; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3685; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3686; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3687; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3688; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3689; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3690; GFX1064-NEXT:    s_cbranch_execz BB18_2
3691; GFX1064-NEXT:  ; %bb.1:
3692; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3693; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3694; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3695; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3696; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3697; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3698; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3699; GFX1064-NEXT:    buffer_gl0_inv
3700; GFX1064-NEXT:    buffer_gl1_inv
3701; GFX1064-NEXT:  BB18_2:
3702; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3703; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3704; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3705; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3706; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3707; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3708; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3709; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3710; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3711; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3712; GFX1064-NEXT:    s_mov_b32 s2, -1
3713; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3714; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3715; GFX1064-NEXT:    s_endpgm
3716;
3717; GFX1032-LABEL: max_i64_constant:
3718; GFX1032:       ; %bb.0: ; %entry
3719; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3720; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3721; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3722; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3723; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3724; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3725; GFX1032-NEXT:    s_cbranch_execz BB18_2
3726; GFX1032-NEXT:  ; %bb.1:
3727; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3728; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3729; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3730; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3731; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3732; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3733; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3734; GFX1032-NEXT:    buffer_gl0_inv
3735; GFX1032-NEXT:    buffer_gl1_inv
3736; GFX1032-NEXT:  BB18_2:
3737; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3738; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3739; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3740; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3741; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3742; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3743; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3744; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3745; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3746; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3747; GFX1032-NEXT:    s_mov_b32 s2, -1
3748; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3749; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3750; GFX1032-NEXT:    s_endpgm
3751entry:
  ; The atomic operand is the literal 5 rather than a per-lane value.
3752  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the value the atomic returned so the result handling is checked too.
3753  store i64 %old, i64 addrspace(1)* %out
3754  ret void
3755}
3756
; min_i32_varying: signed 32-bit `atomicrmw min` on the LDS (addrspace(3))
; variable @local_var32 with acq_rel ordering, using the work-item id as the
; operand; the value returned by the atomic is stored to %out.
; The run without -mcpu is checked to emit a plain per-lane ds_min_rtn_i32.
; The tonga, gfx900 and gfx1010 runs are checked to first reduce across lanes
; with v_min_i32_dpp (inactive lanes are filled with v_bfrev_b32 of -2, i.e.
; 0x7fffffff), issue one ds_min_rtn_i32 from the lane whose mbcnt result is 0,
; and finally v_min_i32 the v_readfirstlane'd result with a per-lane value.
3757define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3758;
3759;
3760; GFX7LESS-LABEL: min_i32_varying:
3761; GFX7LESS:       ; %bb.0: ; %entry
3762; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3763; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3764; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3765; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3766; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3767; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3768; GFX7LESS-NEXT:    buffer_wbinvl1
3769; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3770; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3771; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3772; GFX7LESS-NEXT:    s_endpgm
3773;
3774; GFX8-LABEL: min_i32_varying:
3775; GFX8:       ; %bb.0: ; %entry
3776; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3777; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3778; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3779; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3780; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3781; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3782; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3783; GFX8-NEXT:    s_not_b64 exec, exec
3784; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3785; GFX8-NEXT:    s_not_b64 exec, exec
3786; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3787; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3788; GFX8-NEXT:    s_nop 1
3789; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3790; GFX8-NEXT:    s_nop 1
3791; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3792; GFX8-NEXT:    s_nop 1
3793; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3794; GFX8-NEXT:    s_nop 1
3795; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3796; GFX8-NEXT:    s_nop 1
3797; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3798; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3799; GFX8-NEXT:    s_nop 0
3800; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3801; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3802; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3803; GFX8-NEXT:    ; implicit-def: $vgpr0
3804; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3805; GFX8-NEXT:    s_cbranch_execz BB19_2
3806; GFX8-NEXT:  ; %bb.1:
3807; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3808; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3809; GFX8-NEXT:    s_mov_b32 m0, -1
3810; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3811; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3812; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3813; GFX8-NEXT:    buffer_wbinvl1_vol
3814; GFX8-NEXT:  BB19_2:
3815; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3816; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3817; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3818; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3819; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3820; GFX8-NEXT:    s_mov_b32 s2, -1
3821; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3822; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3823; GFX8-NEXT:    s_endpgm
3824;
3825; GFX9-LABEL: min_i32_varying:
3826; GFX9:       ; %bb.0: ; %entry
3827; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3828; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3829; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3830; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3831; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3832; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3833; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3834; GFX9-NEXT:    s_not_b64 exec, exec
3835; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3836; GFX9-NEXT:    s_not_b64 exec, exec
3837; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3838; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3839; GFX9-NEXT:    s_nop 1
3840; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3841; GFX9-NEXT:    s_nop 1
3842; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3843; GFX9-NEXT:    s_nop 1
3844; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3845; GFX9-NEXT:    s_nop 1
3846; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3847; GFX9-NEXT:    s_nop 1
3848; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3849; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3850; GFX9-NEXT:    s_nop 0
3851; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3852; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3853; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3854; GFX9-NEXT:    ; implicit-def: $vgpr0
3855; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3856; GFX9-NEXT:    s_cbranch_execz BB19_2
3857; GFX9-NEXT:  ; %bb.1:
3858; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3859; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3860; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3861; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3862; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3863; GFX9-NEXT:    buffer_wbinvl1_vol
3864; GFX9-NEXT:  BB19_2:
3865; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3866; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3867; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3868; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3869; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3870; GFX9-NEXT:    s_mov_b32 s2, -1
3871; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3872; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3873; GFX9-NEXT:    s_endpgm
3874;
3875; GFX1064-LABEL: min_i32_varying:
3876; GFX1064:       ; %bb.0: ; %entry
3877; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3878; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3879; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3880; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
3881; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3882; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3883; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3884; GFX1064-NEXT:    s_not_b64 exec, exec
3885; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3886; GFX1064-NEXT:    s_not_b64 exec, exec
3887; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3888; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3889; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3890; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3891; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3892; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3893; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3894; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3895; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3896; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3897; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3898; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3899; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3900; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3901; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3902; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3903; GFX1064-NEXT:    s_mov_b32 s2, -1
3904; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3905; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3906; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3907; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3908; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3909; GFX1064-NEXT:    ; implicit-def: $vgpr0
3910; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3911; GFX1064-NEXT:    s_cbranch_execz BB19_2
3912; GFX1064-NEXT:  ; %bb.1:
3913; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3914; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
3915; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3916; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3917; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3918; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3919; GFX1064-NEXT:    buffer_gl0_inv
3920; GFX1064-NEXT:    buffer_gl1_inv
3921; GFX1064-NEXT:  BB19_2:
3922; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3923; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3924; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3925; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3926; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3927; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3928; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3929; GFX1064-NEXT:    s_nop 0
3930; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3931; GFX1064-NEXT:    s_endpgm
3932;
3933; GFX1032-LABEL: min_i32_varying:
3934; GFX1032:       ; %bb.0: ; %entry
3935; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3936; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3937; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3938; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3939; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3940; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3941; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3942; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3943; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3944; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3945; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3946; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3947; GFX1032-NEXT:    s_mov_b32 s2, -1
3948; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3949; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3950; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3951; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3952; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3953; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3954; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3955; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3956; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3957; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3958; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3959; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3960; GFX1032-NEXT:    ; implicit-def: $vgpr0
3961; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3962; GFX1032-NEXT:    s_cbranch_execz BB19_2
3963; GFX1032-NEXT:  ; %bb.1:
3964; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3965; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
3966; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3967; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3968; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
3969; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3970; GFX1032-NEXT:    buffer_gl0_inv
3971; GFX1032-NEXT:    buffer_gl1_inv
3972; GFX1032-NEXT:  BB19_2:
3973; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3974; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3975; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3976; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3977; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3978; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3979; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3980; GFX1032-NEXT:    s_nop 0
3981; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3982; GFX1032-NEXT:    s_endpgm
3983entry:
  ; The work-item id makes the atomic operand differ from lane to lane.
3984  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3985  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the value the atomic returned so the result handling is checked too.
3986  store i32 %old, i32 addrspace(1)* %out
3987  ret void
3988}
3989
3990define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3991;
3992;
3993; GFX7LESS-LABEL: min_i64_constant:
3994; GFX7LESS:       ; %bb.0: ; %entry
3995; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3996; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3997; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3998; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3999; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4000; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4001; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
4002; GFX7LESS-NEXT:  ; %bb.1:
4003; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4004; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4005; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4006; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4007; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4008; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4009; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4010; GFX7LESS-NEXT:    buffer_wbinvl1
4011; GFX7LESS-NEXT:  BB20_2:
4012; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4013; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4014; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4015; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
4016; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4017; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4018; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4019; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4020; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4021; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4022; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4023; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4024; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4025; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4026; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4027; GFX7LESS-NEXT:    s_endpgm
4028;
4029; GFX8-LABEL: min_i64_constant:
4030; GFX8:       ; %bb.0: ; %entry
4031; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4032; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4033; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4034; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4035; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4036; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4037; GFX8-NEXT:    s_cbranch_execz BB20_2
4038; GFX8-NEXT:  ; %bb.1:
4039; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4040; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4041; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4042; GFX8-NEXT:    s_mov_b32 m0, -1
4043; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4044; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4045; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4046; GFX8-NEXT:    buffer_wbinvl1_vol
4047; GFX8-NEXT:  BB20_2:
4048; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4049; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4050; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
4051; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4052; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4053; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4054; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4055; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4056; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4057; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4058; GFX8-NEXT:    s_mov_b32 s2, -1
4059; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4060; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4061; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4063; GFX8-NEXT:    s_endpgm
4064;
4065; GFX9-LABEL: min_i64_constant:
4066; GFX9:       ; %bb.0: ; %entry
4067; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4068; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4069; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4070; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4071; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4072; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4073; GFX9-NEXT:    s_cbranch_execz BB20_2
4074; GFX9-NEXT:  ; %bb.1:
4075; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4076; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4077; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4078; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4079; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4080; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4081; GFX9-NEXT:    buffer_wbinvl1_vol
4082; GFX9-NEXT:  BB20_2:
4083; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4084; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4085; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4086; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4087; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4088; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4089; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4090; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4091; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4092; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4093; GFX9-NEXT:    s_mov_b32 s2, -1
4094; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4095; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4096; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4097; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4098; GFX9-NEXT:    s_endpgm
4099;
4100; GFX1064-LABEL: min_i64_constant:
4101; GFX1064:       ; %bb.0: ; %entry
4102; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4103; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4104; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4105; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4106; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4107; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4108; GFX1064-NEXT:    s_cbranch_execz BB20_2
4109; GFX1064-NEXT:  ; %bb.1:
4110; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4111; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4112; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4113; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4114; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4115; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4116; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4117; GFX1064-NEXT:    buffer_gl0_inv
4118; GFX1064-NEXT:    buffer_gl1_inv
4119; GFX1064-NEXT:  BB20_2:
4120; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4121; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4122; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4123; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4124; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4125; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4126; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
4127; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4128; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4129; GFX1064-NEXT:    s_mov_b32 s2, -1
4130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4131; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4132; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4133; GFX1064-NEXT:    s_endpgm
4134;
4135; GFX1032-LABEL: min_i64_constant:
4136; GFX1032:       ; %bb.0: ; %entry
4137; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4138; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4139; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4140; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4141; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4142; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4143; GFX1032-NEXT:    s_cbranch_execz BB20_2
4144; GFX1032-NEXT:  ; %bb.1:
4145; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4146; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4147; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4148; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4149; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4150; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4151; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4152; GFX1032-NEXT:    buffer_gl0_inv
4153; GFX1032-NEXT:    buffer_gl1_inv
4154; GFX1032-NEXT:  BB20_2:
4155; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4156; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4157; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4158; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4159; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4160; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4161; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
4162; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4163; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4164; GFX1032-NEXT:    s_mov_b32 s2, -1
4165; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4166; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4167; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4168; GFX1032-NEXT:    s_endpgm
4169entry:
4170  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4171  store i64 %old, i64 addrspace(1)* %out
4172  ret void
4173}
4174
; umax_i32_varying
; Performs an acq_rel atomicrmw umax on @local_var32 whose operand is the
; workitem id (a value that differs per lane) and stores the returned old
; value to %out.
; The expected tonga, gfx900 and gfx1010 output first writes 0 into the lanes
; that are not active, runs a cross-lane v_max_u32 DPP scan, lets only the
; lane whose mbcnt result is 0 issue a single ds_max_rtn_u32 using the value
; read from the last lane of the scan, and finally combines the
; v_readfirstlane'd result with each lane's shifted scan value via v_max_u32.
; The expected output for the run without -mcpu issues ds_max_rtn_u32
; directly, with no scan.
4175define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
4176;
4177;
4178; GFX7LESS-LABEL: umax_i32_varying:
4179; GFX7LESS:       ; %bb.0: ; %entry
4180; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4181; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4182; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4183; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4184; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4185; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4186; GFX7LESS-NEXT:    buffer_wbinvl1
4187; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4188; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4189; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4190; GFX7LESS-NEXT:    s_endpgm
4191;
4192; GFX8-LABEL: umax_i32_varying:
4193; GFX8:       ; %bb.0: ; %entry
4194; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4195; GFX8-NEXT:    s_mov_b64 s[2:3], exec
4196; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4197; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4198; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4199; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4200; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4201; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4202; GFX8-NEXT:    s_not_b64 exec, exec
4203; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4204; GFX8-NEXT:    s_not_b64 exec, exec
4205; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4206; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4207; GFX8-NEXT:    s_nop 1
4208; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4209; GFX8-NEXT:    s_nop 1
4210; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4211; GFX8-NEXT:    s_nop 1
4212; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4213; GFX8-NEXT:    s_nop 1
4214; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4215; GFX8-NEXT:    s_nop 1
4216; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4217; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4218; GFX8-NEXT:    s_nop 0
4219; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4220; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4221; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4222; GFX8-NEXT:    ; implicit-def: $vgpr0
4223; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4224; GFX8-NEXT:    s_cbranch_execz BB21_2
4225; GFX8-NEXT:  ; %bb.1:
4226; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4227; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4228; GFX8-NEXT:    s_mov_b32 m0, -1
4229; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4230; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4232; GFX8-NEXT:    buffer_wbinvl1_vol
4233; GFX8-NEXT:  BB21_2:
4234; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4235; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4236; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4237; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4238; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4239; GFX8-NEXT:    s_mov_b32 s2, -1
4240; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4241; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4242; GFX8-NEXT:    s_endpgm
4243;
4244; GFX9-LABEL: umax_i32_varying:
4245; GFX9:       ; %bb.0: ; %entry
4246; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4247; GFX9-NEXT:    s_mov_b64 s[2:3], exec
4248; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4249; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4250; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4251; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4252; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4253; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4254; GFX9-NEXT:    s_not_b64 exec, exec
4255; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4256; GFX9-NEXT:    s_not_b64 exec, exec
4257; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4258; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4259; GFX9-NEXT:    s_nop 1
4260; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4261; GFX9-NEXT:    s_nop 1
4262; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4263; GFX9-NEXT:    s_nop 1
4264; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4265; GFX9-NEXT:    s_nop 1
4266; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4267; GFX9-NEXT:    s_nop 1
4268; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4269; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4270; GFX9-NEXT:    s_nop 0
4271; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4272; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4273; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4274; GFX9-NEXT:    ; implicit-def: $vgpr0
4275; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4276; GFX9-NEXT:    s_cbranch_execz BB21_2
4277; GFX9-NEXT:  ; %bb.1:
4278; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4279; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4280; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4281; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4282; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4283; GFX9-NEXT:    buffer_wbinvl1_vol
4284; GFX9-NEXT:  BB21_2:
4285; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4286; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4287; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4288; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4289; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4290; GFX9-NEXT:    s_mov_b32 s2, -1
4291; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4292; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4293; GFX9-NEXT:    s_endpgm
4294;
4295; GFX1064-LABEL: umax_i32_varying:
4296; GFX1064:       ; %bb.0: ; %entry
4297; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4298; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
4299; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4300; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4301; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4302; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4303; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4304; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
4305; GFX1064-NEXT:    s_not_b64 exec, exec
4306; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4307; GFX1064-NEXT:    s_not_b64 exec, exec
4308; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4309; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4310; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4311; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4312; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4313; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4314; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4315; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4316; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4317; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4318; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4319; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4320; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4321; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4322; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4323; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4324; GFX1064-NEXT:    s_mov_b32 s2, -1
4325; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4326; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4327; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4328; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4329; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4330; GFX1064-NEXT:    ; implicit-def: $vgpr0
4331; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4332; GFX1064-NEXT:    s_cbranch_execz BB21_2
4333; GFX1064-NEXT:  ; %bb.1:
4334; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4335; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
4336; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4337; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4338; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4339; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4340; GFX1064-NEXT:    buffer_gl0_inv
4341; GFX1064-NEXT:    buffer_gl1_inv
4342; GFX1064-NEXT:  BB21_2:
4343; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4344; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4345; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4346; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4347; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4348; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4349; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4350; GFX1064-NEXT:    s_nop 0
4351; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4352; GFX1064-NEXT:    s_endpgm
4353;
4354; GFX1032-LABEL: umax_i32_varying:
4355; GFX1032:       ; %bb.0: ; %entry
4356; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4357; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
4358; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4359; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4360; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
4361; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4362; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
4363; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4364; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4365; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4366; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4367; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4368; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4369; GFX1032-NEXT:    s_mov_b32 s2, -1
4370; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4371; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4372; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4373; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4374; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4375; GFX1032-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4376; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4377; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4378; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4379; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4380; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4381; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4382; GFX1032-NEXT:    ; implicit-def: $vgpr0
4383; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4384; GFX1032-NEXT:    s_cbranch_execz BB21_2
4385; GFX1032-NEXT:  ; %bb.1:
4386; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4387; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
4388; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4389; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4390; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4391; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4392; GFX1032-NEXT:    buffer_gl0_inv
4393; GFX1032-NEXT:    buffer_gl1_inv
4394; GFX1032-NEXT:  BB21_2:
4395; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4396; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4397; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4398; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4399; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4400; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4401; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4402; GFX1032-NEXT:    s_nop 0
4403; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4404; GFX1032-NEXT:    s_endpgm
4405entry:
4406  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4407  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4408  store i32 %old, i32 addrspace(1)* %out
4409  ret void
4410}
4411
; umax_i64_constant
; Performs an acq_rel atomicrmw umax on @local_var64 with the constant
; operand 5 and stores the returned old value to %out.
; In the expected output of every run, only the lane whose mbcnt result is 0
; issues a single ds_max_rtn_u64 with the value 5. After the branch rejoins,
; the result is read with v_readfirstlane and compared (v_cmp_gt_u64) against
; a per-lane value that v_cndmask selects as either 5 or 0 from the earlier
; mbcnt comparison, and the larger of the two is stored.
4412define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4413;
4414;
4415; GFX7LESS-LABEL: umax_i64_constant:
4416; GFX7LESS:       ; %bb.0: ; %entry
4417; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4418; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4419; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4420; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4421; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4422; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4423; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4424; GFX7LESS-NEXT:  ; %bb.1:
4425; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4426; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4427; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4428; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4429; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4430; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4431; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4432; GFX7LESS-NEXT:    buffer_wbinvl1
4433; GFX7LESS-NEXT:  BB22_2:
4434; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4435; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4436; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4437; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4438; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4439; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4440; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4441; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4442; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4443; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4444; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4445; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4446; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4447; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4448; GFX7LESS-NEXT:    s_endpgm
4449;
4450; GFX8-LABEL: umax_i64_constant:
4451; GFX8:       ; %bb.0: ; %entry
4452; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4453; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4454; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4455; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4456; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4457; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4458; GFX8-NEXT:    s_cbranch_execz BB22_2
4459; GFX8-NEXT:  ; %bb.1:
4460; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4461; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4462; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4463; GFX8-NEXT:    s_mov_b32 m0, -1
4464; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4465; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4466; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4467; GFX8-NEXT:    buffer_wbinvl1_vol
4468; GFX8-NEXT:  BB22_2:
4469; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4470; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4471; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4472; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4473; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4474; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4475; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4476; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4477; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4478; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4479; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4480; GFX8-NEXT:    s_mov_b32 s2, -1
4481; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4482; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4483; GFX8-NEXT:    s_endpgm
4484;
4485; GFX9-LABEL: umax_i64_constant:
4486; GFX9:       ; %bb.0: ; %entry
4487; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4488; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4489; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4490; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4491; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4492; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4493; GFX9-NEXT:    s_cbranch_execz BB22_2
4494; GFX9-NEXT:  ; %bb.1:
4495; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4496; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4497; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4498; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4499; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4500; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4501; GFX9-NEXT:    buffer_wbinvl1_vol
4502; GFX9-NEXT:  BB22_2:
4503; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4504; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4505; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4506; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4507; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4508; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4509; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4510; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4511; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4512; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4513; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4514; GFX9-NEXT:    s_mov_b32 s2, -1
4515; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4516; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4517; GFX9-NEXT:    s_endpgm
4518;
4519; GFX1064-LABEL: umax_i64_constant:
4520; GFX1064:       ; %bb.0: ; %entry
4521; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4522; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4523; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4524; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4525; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4526; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4527; GFX1064-NEXT:    s_cbranch_execz BB22_2
4528; GFX1064-NEXT:  ; %bb.1:
4529; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4530; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4531; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4532; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4533; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4534; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4535; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4536; GFX1064-NEXT:    buffer_gl0_inv
4537; GFX1064-NEXT:    buffer_gl1_inv
4538; GFX1064-NEXT:  BB22_2:
4539; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4540; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4541; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4542; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4543; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4544; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4545; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4546; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4547; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4548; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4549; GFX1064-NEXT:    s_mov_b32 s2, -1
4550; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4551; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4552; GFX1064-NEXT:    s_endpgm
4553;
4554; GFX1032-LABEL: umax_i64_constant:
4555; GFX1032:       ; %bb.0: ; %entry
4556; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4557; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4558; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4559; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4560; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4561; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4562; GFX1032-NEXT:    s_cbranch_execz BB22_2
4563; GFX1032-NEXT:  ; %bb.1:
4564; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4565; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4566; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4567; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4568; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4569; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4570; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4571; GFX1032-NEXT:    buffer_gl0_inv
4572; GFX1032-NEXT:    buffer_gl1_inv
4573; GFX1032-NEXT:  BB22_2:
4574; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4575; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4576; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4577; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4578; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4579; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4580; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4581; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4582; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4583; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4584; GFX1032-NEXT:    s_mov_b32 s2, -1
4585; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4586; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4587; GFX1032-NEXT:    s_endpgm
4588entry:
4589  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4590  store i64 %old, i64 addrspace(1)* %out
4591  ret void
4592}
4593
; umin_i32_varying
; Performs an acq_rel atomicrmw umin on @local_var32 whose operand is the
; workitem id (a value that differs per lane) and stores the returned old
; value to %out.
; The expected tonga, gfx900 and gfx1010 output first writes -1 into the
; lanes that are not active, runs a cross-lane v_min_u32 DPP scan, lets only
; the lane whose mbcnt result is 0 issue a single ds_min_rtn_u32 using the
; value read from the last lane of the scan, and finally combines the
; v_readfirstlane'd result with each lane's shifted scan value via v_min_u32.
; The expected output for the run without -mcpu issues ds_min_rtn_u32
; directly, with no scan.
4594define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4595;
4596;
4597; GFX7LESS-LABEL: umin_i32_varying:
4598; GFX7LESS:       ; %bb.0: ; %entry
4599; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4600; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4601; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4602; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4603; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4604; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4605; GFX7LESS-NEXT:    buffer_wbinvl1
4606; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4607; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4608; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4609; GFX7LESS-NEXT:    s_endpgm
4610;
4611; GFX8-LABEL: umin_i32_varying:
4612; GFX8:       ; %bb.0: ; %entry
4613; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4614; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4615; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4616; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4617; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4618; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4619; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4620; GFX8-NEXT:    s_not_b64 exec, exec
4621; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4622; GFX8-NEXT:    s_not_b64 exec, exec
4623; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4624; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4625; GFX8-NEXT:    s_nop 1
4626; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4627; GFX8-NEXT:    s_nop 1
4628; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4629; GFX8-NEXT:    s_nop 1
4630; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4631; GFX8-NEXT:    s_nop 1
4632; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4633; GFX8-NEXT:    s_nop 1
4634; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4635; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4636; GFX8-NEXT:    s_nop 0
4637; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4638; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4639; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4640; GFX8-NEXT:    ; implicit-def: $vgpr0
4641; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4642; GFX8-NEXT:    s_cbranch_execz BB23_2
4643; GFX8-NEXT:  ; %bb.1:
4644; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4645; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4646; GFX8-NEXT:    s_mov_b32 m0, -1
4647; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4648; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4649; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4650; GFX8-NEXT:    buffer_wbinvl1_vol
4651; GFX8-NEXT:  BB23_2:
4652; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4653; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4654; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4655; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4656; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4657; GFX8-NEXT:    s_mov_b32 s2, -1
4658; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4659; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4660; GFX8-NEXT:    s_endpgm
4661;
4662; GFX9-LABEL: umin_i32_varying:
4663; GFX9:       ; %bb.0: ; %entry
4664; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4665; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4666; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4667; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4668; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4669; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4670; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4671; GFX9-NEXT:    s_not_b64 exec, exec
4672; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4673; GFX9-NEXT:    s_not_b64 exec, exec
4674; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4675; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4676; GFX9-NEXT:    s_nop 1
4677; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4678; GFX9-NEXT:    s_nop 1
4679; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4680; GFX9-NEXT:    s_nop 1
4681; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4682; GFX9-NEXT:    s_nop 1
4683; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4684; GFX9-NEXT:    s_nop 1
4685; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4686; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4687; GFX9-NEXT:    s_nop 0
4688; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4689; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4690; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4691; GFX9-NEXT:    ; implicit-def: $vgpr0
4692; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4693; GFX9-NEXT:    s_cbranch_execz BB23_2
4694; GFX9-NEXT:  ; %bb.1:
4695; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4696; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4697; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4698; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4699; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4700; GFX9-NEXT:    buffer_wbinvl1_vol
4701; GFX9-NEXT:  BB23_2:
4702; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4703; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4704; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4705; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4706; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4707; GFX9-NEXT:    s_mov_b32 s2, -1
4708; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4709; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4710; GFX9-NEXT:    s_endpgm
4711;
4712; GFX1064-LABEL: umin_i32_varying:
4713; GFX1064:       ; %bb.0: ; %entry
4714; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4715; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
4716; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4717; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
4718; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4719; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4720; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4721; GFX1064-NEXT:    s_not_b64 exec, exec
4722; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
4723; GFX1064-NEXT:    s_not_b64 exec, exec
4724; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4725; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4726; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4727; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4728; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4729; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4730; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4731; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4732; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4733; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4734; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4735; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4736; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4737; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4738; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4739; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4740; GFX1064-NEXT:    s_mov_b32 s2, -1
4741; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4742; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4743; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4744; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4745; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4746; GFX1064-NEXT:    ; implicit-def: $vgpr0
4747; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4748; GFX1064-NEXT:    s_cbranch_execz BB23_2
4749; GFX1064-NEXT:  ; %bb.1:
4750; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4751; GFX1064-NEXT:    v_mov_b32_e32 v4, s3
4752; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4753; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4754; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4755; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4756; GFX1064-NEXT:    buffer_gl0_inv
4757; GFX1064-NEXT:    buffer_gl1_inv
4758; GFX1064-NEXT:  BB23_2:
4759; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4760; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4761; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4762; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4763; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4764; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4765; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4766; GFX1064-NEXT:    s_nop 0
4767; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4768; GFX1064-NEXT:    s_endpgm
4769;
4770; GFX1032-LABEL: umin_i32_varying:
4771; GFX1032:       ; %bb.0: ; %entry
4772; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4773; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
4774; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4775; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4776; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4777; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4778; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4779; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4780; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
4781; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4782; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4783; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4784; GFX1032-NEXT:    s_mov_b32 s2, -1
4785; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4786; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4787; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4788; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4789; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4790; GFX1032-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4791; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4792; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4793; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4794; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4795; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4796; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4797; GFX1032-NEXT:    ; implicit-def: $vgpr0
4798; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4799; GFX1032-NEXT:    s_cbranch_execz BB23_2
4800; GFX1032-NEXT:  ; %bb.1:
4801; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4802; GFX1032-NEXT:    v_mov_b32_e32 v4, s3
4803; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4804; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4805; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4806; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4807; GFX1032-NEXT:    buffer_gl0_inv
4808; GFX1032-NEXT:    buffer_gl1_inv
4809; GFX1032-NEXT:  BB23_2:
4810; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4811; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4812; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4813; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4814; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4815; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4816; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4817; GFX1032-NEXT:    s_nop 0
4818; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4819; GFX1032-NEXT:    s_endpgm
4820entry:
4821  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4822  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4823  store i32 %old, i32 addrspace(1)* %out
4824  ret void
4825}
4826
; umin_i64_constant: 64-bit unsigned-min atomic on a local (addrspace(3))
; variable with a constant operand.
;
; The kernel performs `atomicrmw umin` of the constant 5 on @local_var64 with
; acq_rel ordering and stores the returned old value to %out.
;
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. For every RUN configuration they pin the same shape:
;   * v_mbcnt_*, a v_cmp_eq_u32 against 0 and s_and_saveexec guard a single
;     ds_min_rtn_u64 in %bb.1;
;   * after exec is restored at BB24_2, v_readfirstlane copies the returned
;     register pair into SGPRs and a v_cndmask / v_cmp_lt_u64 / v_cndmask
;     sequence selects the 64-bit value that buffer_store_dwordx2 writes out.
; The wavefrontsize32 run uses only v_mbcnt_lo and the 32-bit exec forms; the
; other runs use the lo/hi mbcnt pair and 64-bit exec. Only the two oldest
; targets write m0 before the DS instruction.
4827define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4828;
4829;
4830; GFX7LESS-LABEL: umin_i64_constant:
4831; GFX7LESS:       ; %bb.0: ; %entry
4832; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4833; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4834; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4835; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4836; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4837; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4838; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4839; GFX7LESS-NEXT:  ; %bb.1:
4840; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4841; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4842; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4843; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4844; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4845; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4846; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4847; GFX7LESS-NEXT:    buffer_wbinvl1
4848; GFX7LESS-NEXT:  BB24_2:
4849; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4850; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4851; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4852; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4853; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4854; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4855; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4856; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4857; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4858; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4859; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4860; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4861; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4862; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4863; GFX7LESS-NEXT:    s_endpgm
4864;
4865; GFX8-LABEL: umin_i64_constant:
4866; GFX8:       ; %bb.0: ; %entry
4867; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4868; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4869; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4870; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4871; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4872; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4873; GFX8-NEXT:    s_cbranch_execz BB24_2
4874; GFX8-NEXT:  ; %bb.1:
4875; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4876; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4877; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4878; GFX8-NEXT:    s_mov_b32 m0, -1
4879; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4880; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4881; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4882; GFX8-NEXT:    buffer_wbinvl1_vol
4883; GFX8-NEXT:  BB24_2:
4884; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4885; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4886; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4887; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4888; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4889; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4890; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4891; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4892; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4893; GFX8-NEXT:    s_mov_b32 s2, -1
4894; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4895; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4896; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4897; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4898; GFX8-NEXT:    s_endpgm
4899;
4900; GFX9-LABEL: umin_i64_constant:
4901; GFX9:       ; %bb.0: ; %entry
4902; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4903; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4904; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4905; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4906; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4907; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4908; GFX9-NEXT:    s_cbranch_execz BB24_2
4909; GFX9-NEXT:  ; %bb.1:
4910; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4911; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4912; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4913; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4914; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4915; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4916; GFX9-NEXT:    buffer_wbinvl1_vol
4917; GFX9-NEXT:  BB24_2:
4918; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4919; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4920; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4921; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4922; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4923; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4924; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4925; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4926; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4927; GFX9-NEXT:    s_mov_b32 s2, -1
4928; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4929; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4930; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4931; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4932; GFX9-NEXT:    s_endpgm
4933;
4934; GFX1064-LABEL: umin_i64_constant:
4935; GFX1064:       ; %bb.0: ; %entry
4936; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4937; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4938; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4939; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4940; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4941; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4942; GFX1064-NEXT:    s_cbranch_execz BB24_2
4943; GFX1064-NEXT:  ; %bb.1:
4944; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4945; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4946; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4947; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4948; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4949; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4950; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4951; GFX1064-NEXT:    buffer_gl0_inv
4952; GFX1064-NEXT:    buffer_gl1_inv
4953; GFX1064-NEXT:  BB24_2:
4954; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4955; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4956; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4957; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4958; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4959; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4960; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4961; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4962; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4963; GFX1064-NEXT:    s_mov_b32 s2, -1
4964; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4965; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4966; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4967; GFX1064-NEXT:    s_endpgm
4968;
4969; GFX1032-LABEL: umin_i64_constant:
4970; GFX1032:       ; %bb.0: ; %entry
4971; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4972; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4973; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4974; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4975; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4976; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4977; GFX1032-NEXT:    s_cbranch_execz BB24_2
4978; GFX1032-NEXT:  ; %bb.1:
4979; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4980; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4981; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4982; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4983; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4984; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4985; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4986; GFX1032-NEXT:    buffer_gl0_inv
4987; GFX1032-NEXT:    buffer_gl1_inv
4988; GFX1032-NEXT:  BB24_2:
4989; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4990; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4991; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4992; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4993; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4994; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4995; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4996; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4997; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4998; GFX1032-NEXT:    s_mov_b32 s2, -1
4999; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5000; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5001; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5002; GFX1032-NEXT:    s_endpgm
5003entry:
  ; The operand is the literal 5 rather than a value derived from the
  ; workitem id, which is what distinguishes this test from the *_varying ones.
5004  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
5005  store i64 %old, i64 addrspace(1)* %out
5006  ret void
5007}
5008