1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsic that yields the calling work-item's X id. The add_i32_varying test
; uses its result as the atomic operand so that each lane adds a different value.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; Globals in addrspace(3) (the "local" pointers mentioned below).
; @local_var32 is the target of every i32 atomicrmw in the tests that follow.
; @local_var64 is not referenced by the i32 tests; presumably it serves 64-bit
; variants elsewhere in the file -- TODO confirm.
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; Test: atomic add of a compile-time constant.
;
; The kernel atomically adds the literal 5 to @local_var32 with acq_rel
; ordering and stores the value returned by the atomicrmw (the contents before
; the add) to %out.
;
; The expected assembly for every RUN configuration has the same shape: a
; single ds_add_rtn_u32 is issued under s_and_saveexec by the lane whose mbcnt
; result is 0, with the operand s_bcnt1(saved exec) * 5. After exec is restored
; the value stored is readfirstlane(result) + mbcnt * 5.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing them by hand.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
31; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
32; GFX7LESS-NEXT:    s_mov_b32 m0, -1
33; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
34; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
35; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
36; GFX7LESS-NEXT:    buffer_wbinvl1
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
40; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
41; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
42; GFX7LESS-NEXT:    s_mov_b32 s2, -1
43; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
61; GFX8-NEXT:    s_mov_b32 m0, -1
62; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
63; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
64; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX8-NEXT:    buffer_wbinvl1_vol
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
69; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
70; GFX8-NEXT:    s_mov_b32 s3, 0xf000
71; GFX8-NEXT:    s_mov_b32 s2, -1
72; GFX8-NEXT:    s_nop 1
73; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
91; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
92; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
93; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
94; GFX9-NEXT:    buffer_wbinvl1_vol
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
98; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
99; GFX9-NEXT:    s_mov_b32 s3, 0xf000
100; GFX9-NEXT:    s_mov_b32 s2, -1
101; GFX9-NEXT:    s_nop 1
102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
109; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
119; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
120; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
121; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    buffer_gl0_inv
125; GFX1064-NEXT:    buffer_gl1_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    v_nop
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_nop 1
134; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
136; GFX1064-NEXT:    s_endpgm
137;
138; GFX1032-LABEL: add_i32_constant:
139; GFX1032:       ; %bb.0: ; %entry
140; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
141; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
142; GFX1032-NEXT:    ; implicit-def: $vcc_hi
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz BB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
150; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
151; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
152; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
153; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
155; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
156; GFX1032-NEXT:    buffer_gl0_inv
157; GFX1032-NEXT:    buffer_gl1_inv
158; GFX1032-NEXT:  BB0_2:
159; GFX1032-NEXT:    v_nop
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_nop 1
166; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
168; GFX1032-NEXT:    s_endpgm
169entry:
; %old is the value @local_var32 held before 5 was added to it.
170  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
171  store i32 %old, i32 addrspace(1)* %out
172  ret void
173}
174
; Test: atomic add of a kernel-argument operand.
;
; The kernel atomically adds the i32 kernel argument %additive to @local_var32
; with acq_rel ordering and stores the value returned by the atomicrmw (the
; contents before the add) to %out.
;
; In the expected assembly the operand is read with s_load_dword into an SGPR.
; One lane (mbcnt result == 0) issues a single ds_add_rtn_u32 whose operand is
; s_mul_i32(additive, s_bcnt1(saved exec)). After exec is restored the value
; stored is readfirstlane(result) + additive * mbcnt, computed with
; v_mul_lo_u32 followed by an add.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing them by hand.
175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
176;
177;
178; GFX7LESS-LABEL: add_i32_uniform:
179; GFX7LESS:       ; %bb.0: ; %entry
180; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
181; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
182; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
183; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
184; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
185; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
186; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
187; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
188; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
189; GFX7LESS-NEXT:  ; %bb.1:
190; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
191; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
193; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
194; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
195; GFX7LESS-NEXT:    s_mov_b32 m0, -1
196; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
197; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
198; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
199; GFX7LESS-NEXT:    buffer_wbinvl1
200; GFX7LESS-NEXT:  BB1_2:
201; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
202; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
203; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
205; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
206; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
207; GFX7LESS-NEXT:    s_mov_b32 s6, -1
208; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
209; GFX7LESS-NEXT:    s_endpgm
210;
211; GFX8-LABEL: add_i32_uniform:
212; GFX8:       ; %bb.0: ; %entry
213; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
214; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
215; GFX8-NEXT:    s_mov_b64 s[2:3], exec
216; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
217; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
218; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
219; GFX8-NEXT:    ; implicit-def: $vgpr1
220; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
221; GFX8-NEXT:    s_cbranch_execz BB1_2
222; GFX8-NEXT:  ; %bb.1:
223; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX8-NEXT:    s_mul_i32 s1, s0, s1
226; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
227; GFX8-NEXT:    v_mov_b32_e32 v2, s1
228; GFX8-NEXT:    s_mov_b32 m0, -1
229; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
230; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; GFX8-NEXT:    buffer_wbinvl1_vol
233; GFX8-NEXT:  BB1_2:
234; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
237; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
238; GFX8-NEXT:    s_mov_b32 s7, 0xf000
239; GFX8-NEXT:    s_mov_b32 s6, -1
240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
241; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
242; GFX8-NEXT:    s_endpgm
243;
244; GFX9-LABEL: add_i32_uniform:
245; GFX9:       ; %bb.0: ; %entry
246; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
247; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
248; GFX9-NEXT:    s_mov_b64 s[2:3], exec
249; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
250; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
252; GFX9-NEXT:    ; implicit-def: $vgpr1
253; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
254; GFX9-NEXT:    s_cbranch_execz BB1_2
255; GFX9-NEXT:  ; %bb.1:
256; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX9-NEXT:    s_mul_i32 s1, s0, s1
259; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
260; GFX9-NEXT:    v_mov_b32_e32 v2, s1
261; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
263; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    buffer_wbinvl1_vol
265; GFX9-NEXT:  BB1_2:
266; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
267; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
269; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
270; GFX9-NEXT:    s_mov_b32 s7, 0xf000
271; GFX9-NEXT:    s_mov_b32 s6, -1
272; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
273; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
274; GFX9-NEXT:    s_endpgm
275;
276; GFX1064-LABEL: add_i32_uniform:
277; GFX1064:       ; %bb.0: ; %entry
278; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
279; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
280; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
281; GFX1064-NEXT:    ; implicit-def: $vgpr1
282; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
283; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
284; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
285; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
286; GFX1064-NEXT:    s_cbranch_execz BB1_2
287; GFX1064-NEXT:  ; %bb.1:
288; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
289; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
290; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
292; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
293; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
294; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
295; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
296; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
297; GFX1064-NEXT:    buffer_gl0_inv
298; GFX1064-NEXT:    buffer_gl1_inv
299; GFX1064-NEXT:  BB1_2:
300; GFX1064-NEXT:    v_nop
301; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
302; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
304; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
305; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
306; GFX1064-NEXT:    s_mov_b32 s6, -1
307; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
308; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
309; GFX1064-NEXT:    s_endpgm
310;
311; GFX1032-LABEL: add_i32_uniform:
312; GFX1032:       ; %bb.0: ; %entry
313; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
314; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
315; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
316; GFX1032-NEXT:    ; implicit-def: $vcc_hi
317; GFX1032-NEXT:    ; implicit-def: $vgpr1
318; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
319; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
320; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
321; GFX1032-NEXT:    s_cbranch_execz BB1_2
322; GFX1032-NEXT:  ; %bb.1:
323; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
324; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
325; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
327; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
328; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
329; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
330; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
331; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
332; GFX1032-NEXT:    buffer_gl0_inv
333; GFX1032-NEXT:    buffer_gl1_inv
334; GFX1032-NEXT:  BB1_2:
335; GFX1032-NEXT:    v_nop
336; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
337; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
339; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
340; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
341; GFX1032-NEXT:    s_mov_b32 s6, -1
342; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
343; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
344; GFX1032-NEXT:    s_endpgm
345entry:
; %old is the value @local_var32 held before %additive was added to it.
346  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
347  store i32 %old, i32 addrspace(1)* %out
348  ret void
349}
350
; Test: atomic add of a per-work-item operand.
;
; The kernel atomically adds the result of llvm.amdgcn.workitem.id.x to
; @local_var32 with acq_rel ordering and stores the value returned by the
; atomicrmw (the contents before the add) to %out.
;
; Expected assembly, by RUN configuration:
;  * The first RUN line (no -mcpu) expects a plain ds_add_rtn_u32 with v0 as
;    the operand: no mbcnt, no saveexec branch, no DPP.
;  * The tonga and gfx900 runs expect a chain of v_add_u32_dpp steps
;    (row_shr:1/2/4/8, then row_bcast:15 and row_bcast:31). Lane 63 is read
;    with v_readlane as the operand of a single ds_add_rtn_u32, and a
;    wave_shr:1 v_mov_b32_dpp provides the per-lane value that is added to
;    readfirstlane(result).
;  * The gfx1010 runs expect the row_shr steps followed by v_permlanex16_b32
;    and v_readlane/v_writelane fix-ups in place of the row_bcast/wave_shr
;    forms. The wave64 run reads lane 63 for the DS operand, the wave32 run
;    reads lane 31.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing them by hand.
351define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
352;
353;
354; GFX7LESS-LABEL: add_i32_varying:
355; GFX7LESS:       ; %bb.0: ; %entry
356; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
357; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
358; GFX7LESS-NEXT:    s_mov_b32 m0, -1
359; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
360; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
361; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
362; GFX7LESS-NEXT:    buffer_wbinvl1
363; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
364; GFX7LESS-NEXT:    s_mov_b32 s2, -1
365; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
366; GFX7LESS-NEXT:    s_endpgm
367;
368; GFX8-LABEL: add_i32_varying:
369; GFX8:       ; %bb.0: ; %entry
370; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
371; GFX8-NEXT:    s_mov_b64 s[2:3], exec
372; GFX8-NEXT:    v_mov_b32_e32 v2, v0
373; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
374; GFX8-NEXT:    v_mov_b32_e32 v1, 0
375; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
376; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
377; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
378; GFX8-NEXT:    s_not_b64 exec, exec
379; GFX8-NEXT:    v_mov_b32_e32 v2, 0
380; GFX8-NEXT:    s_not_b64 exec, exec
381; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
382; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
383; GFX8-NEXT:    s_nop 1
384; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
385; GFX8-NEXT:    s_nop 1
386; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
387; GFX8-NEXT:    s_nop 1
388; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
389; GFX8-NEXT:    s_nop 1
390; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
391; GFX8-NEXT:    s_nop 1
392; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
393; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
394; GFX8-NEXT:    s_nop 0
395; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
396; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
397; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
398; GFX8-NEXT:    ; implicit-def: $vgpr0
399; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
400; GFX8-NEXT:    s_cbranch_execz BB2_2
401; GFX8-NEXT:  ; %bb.1:
402; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
403; GFX8-NEXT:    v_mov_b32_e32 v3, s2
404; GFX8-NEXT:    s_mov_b32 m0, -1
405; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
406; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
407; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
408; GFX8-NEXT:    buffer_wbinvl1_vol
409; GFX8-NEXT:  BB2_2:
410; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
411; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
412; GFX8-NEXT:    v_mov_b32_e32 v0, v1
413; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
414; GFX8-NEXT:    s_mov_b32 s3, 0xf000
415; GFX8-NEXT:    s_mov_b32 s2, -1
416; GFX8-NEXT:    s_nop 0
417; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
419; GFX8-NEXT:    s_endpgm
420;
421; GFX9-LABEL: add_i32_varying:
422; GFX9:       ; %bb.0: ; %entry
423; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
424; GFX9-NEXT:    s_mov_b64 s[2:3], exec
425; GFX9-NEXT:    v_mov_b32_e32 v2, v0
426; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
427; GFX9-NEXT:    v_mov_b32_e32 v1, 0
428; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
429; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
430; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
431; GFX9-NEXT:    s_not_b64 exec, exec
432; GFX9-NEXT:    v_mov_b32_e32 v2, 0
433; GFX9-NEXT:    s_not_b64 exec, exec
434; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
435; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
436; GFX9-NEXT:    s_nop 1
437; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
438; GFX9-NEXT:    s_nop 1
439; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
440; GFX9-NEXT:    s_nop 1
441; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
442; GFX9-NEXT:    s_nop 1
443; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
444; GFX9-NEXT:    s_nop 1
445; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
446; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
447; GFX9-NEXT:    s_nop 0
448; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
449; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
450; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
451; GFX9-NEXT:    ; implicit-def: $vgpr0
452; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
453; GFX9-NEXT:    s_cbranch_execz BB2_2
454; GFX9-NEXT:  ; %bb.1:
455; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
456; GFX9-NEXT:    v_mov_b32_e32 v3, s2
457; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
459; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
460; GFX9-NEXT:    buffer_wbinvl1_vol
461; GFX9-NEXT:  BB2_2:
462; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
463; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
464; GFX9-NEXT:    v_mov_b32_e32 v0, v1
465; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
466; GFX9-NEXT:    s_mov_b32 s3, 0xf000
467; GFX9-NEXT:    s_mov_b32 s2, -1
468; GFX9-NEXT:    s_nop 0
469; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
471; GFX9-NEXT:    s_endpgm
472;
473; GFX1064-LABEL: add_i32_varying:
474; GFX1064:       ; %bb.0: ; %entry
475; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
476; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
477; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
478; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
479; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
480; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
481; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
482; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
483; GFX1064-NEXT:    s_not_b64 exec, exec
484; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
485; GFX1064-NEXT:    s_not_b64 exec, exec
486; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
487; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
488; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
489; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
490; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
491; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
492; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
493; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
494; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
495; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
496; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
497; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
498; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
499; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
500; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
501; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
502; GFX1064-NEXT:    s_mov_b32 s2, -1
503; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
504; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
505; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
506; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
507; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
508; GFX1064-NEXT:    ; implicit-def: $vgpr0
509; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
510; GFX1064-NEXT:    s_cbranch_execz BB2_2
511; GFX1064-NEXT:  ; %bb.1:
512; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
513; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
514; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
515; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
516; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
517; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
518; GFX1064-NEXT:    buffer_gl0_inv
519; GFX1064-NEXT:    buffer_gl1_inv
520; GFX1064-NEXT:  BB2_2:
521; GFX1064-NEXT:    v_nop
522; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
523; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
524; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
525; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
526; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
527; GFX1064-NEXT:    s_nop 1
528; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
530; GFX1064-NEXT:    s_endpgm
531;
532; GFX1032-LABEL: add_i32_varying:
533; GFX1032:       ; %bb.0: ; %entry
534; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
535; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
536; GFX1032-NEXT:    ; implicit-def: $vcc_hi
537; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
538; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
539; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
540; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
541; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
542; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
543; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
544; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
545; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
546; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
547; GFX1032-NEXT:    s_mov_b32 s2, -1
548; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
549; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
550; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
551; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
552; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
553; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
554; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
555; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
556; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
557; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
558; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
559; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
560; GFX1032-NEXT:    ; implicit-def: $vgpr0
561; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
562; GFX1032-NEXT:    s_cbranch_execz BB2_2
563; GFX1032-NEXT:  ; %bb.1:
564; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
565; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
566; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
567; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
568; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
569; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; GFX1032-NEXT:    buffer_gl0_inv
571; GFX1032-NEXT:    buffer_gl1_inv
572; GFX1032-NEXT:  BB2_2:
573; GFX1032-NEXT:    v_nop
574; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
575; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
576; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
577; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
578; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
579; GFX1032-NEXT:    s_nop 1
580; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
581; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
582; GFX1032-NEXT:    s_endpgm
583entry:
; %lane differs between work-items, which is what makes the operand "varying".
584  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; %old is the value @local_var32 held before this work-item's %lane was added.
585  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
586  store i32 %old, i32 addrspace(1)* %out
587  ret void
588}
589
590define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
591;
592;
593; GFX7LESS-LABEL: add_i32_varying_gfx1032:
594; GFX7LESS:       ; %bb.0: ; %entry
595; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
596; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
597; GFX7LESS-NEXT:    s_mov_b32 m0, -1
598; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
599; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
600; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
601; GFX7LESS-NEXT:    buffer_wbinvl1
602; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
603; GFX7LESS-NEXT:    s_mov_b32 s2, -1
604; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
605; GFX7LESS-NEXT:    s_endpgm
606;
607; GFX8-LABEL: add_i32_varying_gfx1032:
608; GFX8:       ; %bb.0: ; %entry
609; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
610; GFX8-NEXT:    s_mov_b64 s[2:3], exec
611; GFX8-NEXT:    v_mov_b32_e32 v2, v0
612; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
613; GFX8-NEXT:    v_mov_b32_e32 v1, 0
614; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
615; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
616; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
617; GFX8-NEXT:    s_not_b64 exec, exec
618; GFX8-NEXT:    v_mov_b32_e32 v2, 0
619; GFX8-NEXT:    s_not_b64 exec, exec
620; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
621; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
622; GFX8-NEXT:    s_nop 1
623; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
624; GFX8-NEXT:    s_nop 1
625; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
626; GFX8-NEXT:    s_nop 1
627; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
628; GFX8-NEXT:    s_nop 1
629; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
630; GFX8-NEXT:    s_nop 1
631; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
632; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
633; GFX8-NEXT:    s_nop 0
634; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
635; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
636; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
637; GFX8-NEXT:    ; implicit-def: $vgpr0
638; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
639; GFX8-NEXT:    s_cbranch_execz BB3_2
640; GFX8-NEXT:  ; %bb.1:
641; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
642; GFX8-NEXT:    v_mov_b32_e32 v3, s2
643; GFX8-NEXT:    s_mov_b32 m0, -1
644; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
645; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
646; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
647; GFX8-NEXT:    buffer_wbinvl1_vol
648; GFX8-NEXT:  BB3_2:
649; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
650; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
651; GFX8-NEXT:    v_mov_b32_e32 v0, v1
652; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
653; GFX8-NEXT:    s_mov_b32 s3, 0xf000
654; GFX8-NEXT:    s_mov_b32 s2, -1
655; GFX8-NEXT:    s_nop 0
656; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
658; GFX8-NEXT:    s_endpgm
659;
660; GFX9-LABEL: add_i32_varying_gfx1032:
661; GFX9:       ; %bb.0: ; %entry
662; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
663; GFX9-NEXT:    s_mov_b64 s[2:3], exec
664; GFX9-NEXT:    v_mov_b32_e32 v2, v0
665; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
666; GFX9-NEXT:    v_mov_b32_e32 v1, 0
667; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
668; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
669; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
670; GFX9-NEXT:    s_not_b64 exec, exec
671; GFX9-NEXT:    v_mov_b32_e32 v2, 0
672; GFX9-NEXT:    s_not_b64 exec, exec
673; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
674; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
675; GFX9-NEXT:    s_nop 1
676; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
677; GFX9-NEXT:    s_nop 1
678; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
679; GFX9-NEXT:    s_nop 1
680; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
681; GFX9-NEXT:    s_nop 1
682; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
683; GFX9-NEXT:    s_nop 1
684; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
685; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
686; GFX9-NEXT:    s_nop 0
687; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
688; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
689; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
690; GFX9-NEXT:    ; implicit-def: $vgpr0
691; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
692; GFX9-NEXT:    s_cbranch_execz BB3_2
693; GFX9-NEXT:  ; %bb.1:
694; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
695; GFX9-NEXT:    v_mov_b32_e32 v3, s2
696; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
697; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
698; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; GFX9-NEXT:    buffer_wbinvl1_vol
700; GFX9-NEXT:  BB3_2:
701; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
702; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
703; GFX9-NEXT:    v_mov_b32_e32 v0, v1
704; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
705; GFX9-NEXT:    s_mov_b32 s3, 0xf000
706; GFX9-NEXT:    s_mov_b32 s2, -1
707; GFX9-NEXT:    s_nop 0
708; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
710; GFX9-NEXT:    s_endpgm
711;
712; GFX1064-LABEL: add_i32_varying_gfx1032:
713; GFX1064:       ; %bb.0: ; %entry
714; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
715; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
716; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
717; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
718; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
719; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
720; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
721; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
722; GFX1064-NEXT:    s_not_b64 exec, exec
723; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
724; GFX1064-NEXT:    s_not_b64 exec, exec
725; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
726; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
727; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
728; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
729; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
730; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
731; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
732; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
733; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
734; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
735; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
736; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
737; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
738; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
739; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
740; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
741; GFX1064-NEXT:    s_mov_b32 s2, -1
742; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
743; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
744; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
745; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
746; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
747; GFX1064-NEXT:    ; implicit-def: $vgpr0
748; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
749; GFX1064-NEXT:    s_cbranch_execz BB3_2
750; GFX1064-NEXT:  ; %bb.1:
751; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
752; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
753; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
754; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
755; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
756; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
757; GFX1064-NEXT:    buffer_gl0_inv
758; GFX1064-NEXT:    buffer_gl1_inv
759; GFX1064-NEXT:  BB3_2:
760; GFX1064-NEXT:    v_nop
761; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
762; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
763; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
764; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
765; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
766; GFX1064-NEXT:    s_nop 1
767; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
768; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
769; GFX1064-NEXT:    s_endpgm
770;
771; GFX1032-LABEL: add_i32_varying_gfx1032:
772; GFX1032:       ; %bb.0: ; %entry
773; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
774; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
775; GFX1032-NEXT:    ; implicit-def: $vcc_hi
776; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
777; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
778; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
779; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
780; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
781; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
782; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
783; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
784; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
785; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
786; GFX1032-NEXT:    s_mov_b32 s2, -1
787; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
788; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
789; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
790; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
791; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
792; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
793; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
794; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
795; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
796; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
797; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
798; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
799; GFX1032-NEXT:    ; implicit-def: $vgpr0
800; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
801; GFX1032-NEXT:    s_cbranch_execz BB3_2
802; GFX1032-NEXT:  ; %bb.1:
803; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
804; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
805; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
806; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
807; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
808; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
809; GFX1032-NEXT:    buffer_gl0_inv
810; GFX1032-NEXT:    buffer_gl1_inv
811; GFX1032-NEXT:  BB3_2:
812; GFX1032-NEXT:    v_nop
813; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
814; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
815; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
816; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
817; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
818; GFX1032-NEXT:    s_nop 1
819; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
821; GFX1032-NEXT:    s_endpgm
822entry:
823  %lane = call i32 @llvm.amdgcn.workitem.id.x()
824  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
825  store i32 %old, i32 addrspace(1)* %out
826  ret void
827}
828
; Kernel under test: every work-item atomically adds its workitem.id.x value to
; the addrspace(3) global @local_var32 (acq_rel) and stores the value returned
; by the atomicrmw to %out. The addend therefore differs per lane.
;
; What the check lines below expect per prefix:
;   - GFX7LESS: one ds_add_rtn_u32 fed directly from v0, with no v_mbcnt/DPP
;     sequence in front of it.
;   - GFX8/GFX9: a DPP sequence (row_shr:1/2/4/8, row_bcast:15/31) followed by
;     one ds_add_rtn_u32 placed in a block guarded by s_and_saveexec on
;     (mbcnt result == 0).
;   - GFX1064/GFX1032: row_shr DPP steps plus v_permlanex16_b32 and
;     v_readlane/v_writelane, using a 64-bit exec mask (GFX1064) or exec_lo
;     (GFX1032).
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; NOTE(review): the IR body is identical to the one that closes
; add_i32_varying_gfx1032 above - presumably both are kept for per-wave-size
; coverage; confirm that the duplication is still intended.
829define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
830;
831;
832; GFX7LESS-LABEL: add_i32_varying_gfx1064:
833; GFX7LESS:       ; %bb.0: ; %entry
834; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
835; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
836; GFX7LESS-NEXT:    s_mov_b32 m0, -1
837; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
838; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
839; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
840; GFX7LESS-NEXT:    buffer_wbinvl1
841; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
842; GFX7LESS-NEXT:    s_mov_b32 s2, -1
843; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
844; GFX7LESS-NEXT:    s_endpgm
845;
846; GFX8-LABEL: add_i32_varying_gfx1064:
847; GFX8:       ; %bb.0: ; %entry
848; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
849; GFX8-NEXT:    s_mov_b64 s[2:3], exec
850; GFX8-NEXT:    v_mov_b32_e32 v2, v0
851; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
852; GFX8-NEXT:    v_mov_b32_e32 v1, 0
853; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
854; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
855; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
856; GFX8-NEXT:    s_not_b64 exec, exec
857; GFX8-NEXT:    v_mov_b32_e32 v2, 0
858; GFX8-NEXT:    s_not_b64 exec, exec
859; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
860; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
861; GFX8-NEXT:    s_nop 1
862; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
863; GFX8-NEXT:    s_nop 1
864; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
865; GFX8-NEXT:    s_nop 1
866; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
869; GFX8-NEXT:    s_nop 1
870; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
871; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
872; GFX8-NEXT:    s_nop 0
873; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
874; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
875; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
876; GFX8-NEXT:    ; implicit-def: $vgpr0
877; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
878; GFX8-NEXT:    s_cbranch_execz BB4_2
879; GFX8-NEXT:  ; %bb.1:
880; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
881; GFX8-NEXT:    v_mov_b32_e32 v3, s2
882; GFX8-NEXT:    s_mov_b32 m0, -1
883; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
884; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
885; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
886; GFX8-NEXT:    buffer_wbinvl1_vol
887; GFX8-NEXT:  BB4_2:
888; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
889; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
890; GFX8-NEXT:    v_mov_b32_e32 v0, v1
891; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
892; GFX8-NEXT:    s_mov_b32 s3, 0xf000
893; GFX8-NEXT:    s_mov_b32 s2, -1
894; GFX8-NEXT:    s_nop 0
895; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
897; GFX8-NEXT:    s_endpgm
898;
899; GFX9-LABEL: add_i32_varying_gfx1064:
900; GFX9:       ; %bb.0: ; %entry
901; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
902; GFX9-NEXT:    s_mov_b64 s[2:3], exec
903; GFX9-NEXT:    v_mov_b32_e32 v2, v0
904; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
905; GFX9-NEXT:    v_mov_b32_e32 v1, 0
906; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
907; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
908; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
909; GFX9-NEXT:    s_not_b64 exec, exec
910; GFX9-NEXT:    v_mov_b32_e32 v2, 0
911; GFX9-NEXT:    s_not_b64 exec, exec
912; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
913; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
914; GFX9-NEXT:    s_nop 1
915; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
916; GFX9-NEXT:    s_nop 1
917; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
918; GFX9-NEXT:    s_nop 1
919; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
920; GFX9-NEXT:    s_nop 1
921; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
922; GFX9-NEXT:    s_nop 1
923; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
924; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
925; GFX9-NEXT:    s_nop 0
926; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
927; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
928; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
929; GFX9-NEXT:    ; implicit-def: $vgpr0
930; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
931; GFX9-NEXT:    s_cbranch_execz BB4_2
932; GFX9-NEXT:  ; %bb.1:
933; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
934; GFX9-NEXT:    v_mov_b32_e32 v3, s2
935; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
936; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
937; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
938; GFX9-NEXT:    buffer_wbinvl1_vol
939; GFX9-NEXT:  BB4_2:
940; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
941; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
942; GFX9-NEXT:    v_mov_b32_e32 v0, v1
943; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
944; GFX9-NEXT:    s_mov_b32 s3, 0xf000
945; GFX9-NEXT:    s_mov_b32 s2, -1
946; GFX9-NEXT:    s_nop 0
947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
949; GFX9-NEXT:    s_endpgm
950;
951; GFX1064-LABEL: add_i32_varying_gfx1064:
952; GFX1064:       ; %bb.0: ; %entry
953; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
954; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
955; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
956; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
957; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
958; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
959; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
960; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
961; GFX1064-NEXT:    s_not_b64 exec, exec
962; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
963; GFX1064-NEXT:    s_not_b64 exec, exec
964; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
965; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
966; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
967; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
968; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
969; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
970; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
971; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
972; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
973; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
974; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
975; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
976; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
977; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
978; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
979; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
980; GFX1064-NEXT:    s_mov_b32 s2, -1
981; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
982; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
983; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
984; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
985; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
986; GFX1064-NEXT:    ; implicit-def: $vgpr0
987; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
988; GFX1064-NEXT:    s_cbranch_execz BB4_2
989; GFX1064-NEXT:  ; %bb.1:
990; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
991; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
992; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
993; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
994; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
995; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
996; GFX1064-NEXT:    buffer_gl0_inv
997; GFX1064-NEXT:    buffer_gl1_inv
998; GFX1064-NEXT:  BB4_2:
999; GFX1064-NEXT:    v_nop
1000; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1001; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1002; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
1003; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1004; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1005; GFX1064-NEXT:    s_nop 1
1006; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1008; GFX1064-NEXT:    s_endpgm
1009;
1010; GFX1032-LABEL: add_i32_varying_gfx1064:
1011; GFX1032:       ; %bb.0: ; %entry
1012; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1013; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1014; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1015; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1016; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
1017; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1018; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
1019; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1020; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1021; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1022; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1023; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1024; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1025; GFX1032-NEXT:    s_mov_b32 s2, -1
1026; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1027; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1028; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1029; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
1030; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
1031; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1032; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
1033; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
1034; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
1035; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
1036; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1037; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1038; GFX1032-NEXT:    ; implicit-def: $vgpr0
1039; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1040; GFX1032-NEXT:    s_cbranch_execz BB4_2
1041; GFX1032-NEXT:  ; %bb.1:
1042; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1043; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
1044; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1045; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1046; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
1047; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1048; GFX1032-NEXT:    buffer_gl0_inv
1049; GFX1032-NEXT:    buffer_gl1_inv
1050; GFX1032-NEXT:  BB4_2:
1051; GFX1032-NEXT:    v_nop
1052; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1053; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1054; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1055; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1056; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1057; GFX1032-NEXT:    s_nop 1
1058; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1060; GFX1032-NEXT:    s_endpgm
1061entry:
; Per-lane addend: this work-item's x id.
1062  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; Atomic add on the addrspace(3) global; %old is the value the atomicrmw returns.
1063  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
; Store the returned value so the result of the atomic is used.
1064  store i32 %old, i32 addrspace(1)* %out
1065  ret void
1066}
1067
; Kernel under test: atomically adds the constant 5 to the i64 addrspace(3)
; global @local_var64 (acq_rel) and stores the value returned by the atomicrmw
; to %out. The addend is the same for every lane.
;
; Every prefix expects the same overall shape in the check lines below:
;   - v_mbcnt_* over a copy of the exec mask, then s_and_saveexec on
;     (mbcnt result == 0) so the ds_add_rtn_u64 sits in the guarded block %bb.1;
;   - inside that block the addend is s_bcnt1 of the copied exec mask
;     multiplied by 5 (v_mul_u32_u24 / v_mul_hi_u32_u24);
;   - after the join, v_readfirstlane_b32 of the returned pair is combined with
;     5 * (mbcnt result): v_mul_*_u24 plus add/addc on GFX7LESS, v_mad_u64_u32
;     on the other prefixes, followed by buffer_store_dwordx2.
; GFX1032 uses the 32-bit forms (exec_lo, s_bcnt1_i32_b32, v_mbcnt_lo only).
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1068define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1069;
1070;
1071; GFX7LESS-LABEL: add_i64_constant:
1072; GFX7LESS:       ; %bb.0: ; %entry
1073; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1074; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1075; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1076; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1077; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1078; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1079; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1080; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1081; GFX7LESS-NEXT:  ; %bb.1:
1082; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1083; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1084; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1085; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1086; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1087; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1088; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1089; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1090; GFX7LESS-NEXT:    buffer_wbinvl1
1091; GFX7LESS-NEXT:  BB5_2:
1092; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1093; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1094; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1095; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1096; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1097; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1098; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1099; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1100; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1101; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1102; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1104; GFX7LESS-NEXT:    s_endpgm
1105;
1106; GFX8-LABEL: add_i64_constant:
1107; GFX8:       ; %bb.0: ; %entry
1108; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1109; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1110; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1111; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1112; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1113; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1114; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1115; GFX8-NEXT:    s_cbranch_execz BB5_2
1116; GFX8-NEXT:  ; %bb.1:
1117; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1118; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1119; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1120; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1121; GFX8-NEXT:    s_mov_b32 m0, -1
1122; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1123; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1124; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1125; GFX8-NEXT:    buffer_wbinvl1_vol
1126; GFX8-NEXT:  BB5_2:
1127; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1128; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1129; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1130; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1131; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1132; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1133; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1134; GFX8-NEXT:    s_mov_b32 s2, -1
1135; GFX8-NEXT:    s_nop 2
1136; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1138; GFX8-NEXT:    s_endpgm
1139;
1140; GFX9-LABEL: add_i64_constant:
1141; GFX9:       ; %bb.0: ; %entry
1142; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1143; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1144; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1145; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1146; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1147; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1148; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1149; GFX9-NEXT:    s_cbranch_execz BB5_2
1150; GFX9-NEXT:  ; %bb.1:
1151; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1152; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1153; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1154; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1155; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1156; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1157; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1158; GFX9-NEXT:    buffer_wbinvl1_vol
1159; GFX9-NEXT:  BB5_2:
1160; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1161; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1162; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1163; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1164; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1165; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1166; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1167; GFX9-NEXT:    s_mov_b32 s2, -1
1168; GFX9-NEXT:    s_nop 2
1169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1171; GFX9-NEXT:    s_endpgm
1172;
1173; GFX1064-LABEL: add_i64_constant:
1174; GFX1064:       ; %bb.0: ; %entry
1175; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1176; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1177; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1178; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1179; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1180; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1181; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1182; GFX1064-NEXT:    s_cbranch_execz BB5_2
1183; GFX1064-NEXT:  ; %bb.1:
1184; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1185; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1186; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1187; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1188; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1189; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1190; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1191; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1192; GFX1064-NEXT:    buffer_gl0_inv
1193; GFX1064-NEXT:    buffer_gl1_inv
1194; GFX1064-NEXT:  BB5_2:
1195; GFX1064-NEXT:    v_nop
1196; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1197; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1198; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1199; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1200; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1201; GFX1064-NEXT:    s_mov_b32 s2, -1
1202; GFX1064-NEXT:    s_nop 2
1203; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1205; GFX1064-NEXT:    s_endpgm
1206;
1207; GFX1032-LABEL: add_i64_constant:
1208; GFX1032:       ; %bb.0: ; %entry
1209; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1210; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1211; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1212; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1213; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1214; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1215; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1216; GFX1032-NEXT:    s_cbranch_execz BB5_2
1217; GFX1032-NEXT:  ; %bb.1:
1218; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1219; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1220; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1221; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1222; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1223; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1224; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1225; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1226; GFX1032-NEXT:    buffer_gl0_inv
1227; GFX1032-NEXT:    buffer_gl1_inv
1228; GFX1032-NEXT:  BB5_2:
1229; GFX1032-NEXT:    v_nop
1230; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1231; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1232; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1233; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1234; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1235; GFX1032-NEXT:    s_mov_b32 s2, -1
1236; GFX1032-NEXT:    s_nop 2
1237; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1239; GFX1032-NEXT:    s_endpgm
1240entry:
; Atomic add of the constant 5 on the addrspace(3) i64 global; %old is the
; value the atomicrmw returns.
1241  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
; Store the returned value so the result of the atomic is used.
1242  store i64 %old, i64 addrspace(1)* %out
1243  ret void
1244}
1245
1246define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1247;
1248;
1249; GFX7LESS-LABEL: add_i64_uniform:
1250; GFX7LESS:       ; %bb.0: ; %entry
1251; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1252; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1253; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1254; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1255; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1256; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1257; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1258; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1259; GFX7LESS-NEXT:  ; %bb.1:
1260; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1261; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1262; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1264; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1265; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1266; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1267; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1268; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1269; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1270; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1271; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1272; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1273; GFX7LESS-NEXT:    buffer_wbinvl1
1274; GFX7LESS-NEXT:  BB6_2:
1275; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1276; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1277; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1278; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1280; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1281; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1282; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1283; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1284; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1285; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1286; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1287; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1288; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1289; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1290; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1291; GFX7LESS-NEXT:    s_endpgm
1292;
1293; GFX8-LABEL: add_i64_uniform:
1294; GFX8:       ; %bb.0: ; %entry
1295; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1296; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1297; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1298; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1299; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1300; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1301; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1302; GFX8-NEXT:    s_cbranch_execz BB6_2
1303; GFX8-NEXT:  ; %bb.1:
1304; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1305; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1306; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1307; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1308; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1309; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1310; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1311; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1312; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1313; GFX8-NEXT:    s_mov_b32 m0, -1
1314; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1315; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1316; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1317; GFX8-NEXT:    buffer_wbinvl1_vol
1318; GFX8-NEXT:  BB6_2:
1319; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1320; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX8-NEXT:    s_mov_b32 s4, s0
1322; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1323; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1324; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1325; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1326; GFX8-NEXT:    s_mov_b32 s5, s1
1327; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1328; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1329; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1330; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1331; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1332; GFX8-NEXT:    s_mov_b32 s6, -1
1333; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1334; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1335; GFX8-NEXT:    s_endpgm
1336;
1337; GFX9-LABEL: add_i64_uniform:
1338; GFX9:       ; %bb.0: ; %entry
1339; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1340; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1341; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1342; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1343; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1344; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1345; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1346; GFX9-NEXT:    s_cbranch_execz BB6_2
1347; GFX9-NEXT:  ; %bb.1:
1348; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1349; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1350; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1351; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1352; GFX9-NEXT:    s_add_i32 s8, s8, s7
1353; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1354; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1355; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1356; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1357; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1358; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1359; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1360; GFX9-NEXT:    buffer_wbinvl1_vol
1361; GFX9-NEXT:  BB6_2:
1362; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1365; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1366; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1367; GFX9-NEXT:    s_mov_b32 s4, s0
1368; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1369; GFX9-NEXT:    s_mov_b32 s5, s1
1370; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1371; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1372; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1373; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1374; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1375; GFX9-NEXT:    s_mov_b32 s6, -1
1376; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1377; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1378; GFX9-NEXT:    s_endpgm
1379;
1380; GFX1064-LABEL: add_i64_uniform:
1381; GFX1064:       ; %bb.0: ; %entry
1382; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1383; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1384; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1385; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1386; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1387; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1388; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1389; GFX1064-NEXT:    s_cbranch_execz BB6_2
1390; GFX1064-NEXT:  ; %bb.1:
1391; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1392; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1393; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1394; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1395; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1396; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1397; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1398; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1399; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1400; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1401; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1402; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1403; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; GFX1064-NEXT:    buffer_gl0_inv
1405; GFX1064-NEXT:    buffer_gl1_inv
1406; GFX1064-NEXT:  BB6_2:
1407; GFX1064-NEXT:    v_nop
1408; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1409; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1410; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1411; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1412; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1413; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
1414; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
1415; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1416; GFX1064-NEXT:    s_mov_b32 s2, -1
1417; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1418; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s4, v0
1419; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc
1420; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1421; GFX1064-NEXT:    s_endpgm
1422;
1423; GFX1032-LABEL: add_i64_uniform:
1424; GFX1032:       ; %bb.0: ; %entry
1425; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1426; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1427; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1428; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1429; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1430; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1431; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1432; GFX1032-NEXT:    s_cbranch_execz BB6_2
1433; GFX1032-NEXT:  ; %bb.1:
1434; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1435; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1436; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1438; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1439; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1440; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1441; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1442; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1443; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1444; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1445; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1446; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1447; GFX1032-NEXT:    buffer_gl0_inv
1448; GFX1032-NEXT:    buffer_gl1_inv
1449; GFX1032-NEXT:  BB6_2:
1450; GFX1032-NEXT:    v_nop
1451; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1452; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1454; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1455; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1456; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
1457; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
1458; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1459; GFX1032-NEXT:    s_mov_b32 s2, -1
1460; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1461; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s4, v0
1462; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
1463; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1464; GFX1032-NEXT:    s_endpgm
1465entry:
1466  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1467  store i64 %old, i64 addrspace(1)* %out
1468  ret void
1469}
1470
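; NOTE(review): no RUN line in this file enables a GCN check prefix, so the
; three directives below are never evaluated by FileCheck. The per-target
; assertion lines of add_i64_varying already pin its full instruction
; sequence, and that sequence contains none of these instructions.
1471; GCN-NOT: v_mbcnt_lo_u32_b32
1472; GCN-NOT: v_mbcnt_hi_u32_b32
1473; GCN-NOT: s_bcnt1_i32_b64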
; add_i64_varying: every work-item adds its zero-extended workitem.id.x to
; @local_var64 with an acq_rel atomicrmw and stores the returned old value to
; %out. For all five run configurations the expected output is one
; ds_add_rtn_u64 fed directly from v[0:1] (v1 is set to 0 for the high half),
; with no mbcnt / bcnt / readfirstlane sequence around it.
1474define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1475;
1476;
1477; GFX7LESS-LABEL: add_i64_varying:
1478; GFX7LESS:       ; %bb.0: ; %entry
1479; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1480; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1481; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1482; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1483; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1484; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1485; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1486; GFX7LESS-NEXT:    buffer_wbinvl1
1487; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1488; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1489; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1490; GFX7LESS-NEXT:    s_endpgm
1491;
1492; GFX8-LABEL: add_i64_varying:
1493; GFX8:       ; %bb.0: ; %entry
1494; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1495; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1496; GFX8-NEXT:    s_mov_b32 m0, -1
1497; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1498; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1499; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1500; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1501; GFX8-NEXT:    buffer_wbinvl1_vol
1502; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1503; GFX8-NEXT:    s_mov_b32 s2, -1
1504; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1505; GFX8-NEXT:    s_endpgm
1506;
1507; GFX9-LABEL: add_i64_varying:
1508; GFX9:       ; %bb.0: ; %entry
1509; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1510; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1511; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1512; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1513; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1514; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1515; GFX9-NEXT:    buffer_wbinvl1_vol
1516; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1517; GFX9-NEXT:    s_mov_b32 s2, -1
1518; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1519; GFX9-NEXT:    s_endpgm
1520;
1521; GFX1064-LABEL: add_i64_varying:
1522; GFX1064:       ; %bb.0: ; %entry
1523; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1524; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1525; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1526; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1527; GFX1064-NEXT:    s_mov_b32 s2, -1
1528; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1529; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1530; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1531; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1532; GFX1064-NEXT:    buffer_gl0_inv
1533; GFX1064-NEXT:    buffer_gl1_inv
1534; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1535; GFX1064-NEXT:    s_endpgm
1536;
1537; GFX1032-LABEL: add_i64_varying:
1538; GFX1032:       ; %bb.0: ; %entry
1539; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1540; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1541; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1542; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1543; GFX1032-NEXT:    s_mov_b32 s2, -1
1544; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1545; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1546; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1547; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1548; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1549; GFX1032-NEXT:    buffer_gl0_inv
1550; GFX1032-NEXT:    buffer_gl1_inv
1551; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1552; GFX1032-NEXT:    s_endpgm
1553entry:
  ; Per-lane operand: the work-item's x id, widened to 64 bits.
1554  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1555  %zext = zext i32 %lane to i64
  ; acq_rel read-modify-write on the addrspace(3) global; %old is the value
  ; that was in memory before this lane's add.
1556  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Write the pre-add value to the addrspace(1) output pointer.
1557  store i64 %old, i64 addrspace(1)* %out
1558  ret void
1559}
1560
; sub_i32_constant: atomically subtracts the constant 5 from @local_var32
; (acq_rel) and stores the returned old value to %out. The expected output
; shows the wave-level rewrite: a copy of exec is fed to mbcnt to give each
; lane a count in v0, only lanes whose count is 0 execute a single
; ds_sub_rtn_u32 whose operand is bcnt(exec copy) * 5, and once exec is
; restored each lane forms its own value as readfirstlane(result) - 5 * v0.
; The wave32 configuration uses only mbcnt_lo and a 32-bit bcnt.
1561define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1562;
1563;
1564; GFX7LESS-LABEL: sub_i32_constant:
1565; GFX7LESS:       ; %bb.0: ; %entry
1566; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1567; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1568; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1569; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1570; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1571; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1572; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1573; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1574; GFX7LESS-NEXT:  ; %bb.1:
1575; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1576; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1577; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
1578; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1579; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1580; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1581; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1582; GFX7LESS-NEXT:    buffer_wbinvl1
1583; GFX7LESS-NEXT:  BB8_2:
1584; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1585; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1586; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1587; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1588; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1589; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1590; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1592; GFX7LESS-NEXT:    s_endpgm
1593;
1594; GFX8-LABEL: sub_i32_constant:
1595; GFX8:       ; %bb.0: ; %entry
1596; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1597; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1598; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1599; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1600; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1601; GFX8-NEXT:    ; implicit-def: $vgpr1
1602; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1603; GFX8-NEXT:    s_cbranch_execz BB8_2
1604; GFX8-NEXT:  ; %bb.1:
1605; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1606; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1607; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1608; GFX8-NEXT:    s_mov_b32 m0, -1
1609; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1610; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1611; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1612; GFX8-NEXT:    buffer_wbinvl1_vol
1613; GFX8-NEXT:  BB8_2:
1614; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1615; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1616; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1617; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1618; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1619; GFX8-NEXT:    s_mov_b32 s2, -1
1620; GFX8-NEXT:    s_nop 0
1621; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1623; GFX8-NEXT:    s_endpgm
1624;
1625; GFX9-LABEL: sub_i32_constant:
1626; GFX9:       ; %bb.0: ; %entry
1627; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1628; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1629; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1630; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1631; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1632; GFX9-NEXT:    ; implicit-def: $vgpr1
1633; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1634; GFX9-NEXT:    s_cbranch_execz BB8_2
1635; GFX9-NEXT:  ; %bb.1:
1636; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1637; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1638; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1639; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1640; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1641; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1642; GFX9-NEXT:    buffer_wbinvl1_vol
1643; GFX9-NEXT:  BB8_2:
1644; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1645; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1646; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1647; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1648; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1649; GFX9-NEXT:    s_mov_b32 s2, -1
1650; GFX9-NEXT:    s_nop 0
1651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1652; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1653; GFX9-NEXT:    s_endpgm
1654;
1655; GFX1064-LABEL: sub_i32_constant:
1656; GFX1064:       ; %bb.0: ; %entry
1657; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1658; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1659; GFX1064-NEXT:    ; implicit-def: $vgpr1
1660; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1661; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1662; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1663; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1664; GFX1064-NEXT:    s_cbranch_execz BB8_2
1665; GFX1064-NEXT:  ; %bb.1:
1666; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1667; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1668; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1669; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1670; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1671; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1672; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1673; GFX1064-NEXT:    buffer_gl0_inv
1674; GFX1064-NEXT:    buffer_gl1_inv
1675; GFX1064-NEXT:  BB8_2:
1676; GFX1064-NEXT:    v_nop
1677; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1678; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1679; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1680; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1681; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1682; GFX1064-NEXT:    s_mov_b32 s2, -1
1683; GFX1064-NEXT:    s_nop 0
1684; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1685; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1686; GFX1064-NEXT:    s_endpgm
1687;
1688; GFX1032-LABEL: sub_i32_constant:
1689; GFX1032:       ; %bb.0: ; %entry
1690; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1691; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1692; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1693; GFX1032-NEXT:    ; implicit-def: $vgpr1
1694; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1695; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1696; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1697; GFX1032-NEXT:    s_cbranch_execz BB8_2
1698; GFX1032-NEXT:  ; %bb.1:
1699; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1700; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1701; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1702; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1703; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1704; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1705; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1706; GFX1032-NEXT:    buffer_gl0_inv
1707; GFX1032-NEXT:    buffer_gl1_inv
1708; GFX1032-NEXT:  BB8_2:
1709; GFX1032-NEXT:    v_nop
1710; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1711; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1712; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1713; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1714; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1715; GFX1032-NEXT:    s_mov_b32 s2, -1
1716; GFX1032-NEXT:    s_nop 0
1717; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1718; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1719; GFX1032-NEXT:    s_endpgm
1720entry:
  ; Every lane subtracts the same immediate; %old is the value that was in
  ; memory before the subtraction.
1721  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Write the pre-subtract value to the addrspace(1) output pointer.
1722  store i32 %old, i32 addrspace(1)* %out
1723  ret void
1724}
1725
; sub_i32_uniform: atomically subtracts the kernel argument %subitive from
; @local_var32 (acq_rel) and stores the returned old value to %out. The
; expected output has the same shape as the constant case, except that the
; operand is a loaded scalar: lanes whose mbcnt result is 0 issue one
; ds_sub_rtn_u32 with s_mul_i32(%subitive, bcnt(exec copy)), and after exec
; is restored each lane computes readfirstlane(result) minus
; v_mul_lo_u32(%subitive, v0), where v0 holds that lane's mbcnt result.
1726define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1727;
1728;
1729; GFX7LESS-LABEL: sub_i32_uniform:
1730; GFX7LESS:       ; %bb.0: ; %entry
1731; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1732; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1733; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
1734; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1735; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1736; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1737; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1738; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1739; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1740; GFX7LESS-NEXT:  ; %bb.1:
1741; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1742; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1743; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
1744; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1745; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1746; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1747; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1748; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1749; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1750; GFX7LESS-NEXT:    buffer_wbinvl1
1751; GFX7LESS-NEXT:  BB9_2:
1752; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1753; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1754; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1756; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1757; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1758; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1759; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1760; GFX7LESS-NEXT:    s_endpgm
1761;
1762; GFX8-LABEL: sub_i32_uniform:
1763; GFX8:       ; %bb.0: ; %entry
1764; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1765; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1766; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1767; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1768; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1769; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1770; GFX8-NEXT:    ; implicit-def: $vgpr1
1771; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1772; GFX8-NEXT:    s_cbranch_execz BB9_2
1773; GFX8-NEXT:  ; %bb.1:
1774; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1775; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1777; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1778; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1779; GFX8-NEXT:    s_mov_b32 m0, -1
1780; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1781; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1782; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1783; GFX8-NEXT:    buffer_wbinvl1_vol
1784; GFX8-NEXT:  BB9_2:
1785; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1788; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1789; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1790; GFX8-NEXT:    s_mov_b32 s6, -1
1791; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1792; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1793; GFX8-NEXT:    s_endpgm
1794;
1795; GFX9-LABEL: sub_i32_uniform:
1796; GFX9:       ; %bb.0: ; %entry
1797; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1798; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
1799; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1800; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1801; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1802; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1803; GFX9-NEXT:    ; implicit-def: $vgpr1
1804; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1805; GFX9-NEXT:    s_cbranch_execz BB9_2
1806; GFX9-NEXT:  ; %bb.1:
1807; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1808; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1809; GFX9-NEXT:    s_mul_i32 s1, s0, s1
1810; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1811; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1812; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1813; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1814; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1815; GFX9-NEXT:    buffer_wbinvl1_vol
1816; GFX9-NEXT:  BB9_2:
1817; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1818; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1819; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
1820; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1821; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1822; GFX9-NEXT:    s_mov_b32 s6, -1
1823; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1824; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1825; GFX9-NEXT:    s_endpgm
1826;
1827; GFX1064-LABEL: sub_i32_uniform:
1828; GFX1064:       ; %bb.0: ; %entry
1829; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1830; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1831; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
1832; GFX1064-NEXT:    ; implicit-def: $vgpr1
1833; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1834; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1835; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1836; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1837; GFX1064-NEXT:    s_cbranch_execz BB9_2
1838; GFX1064-NEXT:  ; %bb.1:
1839; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1840; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1841; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
1843; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
1844; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1845; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1846; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1847; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1848; GFX1064-NEXT:    buffer_gl0_inv
1849; GFX1064-NEXT:    buffer_gl1_inv
1850; GFX1064-NEXT:  BB9_2:
1851; GFX1064-NEXT:    v_nop
1852; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
1853; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1854; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
1855; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1856; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1857; GFX1064-NEXT:    s_mov_b32 s6, -1
1858; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1859; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1860; GFX1064-NEXT:    s_endpgm
1861;
1862; GFX1032-LABEL: sub_i32_uniform:
1863; GFX1032:       ; %bb.0: ; %entry
1864; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1865; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
1866; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1867; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1868; GFX1032-NEXT:    ; implicit-def: $vgpr1
1869; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1870; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1871; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1872; GFX1032-NEXT:    s_cbranch_execz BB9_2
1873; GFX1032-NEXT:  ; %bb.1:
1874; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1875; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1876; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
1878; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
1879; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1880; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1881; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1882; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1883; GFX1032-NEXT:    buffer_gl0_inv
1884; GFX1032-NEXT:    buffer_gl1_inv
1885; GFX1032-NEXT:  BB9_2:
1886; GFX1032-NEXT:    v_nop
1887; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1888; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
1890; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1891; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1892; GFX1032-NEXT:    s_mov_b32 s6, -1
1893; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1894; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1895; GFX1032-NEXT:    s_endpgm
1896entry:
  ; The operand is a kernel argument, so it is the same scalar for every
  ; lane; %old is the value that was in memory before the subtraction.
1897  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  ; Write the pre-subtract value to the addrspace(1) output pointer.
1898  store i32 %old, i32 addrspace(1)* %out
1899  ret void
1900}
1901
1902define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1903;
1904;
1905; GFX7LESS-LABEL: sub_i32_varying:
1906; GFX7LESS:       ; %bb.0: ; %entry
1907; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1908; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1909; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1910; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1911; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1912; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1913; GFX7LESS-NEXT:    buffer_wbinvl1
1914; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1915; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1916; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1917; GFX7LESS-NEXT:    s_endpgm
1918;
1919; GFX8-LABEL: sub_i32_varying:
1920; GFX8:       ; %bb.0: ; %entry
1921; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1922; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1923; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1924; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1925; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1926; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1927; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1928; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1929; GFX8-NEXT:    s_not_b64 exec, exec
1930; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1931; GFX8-NEXT:    s_not_b64 exec, exec
1932; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1933; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1934; GFX8-NEXT:    s_nop 1
1935; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1936; GFX8-NEXT:    s_nop 1
1937; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1938; GFX8-NEXT:    s_nop 1
1939; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1940; GFX8-NEXT:    s_nop 1
1941; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1942; GFX8-NEXT:    s_nop 1
1943; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1944; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
1945; GFX8-NEXT:    s_nop 0
1946; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1947; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1948; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1949; GFX8-NEXT:    ; implicit-def: $vgpr0
1950; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1951; GFX8-NEXT:    s_cbranch_execz BB10_2
1952; GFX8-NEXT:  ; %bb.1:
1953; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1954; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1955; GFX8-NEXT:    s_mov_b32 m0, -1
1956; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1957; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1958; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1959; GFX8-NEXT:    buffer_wbinvl1_vol
1960; GFX8-NEXT:  BB10_2:
1961; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1962; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1963; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1964; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1965; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1966; GFX8-NEXT:    s_mov_b32 s2, -1
1967; GFX8-NEXT:    s_nop 0
1968; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1969; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1970; GFX8-NEXT:    s_endpgm
1971;
1972; GFX9-LABEL: sub_i32_varying:
1973; GFX9:       ; %bb.0: ; %entry
1974; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1975; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1976; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1977; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1978; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1979; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1980; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1981; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1982; GFX9-NEXT:    s_not_b64 exec, exec
1983; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1984; GFX9-NEXT:    s_not_b64 exec, exec
1985; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1986; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1987; GFX9-NEXT:    s_nop 1
1988; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1989; GFX9-NEXT:    s_nop 1
1990; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1991; GFX9-NEXT:    s_nop 1
1992; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1993; GFX9-NEXT:    s_nop 1
1994; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1995; GFX9-NEXT:    s_nop 1
1996; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1997; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
1998; GFX9-NEXT:    s_nop 0
1999; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2000; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2001; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2002; GFX9-NEXT:    ; implicit-def: $vgpr0
2003; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2004; GFX9-NEXT:    s_cbranch_execz BB10_2
2005; GFX9-NEXT:  ; %bb.1:
2006; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2007; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2008; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2009; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2010; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2011; GFX9-NEXT:    buffer_wbinvl1_vol
2012; GFX9-NEXT:  BB10_2:
2013; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2014; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2015; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2016; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2017; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2018; GFX9-NEXT:    s_mov_b32 s2, -1
2019; GFX9-NEXT:    s_nop 0
2020; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2021; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2022; GFX9-NEXT:    s_endpgm
2023;
2024; GFX1064-LABEL: sub_i32_varying:
2025; GFX1064:       ; %bb.0: ; %entry
2026; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2027; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2028; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2029; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2030; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2031; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2032; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2033; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2034; GFX1064-NEXT:    s_not_b64 exec, exec
2035; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2036; GFX1064-NEXT:    s_not_b64 exec, exec
2037; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2038; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2039; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2040; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2041; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2042; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2043; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2044; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2045; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2046; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2047; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2048; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2049; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2050; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2051; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2052; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2053; GFX1064-NEXT:    s_mov_b32 s2, -1
2054; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2055; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2056; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2057; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2058; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2059; GFX1064-NEXT:    ; implicit-def: $vgpr0
2060; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2061; GFX1064-NEXT:    s_cbranch_execz BB10_2
2062; GFX1064-NEXT:  ; %bb.1:
2063; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2064; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2065; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2066; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2067; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2068; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2069; GFX1064-NEXT:    buffer_gl0_inv
2070; GFX1064-NEXT:    buffer_gl1_inv
2071; GFX1064-NEXT:  BB10_2:
2072; GFX1064-NEXT:    v_nop
2073; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2074; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2075; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2076; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2077; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2078; GFX1064-NEXT:    s_nop 1
2079; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2080; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2081; GFX1064-NEXT:    s_endpgm
2082;
2083; GFX1032-LABEL: sub_i32_varying:
2084; GFX1032:       ; %bb.0: ; %entry
2085; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2086; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
2087; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2088; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2089; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
2090; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2091; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
2092; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2093; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2094; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2095; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2096; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2097; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2098; GFX1032-NEXT:    s_mov_b32 s2, -1
2099; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2100; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2101; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2102; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2103; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2104; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2105; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2106; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2107; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2108; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2109; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2110; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2111; GFX1032-NEXT:    ; implicit-def: $vgpr0
2112; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2113; GFX1032-NEXT:    s_cbranch_execz BB10_2
2114; GFX1032-NEXT:  ; %bb.1:
2115; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2116; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2117; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2118; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2119; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2120; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2121; GFX1032-NEXT:    buffer_gl0_inv
2122; GFX1032-NEXT:    buffer_gl1_inv
2123; GFX1032-NEXT:  BB10_2:
2124; GFX1032-NEXT:    v_nop
2125; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2126; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2127; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2128; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2129; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2130; GFX1032-NEXT:    s_nop 1
2131; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2133; GFX1032-NEXT:    s_endpgm
2134entry:
2135  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2136  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2137  store i32 %old, i32 addrspace(1)* %out
2138  ret void
2139}
2140
2141define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2142; atomicrmw sub of the constant 5 on @local_var64 (acq_rel); the previous value is stored to %out.
2143; Checked codegen: only the lane whose mbcnt is 0 issues ds_sub_rtn_u64, subtracting 5 * s_bcnt1(saved exec); every lane then forms readfirstlane(old) - 5 * mbcnt.
2144; GFX7LESS-LABEL: sub_i64_constant:
2145; GFX7LESS:       ; %bb.0: ; %entry
2146; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2147; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2148; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2149; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2150; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2151; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2152; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2153; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2154; GFX7LESS-NEXT:  ; %bb.1:
2155; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2156; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2157; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2158; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2159; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2160; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2161; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2162; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2163; GFX7LESS-NEXT:    buffer_wbinvl1
2164; GFX7LESS-NEXT:  BB11_2:
2165; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2166; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2167; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2168; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2169; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2170; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2171; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2172; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2173; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2174; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2175; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2176; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2177; GFX7LESS-NEXT:    s_endpgm
2178;
2179; GFX8-LABEL: sub_i64_constant:
2180; GFX8:       ; %bb.0: ; %entry
2181; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2182; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2183; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2184; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2185; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2186; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2187; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2188; GFX8-NEXT:    s_cbranch_execz BB11_2
2189; GFX8-NEXT:  ; %bb.1:
2190; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2191; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2192; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2193; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2194; GFX8-NEXT:    s_mov_b32 m0, -1
2195; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2196; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2197; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2198; GFX8-NEXT:    buffer_wbinvl1_vol
2199; GFX8-NEXT:  BB11_2:
2200; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2201; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2202; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2203; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2204; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2205; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2206; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2207; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2208; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2209; GFX8-NEXT:    s_mov_b32 s2, -1
2210; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2211; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2212; GFX8-NEXT:    s_endpgm
2213;
2214; GFX9-LABEL: sub_i64_constant:
2215; GFX9:       ; %bb.0: ; %entry
2216; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2217; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2218; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2219; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2220; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2221; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2222; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2223; GFX9-NEXT:    s_cbranch_execz BB11_2
2224; GFX9-NEXT:  ; %bb.1:
2225; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2226; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2227; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2228; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2229; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2230; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2231; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2232; GFX9-NEXT:    buffer_wbinvl1_vol
2233; GFX9-NEXT:  BB11_2:
2234; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2235; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2236; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2237; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2238; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2239; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2240; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2241; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2242; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2243; GFX9-NEXT:    s_mov_b32 s2, -1
2244; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2245; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2246; GFX9-NEXT:    s_endpgm
2247;
2248; GFX1064-LABEL: sub_i64_constant:
2249; GFX1064:       ; %bb.0: ; %entry
2250; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2251; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2252; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2253; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2254; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2255; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2256; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2257; GFX1064-NEXT:    s_cbranch_execz BB11_2
2258; GFX1064-NEXT:  ; %bb.1:
2259; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2260; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2261; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2262; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2263; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2264; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2265; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2266; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2267; GFX1064-NEXT:    buffer_gl0_inv
2268; GFX1064-NEXT:    buffer_gl1_inv
2269; GFX1064-NEXT:  BB11_2:
2270; GFX1064-NEXT:    v_nop
2271; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2272; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2273; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2274; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2275; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2276; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2277; GFX1064-NEXT:    s_mov_b32 s2, -1
2278; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2279; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2280; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2281; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2282; GFX1064-NEXT:    s_endpgm
2283;
2284; GFX1032-LABEL: sub_i64_constant:
2285; GFX1032:       ; %bb.0: ; %entry
2286; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2287; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2288; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2289; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2290; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2291; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2292; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2293; GFX1032-NEXT:    s_cbranch_execz BB11_2
2294; GFX1032-NEXT:  ; %bb.1:
2295; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2296; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2297; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2298; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2299; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2300; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2301; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2302; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2303; GFX1032-NEXT:    buffer_gl0_inv
2304; GFX1032-NEXT:    buffer_gl1_inv
2305; GFX1032-NEXT:  BB11_2:
2306; GFX1032-NEXT:    v_nop
2307; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2308; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2309; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2310; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2311; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2312; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2313; GFX1032-NEXT:    s_mov_b32 s2, -1
2314; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2315; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2316; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2318; GFX1032-NEXT:    s_endpgm
2319entry:
2320  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2321  store i64 %old, i64 addrspace(1)* %out
2322  ret void
2323}
2324
2325define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2326; atomicrmw sub of the kernel argument %subitive on @local_var64 (acq_rel); the previous value is stored to %out.
2327; Checked codegen: only the lane whose mbcnt is 0 issues ds_sub_rtn_u64, with the loaded argument multiplied (64-bit) by s_bcnt1(saved exec); every lane then forms readfirstlane(old) - argument * mbcnt.
2328; GFX7LESS-LABEL: sub_i64_uniform:
2329; GFX7LESS:       ; %bb.0: ; %entry
2330; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2331; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2332; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2333; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2334; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2335; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2336; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2337; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2338; GFX7LESS-NEXT:  ; %bb.1:
2339; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2340; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2341; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2342; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2343; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2344; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2345; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2346; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2347; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2348; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2349; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2350; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2351; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2352; GFX7LESS-NEXT:    buffer_wbinvl1
2353; GFX7LESS-NEXT:  BB12_2:
2354; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2355; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2356; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2357; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2359; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2360; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2361; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2362; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2363; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2364; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2365; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2366; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2367; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2368; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2369; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2370; GFX7LESS-NEXT:    s_endpgm
2371;
2372; GFX8-LABEL: sub_i64_uniform:
2373; GFX8:       ; %bb.0: ; %entry
2374; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2375; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2376; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2377; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2378; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2379; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2380; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2381; GFX8-NEXT:    s_cbranch_execz BB12_2
2382; GFX8-NEXT:  ; %bb.1:
2383; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2384; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2385; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2386; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2387; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2388; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2389; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2390; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2391; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2392; GFX8-NEXT:    s_mov_b32 m0, -1
2393; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2394; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2395; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2396; GFX8-NEXT:    buffer_wbinvl1_vol
2397; GFX8-NEXT:  BB12_2:
2398; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2399; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2400; GFX8-NEXT:    s_mov_b32 s4, s0
2401; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2402; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2403; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2404; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2405; GFX8-NEXT:    s_mov_b32 s5, s1
2406; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2407; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2408; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2409; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2410; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2411; GFX8-NEXT:    s_mov_b32 s6, -1
2412; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2413; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2414; GFX8-NEXT:    s_endpgm
2415;
2416; GFX9-LABEL: sub_i64_uniform:
2417; GFX9:       ; %bb.0: ; %entry
2418; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2419; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2420; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2421; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2422; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2423; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2424; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2425; GFX9-NEXT:    s_cbranch_execz BB12_2
2426; GFX9-NEXT:  ; %bb.1:
2427; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2428; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2429; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2430; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2431; GFX9-NEXT:    s_add_i32 s8, s8, s7
2432; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2433; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2434; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2435; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2436; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2437; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2438; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2439; GFX9-NEXT:    buffer_wbinvl1_vol
2440; GFX9-NEXT:  BB12_2:
2441; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2442; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2443; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2444; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2445; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2446; GFX9-NEXT:    s_mov_b32 s4, s0
2447; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2448; GFX9-NEXT:    s_mov_b32 s5, s1
2449; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2450; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2451; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2452; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2453; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2454; GFX9-NEXT:    s_mov_b32 s6, -1
2455; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2456; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2457; GFX9-NEXT:    s_endpgm
2458;
2459; GFX1064-LABEL: sub_i64_uniform:
2460; GFX1064:       ; %bb.0: ; %entry
2461; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2462; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2463; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2464; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2465; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2466; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2467; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2468; GFX1064-NEXT:    s_cbranch_execz BB12_2
2469; GFX1064-NEXT:  ; %bb.1:
2470; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2471; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2472; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2473; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2474; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2475; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2476; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2477; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2478; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2479; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2480; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2481; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2482; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2483; GFX1064-NEXT:    buffer_gl0_inv
2484; GFX1064-NEXT:    buffer_gl1_inv
2485; GFX1064-NEXT:  BB12_2:
2486; GFX1064-NEXT:    v_nop
2487; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2488; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2489; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2490; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2491; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2492; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
2493; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
2494; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2495; GFX1064-NEXT:    s_mov_b32 s2, -1
2496; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2497; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s4, v0
2498; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
2499; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2500; GFX1064-NEXT:    s_endpgm
2501;
2502; GFX1032-LABEL: sub_i64_uniform:
2503; GFX1032:       ; %bb.0: ; %entry
2504; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2505; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2506; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2507; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2508; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2509; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2510; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2511; GFX1032-NEXT:    s_cbranch_execz BB12_2
2512; GFX1032-NEXT:  ; %bb.1:
2513; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2514; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2515; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2516; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2517; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2518; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2519; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2520; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2521; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2522; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2523; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2524; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2525; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2526; GFX1032-NEXT:    buffer_gl0_inv
2527; GFX1032-NEXT:    buffer_gl1_inv
2528; GFX1032-NEXT:  BB12_2:
2529; GFX1032-NEXT:    v_nop
2530; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2531; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2533; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2534; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2535; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
2536; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
2537; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2538; GFX1032-NEXT:    s_mov_b32 s2, -1
2539; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2540; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s4, v0
2541; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
2542; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2543; GFX1032-NEXT:    s_endpgm
2544entry:
2545  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2546  store i64 %old, i64 addrspace(1)* %out
2547  ret void
2548}
2549
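2550; The 64-bit varying case below is not optimized: its autogenerated checks contain no
2551; v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64 on any target. (The GCN-NOT
2552; directives formerly here were inert, because no RUN line enables a GCN check prefix.)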
2553define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2554; atomicrmw sub of zext(workitem.id.x) on @local_var64 (acq_rel); the previous value is stored to %out.
2555; Checked codegen: on every target the operand goes straight into one ds_sub_rtn_u64, with no mbcnt / s_bcnt1 / readfirstlane sequence around it.
2556; GFX7LESS-LABEL: sub_i64_varying:
2557; GFX7LESS:       ; %bb.0: ; %entry
2558; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2559; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2560; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2561; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2562; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2563; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2564; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2565; GFX7LESS-NEXT:    buffer_wbinvl1
2566; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2567; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2568; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2569; GFX7LESS-NEXT:    s_endpgm
2570;
2571; GFX8-LABEL: sub_i64_varying:
2572; GFX8:       ; %bb.0: ; %entry
2573; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2574; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2575; GFX8-NEXT:    s_mov_b32 m0, -1
2576; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2577; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2578; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2579; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2580; GFX8-NEXT:    buffer_wbinvl1_vol
2581; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2582; GFX8-NEXT:    s_mov_b32 s2, -1
2583; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2584; GFX8-NEXT:    s_endpgm
2585;
2586; GFX9-LABEL: sub_i64_varying:
2587; GFX9:       ; %bb.0: ; %entry
2588; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2589; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2590; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2591; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2592; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2593; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2594; GFX9-NEXT:    buffer_wbinvl1_vol
2595; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2596; GFX9-NEXT:    s_mov_b32 s2, -1
2597; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2598; GFX9-NEXT:    s_endpgm
2599;
2600; GFX1064-LABEL: sub_i64_varying:
2601; GFX1064:       ; %bb.0: ; %entry
2602; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2603; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2604; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2605; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2606; GFX1064-NEXT:    s_mov_b32 s2, -1
2607; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2608; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2609; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2610; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2611; GFX1064-NEXT:    buffer_gl0_inv
2612; GFX1064-NEXT:    buffer_gl1_inv
2613; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2614; GFX1064-NEXT:    s_endpgm
2615;
2616; GFX1032-LABEL: sub_i64_varying:
2617; GFX1032:       ; %bb.0: ; %entry
2618; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2619; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2620; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2621; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2622; GFX1032-NEXT:    s_mov_b32 s2, -1
2623; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2624; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2625; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2626; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2627; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2628; GFX1032-NEXT:    buffer_gl0_inv
2629; GFX1032-NEXT:    buffer_gl1_inv
2630; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2631; GFX1032-NEXT:    s_endpgm
2632entry:
2633  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2634  %zext = zext i32 %lane to i64
2635  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2636  store i64 %old, i64 addrspace(1)* %out
2637  ret void
2638}
2639
2640define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2641;
2642;
2643; GFX7LESS-LABEL: and_i32_varying:
2644; GFX7LESS:       ; %bb.0: ; %entry
2645; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2646; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2647; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2648; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2649; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2650; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2651; GFX7LESS-NEXT:    buffer_wbinvl1
2652; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2653; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2654; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2655; GFX7LESS-NEXT:    s_endpgm
2656;
2657; GFX8-LABEL: and_i32_varying:
2658; GFX8:       ; %bb.0: ; %entry
2659; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2660; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2661; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2662; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2663; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2664; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2665; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2666; GFX8-NEXT:    s_not_b64 exec, exec
2667; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2668; GFX8-NEXT:    s_not_b64 exec, exec
2669; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2670; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2671; GFX8-NEXT:    s_nop 1
2672; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2673; GFX8-NEXT:    s_nop 1
2674; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2675; GFX8-NEXT:    s_nop 1
2676; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2677; GFX8-NEXT:    s_nop 1
2678; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2679; GFX8-NEXT:    s_nop 1
2680; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2681; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2682; GFX8-NEXT:    s_nop 0
2683; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2684; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2685; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2686; GFX8-NEXT:    ; implicit-def: $vgpr0
2687; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2688; GFX8-NEXT:    s_cbranch_execz BB14_2
2689; GFX8-NEXT:  ; %bb.1:
2690; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2691; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2692; GFX8-NEXT:    s_mov_b32 m0, -1
2693; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2694; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2695; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2696; GFX8-NEXT:    buffer_wbinvl1_vol
2697; GFX8-NEXT:  BB14_2:
2698; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2699; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2700; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2701; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2702; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2703; GFX8-NEXT:    s_mov_b32 s2, -1
2704; GFX8-NEXT:    s_nop 0
2705; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2706; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2707; GFX8-NEXT:    s_endpgm
2708;
2709; GFX9-LABEL: and_i32_varying:
2710; GFX9:       ; %bb.0: ; %entry
2711; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2712; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2713; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2714; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2715; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2716; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2717; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2718; GFX9-NEXT:    s_not_b64 exec, exec
2719; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2720; GFX9-NEXT:    s_not_b64 exec, exec
2721; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2722; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2723; GFX9-NEXT:    s_nop 1
2724; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2725; GFX9-NEXT:    s_nop 1
2726; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2727; GFX9-NEXT:    s_nop 1
2728; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2729; GFX9-NEXT:    s_nop 1
2730; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2731; GFX9-NEXT:    s_nop 1
2732; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2733; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2734; GFX9-NEXT:    s_nop 0
2735; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2736; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2737; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2738; GFX9-NEXT:    ; implicit-def: $vgpr0
2739; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2740; GFX9-NEXT:    s_cbranch_execz BB14_2
2741; GFX9-NEXT:  ; %bb.1:
2742; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2743; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2744; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2745; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2746; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2747; GFX9-NEXT:    buffer_wbinvl1_vol
2748; GFX9-NEXT:  BB14_2:
2749; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2750; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2751; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2752; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2753; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2754; GFX9-NEXT:    s_mov_b32 s2, -1
2755; GFX9-NEXT:    s_nop 0
2756; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2757; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2758; GFX9-NEXT:    s_endpgm
2759;
2760; GFX1064-LABEL: and_i32_varying:
2761; GFX1064:       ; %bb.0: ; %entry
2762; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2763; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
2764; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2765; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
2766; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2767; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2768; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2769; GFX1064-NEXT:    s_not_b64 exec, exec
2770; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
2771; GFX1064-NEXT:    s_not_b64 exec, exec
2772; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2773; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2774; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2775; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2776; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2777; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2778; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2779; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2780; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2781; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2782; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2783; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2784; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2785; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2786; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2787; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2788; GFX1064-NEXT:    s_mov_b32 s2, -1
2789; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2790; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2791; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2792; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2793; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
2794; GFX1064-NEXT:    ; implicit-def: $vgpr0
2795; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2796; GFX1064-NEXT:    s_cbranch_execz BB14_2
2797; GFX1064-NEXT:  ; %bb.1:
2798; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2799; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2800; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2801; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2802; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v7
2803; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2804; GFX1064-NEXT:    buffer_gl0_inv
2805; GFX1064-NEXT:    buffer_gl1_inv
2806; GFX1064-NEXT:  BB14_2:
2807; GFX1064-NEXT:    v_nop
2808; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2809; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2810; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2811; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2812; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2813; GFX1064-NEXT:    s_nop 1
2814; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2815; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2816; GFX1064-NEXT:    s_endpgm
2817;
2818; GFX1032-LABEL: and_i32_varying:
2819; GFX1032:       ; %bb.0: ; %entry
2820; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2821; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
2822; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2823; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2824; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2825; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2826; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2827; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2828; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
2829; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2830; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2831; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2832; GFX1032-NEXT:    s_mov_b32 s2, -1
2833; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2834; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2835; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2836; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2837; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2838; GFX1032-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2839; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2840; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2841; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2842; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2843; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2844; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
2845; GFX1032-NEXT:    ; implicit-def: $vgpr0
2846; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2847; GFX1032-NEXT:    s_cbranch_execz BB14_2
2848; GFX1032-NEXT:  ; %bb.1:
2849; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2850; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2851; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2852; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2853; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v7
2854; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2855; GFX1032-NEXT:    buffer_gl0_inv
2856; GFX1032-NEXT:    buffer_gl1_inv
2857; GFX1032-NEXT:  BB14_2:
2858; GFX1032-NEXT:    v_nop
2859; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2860; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2861; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2862; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2863; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2864; GFX1032-NEXT:    s_nop 1
2865; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2867; GFX1032-NEXT:    s_endpgm
2868entry:
2869  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2870  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2871  store i32 %old, i32 addrspace(1)* %out
2872  ret void
2873}
2874
2875define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; Applies `atomicrmw or` (acq_rel) to @local_var32 using the workitem id as the
; operand, then stores the value returned by the atomic to %out.
; The operand differs per workitem, so the checks below record how each run
; line lowers the operation:
;   * the GFX7LESS run expects one ds_or_rtn_b32 fed directly from v0, with no
;     cross-lane reduction;
;   * the GFX8 and GFX9 runs expect a v_or_b32_dpp scan (row_shr / row_bcast),
;     one ds_or_rtn_b32 in the block guarded by the v_mbcnt-based compare, and
;     a final v_or_b32 that combines the v_readfirstlane result with v1;
;   * the GFX1064 and GFX1032 runs expect the same shape, but with
;     v_permlanex16_b32 plus v_readlane/v_writelane in place of row_bcast.
; In the scanning variants v2 is set to 0 while exec is inverted (s_not), i.e.
; lanes outside the original exec mask contribute 0 to the OR.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2876;
2877;
2878; GFX7LESS-LABEL: or_i32_varying:
2879; GFX7LESS:       ; %bb.0: ; %entry
2880; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2881; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2882; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2883; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2884; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2885; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2886; GFX7LESS-NEXT:    buffer_wbinvl1
2887; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2888; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2889; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2890; GFX7LESS-NEXT:    s_endpgm
2891;
2892; GFX8-LABEL: or_i32_varying:
2893; GFX8:       ; %bb.0: ; %entry
2894; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2895; GFX8-NEXT:    s_mov_b64 s[2:3], exec
2896; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2897; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2898; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2899; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2900; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2901; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2902; GFX8-NEXT:    s_not_b64 exec, exec
2903; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2904; GFX8-NEXT:    s_not_b64 exec, exec
2905; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2906; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2907; GFX8-NEXT:    s_nop 1
2908; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2909; GFX8-NEXT:    s_nop 1
2910; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2911; GFX8-NEXT:    s_nop 1
2912; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2913; GFX8-NEXT:    s_nop 1
2914; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2915; GFX8-NEXT:    s_nop 1
2916; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2917; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2918; GFX8-NEXT:    s_nop 0
2919; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2920; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2921; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2922; GFX8-NEXT:    ; implicit-def: $vgpr0
2923; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2924; GFX8-NEXT:    s_cbranch_execz BB15_2
2925; GFX8-NEXT:  ; %bb.1:
2926; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2927; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2928; GFX8-NEXT:    s_mov_b32 m0, -1
2929; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2930; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2931; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2932; GFX8-NEXT:    buffer_wbinvl1_vol
2933; GFX8-NEXT:  BB15_2:
2934; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2935; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2936; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2937; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2938; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2939; GFX8-NEXT:    s_mov_b32 s2, -1
2940; GFX8-NEXT:    s_nop 0
2941; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2943; GFX8-NEXT:    s_endpgm
2944;
2945; GFX9-LABEL: or_i32_varying:
2946; GFX9:       ; %bb.0: ; %entry
2947; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2948; GFX9-NEXT:    s_mov_b64 s[2:3], exec
2949; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2950; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2951; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2952; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2953; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2954; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2955; GFX9-NEXT:    s_not_b64 exec, exec
2956; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2957; GFX9-NEXT:    s_not_b64 exec, exec
2958; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2959; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2960; GFX9-NEXT:    s_nop 1
2961; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2962; GFX9-NEXT:    s_nop 1
2963; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2964; GFX9-NEXT:    s_nop 1
2965; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2966; GFX9-NEXT:    s_nop 1
2967; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2968; GFX9-NEXT:    s_nop 1
2969; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2970; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2971; GFX9-NEXT:    s_nop 0
2972; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2973; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2974; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2975; GFX9-NEXT:    ; implicit-def: $vgpr0
2976; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2977; GFX9-NEXT:    s_cbranch_execz BB15_2
2978; GFX9-NEXT:  ; %bb.1:
2979; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2980; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2981; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2982; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2983; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2984; GFX9-NEXT:    buffer_wbinvl1_vol
2985; GFX9-NEXT:  BB15_2:
2986; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2987; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2988; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2989; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2990; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2991; GFX9-NEXT:    s_mov_b32 s2, -1
2992; GFX9-NEXT:    s_nop 0
2993; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2994; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2995; GFX9-NEXT:    s_endpgm
2996;
2997; GFX1064-LABEL: or_i32_varying:
2998; GFX1064:       ; %bb.0: ; %entry
2999; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3000; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
3001; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3002; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3003; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3004; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3005; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3006; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3007; GFX1064-NEXT:    s_not_b64 exec, exec
3008; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3009; GFX1064-NEXT:    s_not_b64 exec, exec
3010; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3011; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3012; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3013; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3014; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3015; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3016; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3017; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3018; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3019; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3020; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3021; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3022; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3023; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3024; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3025; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3026; GFX1064-NEXT:    s_mov_b32 s2, -1
3027; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3028; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3029; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3030; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3031; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3032; GFX1064-NEXT:    ; implicit-def: $vgpr0
3033; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3034; GFX1064-NEXT:    s_cbranch_execz BB15_2
3035; GFX1064-NEXT:  ; %bb.1:
3036; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3037; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3038; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3039; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3040; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v7
3041; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3042; GFX1064-NEXT:    buffer_gl0_inv
3043; GFX1064-NEXT:    buffer_gl1_inv
3044; GFX1064-NEXT:  BB15_2:
3045; GFX1064-NEXT:    v_nop
3046; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3047; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3048; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3049; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3050; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3051; GFX1064-NEXT:    s_nop 1
3052; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3053; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3054; GFX1064-NEXT:    s_endpgm
3055;
3056; GFX1032-LABEL: or_i32_varying:
3057; GFX1032:       ; %bb.0: ; %entry
3058; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3059; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
3060; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3061; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3062; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
3063; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3064; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
3065; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3066; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3067; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3068; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3069; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3070; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3071; GFX1032-NEXT:    s_mov_b32 s2, -1
3072; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3073; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3074; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3075; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3076; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3077; GFX1032-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3078; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3079; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3080; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3081; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3082; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3083; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3084; GFX1032-NEXT:    ; implicit-def: $vgpr0
3085; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3086; GFX1032-NEXT:    s_cbranch_execz BB15_2
3087; GFX1032-NEXT:  ; %bb.1:
3088; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3089; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3090; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3091; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3092; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v7
3093; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3094; GFX1032-NEXT:    buffer_gl0_inv
3095; GFX1032-NEXT:    buffer_gl1_inv
3096; GFX1032-NEXT:  BB15_2:
3097; GFX1032-NEXT:    v_nop
3098; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3099; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3100; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3101; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3102; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3103; GFX1032-NEXT:    s_nop 1
3104; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3105; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3106; GFX1032-NEXT:    s_endpgm
3107entry:
  ; The workitem id is the per-workitem operand of the atomic.
3108  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; atomicrmw yields the value that was in @local_var32 before this OR.
3109  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing the result keeps the atomic's return value live in the output.
3110  store i32 %old, i32 addrspace(1)* %out
3111  ret void
3112}
3113
3114define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; Applies `atomicrmw xor` (acq_rel) to @local_var32 using the workitem id as
; the operand, then stores the value returned by the atomic to %out.
; The operand differs per workitem, so the checks below record how each run
; line lowers the operation:
;   * the GFX7LESS run expects one ds_xor_rtn_b32 fed directly from v0, with
;     no cross-lane reduction;
;   * the GFX8 and GFX9 runs expect a v_xor_b32_dpp scan (row_shr / row_bcast),
;     one ds_xor_rtn_b32 in the block guarded by the v_mbcnt-based compare, and
;     a final v_xor_b32 that combines the v_readfirstlane result with v1;
;   * the GFX1064 and GFX1032 runs expect the same shape, but with
;     v_permlanex16_b32 plus v_readlane/v_writelane in place of row_bcast.
; In the scanning variants v2 is set to 0 while exec is inverted (s_not), i.e.
; lanes outside the original exec mask contribute 0 to the XOR.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3115;
3116;
3117; GFX7LESS-LABEL: xor_i32_varying:
3118; GFX7LESS:       ; %bb.0: ; %entry
3119; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3120; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3121; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3122; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3123; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3124; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3125; GFX7LESS-NEXT:    buffer_wbinvl1
3126; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3127; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3128; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3129; GFX7LESS-NEXT:    s_endpgm
3130;
3131; GFX8-LABEL: xor_i32_varying:
3132; GFX8:       ; %bb.0: ; %entry
3133; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3134; GFX8-NEXT:    s_mov_b64 s[2:3], exec
3135; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3136; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3137; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3138; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3139; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3140; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3141; GFX8-NEXT:    s_not_b64 exec, exec
3142; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3143; GFX8-NEXT:    s_not_b64 exec, exec
3144; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3145; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3146; GFX8-NEXT:    s_nop 1
3147; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3148; GFX8-NEXT:    s_nop 1
3149; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3150; GFX8-NEXT:    s_nop 1
3151; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3152; GFX8-NEXT:    s_nop 1
3153; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3154; GFX8-NEXT:    s_nop 1
3155; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3156; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3157; GFX8-NEXT:    s_nop 0
3158; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3159; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3160; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3161; GFX8-NEXT:    ; implicit-def: $vgpr0
3162; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3163; GFX8-NEXT:    s_cbranch_execz BB16_2
3164; GFX8-NEXT:  ; %bb.1:
3165; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3166; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3167; GFX8-NEXT:    s_mov_b32 m0, -1
3168; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3169; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3170; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3171; GFX8-NEXT:    buffer_wbinvl1_vol
3172; GFX8-NEXT:  BB16_2:
3173; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3174; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3175; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3176; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3177; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3178; GFX8-NEXT:    s_mov_b32 s2, -1
3179; GFX8-NEXT:    s_nop 0
3180; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3181; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3182; GFX8-NEXT:    s_endpgm
3183;
3184; GFX9-LABEL: xor_i32_varying:
3185; GFX9:       ; %bb.0: ; %entry
3186; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3187; GFX9-NEXT:    s_mov_b64 s[2:3], exec
3188; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3189; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3190; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3191; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3192; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3193; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3194; GFX9-NEXT:    s_not_b64 exec, exec
3195; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3196; GFX9-NEXT:    s_not_b64 exec, exec
3197; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3198; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3199; GFX9-NEXT:    s_nop 1
3200; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3201; GFX9-NEXT:    s_nop 1
3202; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3203; GFX9-NEXT:    s_nop 1
3204; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3205; GFX9-NEXT:    s_nop 1
3206; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3207; GFX9-NEXT:    s_nop 1
3208; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3209; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3210; GFX9-NEXT:    s_nop 0
3211; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3212; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3213; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3214; GFX9-NEXT:    ; implicit-def: $vgpr0
3215; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3216; GFX9-NEXT:    s_cbranch_execz BB16_2
3217; GFX9-NEXT:  ; %bb.1:
3218; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3219; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3220; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3221; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3222; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3223; GFX9-NEXT:    buffer_wbinvl1_vol
3224; GFX9-NEXT:  BB16_2:
3225; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3226; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3227; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3228; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3229; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3230; GFX9-NEXT:    s_mov_b32 s2, -1
3231; GFX9-NEXT:    s_nop 0
3232; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3233; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3234; GFX9-NEXT:    s_endpgm
3235;
3236; GFX1064-LABEL: xor_i32_varying:
3237; GFX1064:       ; %bb.0: ; %entry
3238; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3239; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
3240; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3241; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3242; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3243; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3244; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3245; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3246; GFX1064-NEXT:    s_not_b64 exec, exec
3247; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3248; GFX1064-NEXT:    s_not_b64 exec, exec
3249; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3250; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3251; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3252; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3253; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3254; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3255; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3256; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3257; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3258; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3259; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3260; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3261; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3262; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3263; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3264; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3265; GFX1064-NEXT:    s_mov_b32 s2, -1
3266; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3267; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3268; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3269; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3270; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3271; GFX1064-NEXT:    ; implicit-def: $vgpr0
3272; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3273; GFX1064-NEXT:    s_cbranch_execz BB16_2
3274; GFX1064-NEXT:  ; %bb.1:
3275; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3276; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3277; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3278; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3279; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3280; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3281; GFX1064-NEXT:    buffer_gl0_inv
3282; GFX1064-NEXT:    buffer_gl1_inv
3283; GFX1064-NEXT:  BB16_2:
3284; GFX1064-NEXT:    v_nop
3285; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3286; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3287; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3288; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3289; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3290; GFX1064-NEXT:    s_nop 1
3291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3292; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3293; GFX1064-NEXT:    s_endpgm
3294;
3295; GFX1032-LABEL: xor_i32_varying:
3296; GFX1032:       ; %bb.0: ; %entry
3297; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3298; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
3299; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3300; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3301; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
3302; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3303; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
3304; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3305; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3306; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3307; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3308; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3309; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3310; GFX1032-NEXT:    s_mov_b32 s2, -1
3311; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3312; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3313; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3314; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3315; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3316; GFX1032-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3317; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3318; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3319; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3320; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3321; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3322; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3323; GFX1032-NEXT:    ; implicit-def: $vgpr0
3324; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3325; GFX1032-NEXT:    s_cbranch_execz BB16_2
3326; GFX1032-NEXT:  ; %bb.1:
3327; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3328; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3329; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3330; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3331; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3332; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3333; GFX1032-NEXT:    buffer_gl0_inv
3334; GFX1032-NEXT:    buffer_gl1_inv
3335; GFX1032-NEXT:  BB16_2:
3336; GFX1032-NEXT:    v_nop
3337; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3338; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3339; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3340; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3341; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3342; GFX1032-NEXT:    s_nop 1
3343; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3344; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3345; GFX1032-NEXT:    s_endpgm
3346entry:
  ; The workitem id is the per-workitem operand of the atomic.
3347  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; atomicrmw yields the value that was in @local_var32 before this XOR.
3348  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing the result keeps the atomic's return value live in the output.
3349  store i32 %old, i32 addrspace(1)* %out
3350  ret void
3351}
3352
3353define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3354;
3355;
3356; GFX7LESS-LABEL: max_i32_varying:
3357; GFX7LESS:       ; %bb.0: ; %entry
3358; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3359; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3360; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3361; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3362; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3363; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3364; GFX7LESS-NEXT:    buffer_wbinvl1
3365; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3366; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3367; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3368; GFX7LESS-NEXT:    s_endpgm
3369;
3370; GFX8-LABEL: max_i32_varying:
3371; GFX8:       ; %bb.0: ; %entry
3372; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3373; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3374; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3375; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3376; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3377; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3378; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3379; GFX8-NEXT:    s_not_b64 exec, exec
3380; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3381; GFX8-NEXT:    s_not_b64 exec, exec
3382; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3383; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3384; GFX8-NEXT:    s_nop 1
3385; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3386; GFX8-NEXT:    s_nop 1
3387; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3388; GFX8-NEXT:    s_nop 1
3389; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3390; GFX8-NEXT:    s_nop 1
3391; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3392; GFX8-NEXT:    s_nop 1
3393; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3394; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3395; GFX8-NEXT:    s_nop 0
3396; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3397; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3398; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3399; GFX8-NEXT:    ; implicit-def: $vgpr0
3400; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3401; GFX8-NEXT:    s_cbranch_execz BB17_2
3402; GFX8-NEXT:  ; %bb.1:
3403; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3404; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3405; GFX8-NEXT:    s_mov_b32 m0, -1
3406; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3407; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3408; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3409; GFX8-NEXT:    buffer_wbinvl1_vol
3410; GFX8-NEXT:  BB17_2:
3411; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3412; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3413; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3414; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3415; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3416; GFX8-NEXT:    s_mov_b32 s2, -1
3417; GFX8-NEXT:    s_nop 0
3418; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3419; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3420; GFX8-NEXT:    s_endpgm
3421;
3422; GFX9-LABEL: max_i32_varying:
3423; GFX9:       ; %bb.0: ; %entry
3424; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3425; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3426; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3427; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3428; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3429; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3430; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3431; GFX9-NEXT:    s_not_b64 exec, exec
3432; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3433; GFX9-NEXT:    s_not_b64 exec, exec
3434; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3435; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3436; GFX9-NEXT:    s_nop 1
3437; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3438; GFX9-NEXT:    s_nop 1
3439; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3440; GFX9-NEXT:    s_nop 1
3441; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3442; GFX9-NEXT:    s_nop 1
3443; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3444; GFX9-NEXT:    s_nop 1
3445; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3446; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3447; GFX9-NEXT:    s_nop 0
3448; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3449; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3450; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3451; GFX9-NEXT:    ; implicit-def: $vgpr0
3452; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3453; GFX9-NEXT:    s_cbranch_execz BB17_2
3454; GFX9-NEXT:  ; %bb.1:
3455; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3456; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3457; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3458; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3459; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3460; GFX9-NEXT:    buffer_wbinvl1_vol
3461; GFX9-NEXT:  BB17_2:
3462; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3463; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3464; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3465; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3466; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3467; GFX9-NEXT:    s_mov_b32 s2, -1
3468; GFX9-NEXT:    s_nop 0
3469; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3470; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3471; GFX9-NEXT:    s_endpgm
3472;
3473; GFX1064-LABEL: max_i32_varying:
3474; GFX1064:       ; %bb.0: ; %entry
3475; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3476; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3477; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3478; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
3479; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3480; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3481; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3482; GFX1064-NEXT:    s_not_b64 exec, exec
3483; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3484; GFX1064-NEXT:    s_not_b64 exec, exec
3485; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3486; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3487; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3488; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3489; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3490; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3491; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3492; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3493; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3494; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3495; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3496; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3497; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3498; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3499; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3500; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3501; GFX1064-NEXT:    s_mov_b32 s2, -1
3502; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3503; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3504; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3505; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3506; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3507; GFX1064-NEXT:    ; implicit-def: $vgpr0
3508; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3509; GFX1064-NEXT:    s_cbranch_execz BB17_2
3510; GFX1064-NEXT:  ; %bb.1:
3511; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3512; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3513; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3514; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3515; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v7
3516; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3517; GFX1064-NEXT:    buffer_gl0_inv
3518; GFX1064-NEXT:    buffer_gl1_inv
3519; GFX1064-NEXT:  BB17_2:
3520; GFX1064-NEXT:    v_nop
3521; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3522; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3523; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3524; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3525; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3526; GFX1064-NEXT:    s_nop 1
3527; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3529; GFX1064-NEXT:    s_endpgm
3530;
3531; GFX1032-LABEL: max_i32_varying:
3532; GFX1032:       ; %bb.0: ; %entry
3533; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3534; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3535; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3536; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3537; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3538; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3539; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3540; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3541; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3542; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3543; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3544; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3545; GFX1032-NEXT:    s_mov_b32 s2, -1
3546; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3547; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3548; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3549; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3550; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3551; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3552; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3553; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3554; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3555; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3556; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3557; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3558; GFX1032-NEXT:    ; implicit-def: $vgpr0
3559; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3560; GFX1032-NEXT:    s_cbranch_execz BB17_2
3561; GFX1032-NEXT:  ; %bb.1:
3562; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3563; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3564; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3565; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3566; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v7
3567; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3568; GFX1032-NEXT:    buffer_gl0_inv
3569; GFX1032-NEXT:    buffer_gl1_inv
3570; GFX1032-NEXT:  BB17_2:
3571; GFX1032-NEXT:    v_nop
3572; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3573; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3574; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3575; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3576; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3577; GFX1032-NEXT:    s_nop 1
3578; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3579; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3580; GFX1032-NEXT:    s_endpgm
3581entry:
3582  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3583  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3584  store i32 %old, i32 addrspace(1)* %out
3585  ret void
3586}
3587
; max_i64_constant: signed 64-bit atomicrmw max of the constant 5 on the local
; (addrspace(3)) global @local_var64 with acq_rel ordering; the value returned
; by the atomic is stored to %out.
; Every check prefix below expects a single ds_max_rtn_i64 guarded by
; s_and_saveexec on "v_mbcnt result == 0", followed by v_readfirstlane of both
; result halves and a v_cmp_gt_i64 / v_cndmask select against a per-lane value
; that is chosen between 5 and 0x8000000000000000.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
3588define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3589;
3590;
3591; GFX7LESS-LABEL: max_i64_constant:
3592; GFX7LESS:       ; %bb.0: ; %entry
3593; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3594; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3595; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3596; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3597; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3598; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3599; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3600; GFX7LESS-NEXT:  ; %bb.1:
3601; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3602; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3603; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3604; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3605; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3606; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3607; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3608; GFX7LESS-NEXT:    buffer_wbinvl1
3609; GFX7LESS-NEXT:  BB18_2:
3610; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3611; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3612; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3613; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3614; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3615; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3616; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3617; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3618; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3619; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3620; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3621; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3622; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3623; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3624; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3625; GFX7LESS-NEXT:    s_endpgm
3626;
3627; GFX8-LABEL: max_i64_constant:
3628; GFX8:       ; %bb.0: ; %entry
3629; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3630; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3631; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3632; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3633; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3634; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3635; GFX8-NEXT:    s_cbranch_execz BB18_2
3636; GFX8-NEXT:  ; %bb.1:
3637; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3638; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3639; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3640; GFX8-NEXT:    s_mov_b32 m0, -1
3641; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3642; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3643; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3644; GFX8-NEXT:    buffer_wbinvl1_vol
3645; GFX8-NEXT:  BB18_2:
3646; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3647; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3648; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3649; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3650; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3651; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3652; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3653; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3654; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3655; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3656; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3657; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3658; GFX8-NEXT:    s_mov_b32 s2, -1
3659; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3660; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3661; GFX8-NEXT:    s_endpgm
3662;
3663; GFX9-LABEL: max_i64_constant:
3664; GFX9:       ; %bb.0: ; %entry
3665; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3666; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3667; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3668; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3669; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3670; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3671; GFX9-NEXT:    s_cbranch_execz BB18_2
3672; GFX9-NEXT:  ; %bb.1:
3673; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3674; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3675; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3676; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3677; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3678; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3679; GFX9-NEXT:    buffer_wbinvl1_vol
3680; GFX9-NEXT:  BB18_2:
3681; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3682; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3683; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3684; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3685; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3686; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3687; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3688; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3689; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3690; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3691; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3692; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3693; GFX9-NEXT:    s_mov_b32 s2, -1
3694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3695; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3696; GFX9-NEXT:    s_endpgm
3697;
3698; GFX1064-LABEL: max_i64_constant:
3699; GFX1064:       ; %bb.0: ; %entry
3700; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3701; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3702; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3703; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3704; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3705; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3706; GFX1064-NEXT:    s_cbranch_execz BB18_2
3707; GFX1064-NEXT:  ; %bb.1:
3708; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3709; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3710; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3711; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3712; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3713; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3714; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3715; GFX1064-NEXT:    buffer_gl0_inv
3716; GFX1064-NEXT:    buffer_gl1_inv
3717; GFX1064-NEXT:  BB18_2:
3718; GFX1064-NEXT:    v_nop
3719; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3720; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
3721; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
3722; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3723; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3724; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3725; GFX1064-NEXT:    s_mov_b32 s2, -1
3726; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3727; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
3728; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
3729; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3730; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3731; GFX1064-NEXT:    s_endpgm
3732;
3733; GFX1032-LABEL: max_i64_constant:
3734; GFX1032:       ; %bb.0: ; %entry
3735; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3736; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3737; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3738; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3739; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3740; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3741; GFX1032-NEXT:    s_cbranch_execz BB18_2
3742; GFX1032-NEXT:  ; %bb.1:
3743; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3744; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3745; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3746; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3747; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3748; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3749; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3750; GFX1032-NEXT:    buffer_gl0_inv
3751; GFX1032-NEXT:    buffer_gl1_inv
3752; GFX1032-NEXT:  BB18_2:
3753; GFX1032-NEXT:    v_nop
3754; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3755; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
3756; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
3757; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3758; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3759; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3760; GFX1032-NEXT:    s_mov_b32 s2, -1
3761; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1]
3762; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
3763; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
3764; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3765; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3766; GFX1032-NEXT:    s_endpgm
3767entry:
  ; The same constant operand (5) is used by every work-item; %old is the value
  ; the atomicrmw returns, which is then written to the global output pointer.
3768  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3769  store i64 %old, i64 addrspace(1)* %out
3770  ret void
3771}
3772
; min_i32_varying: signed 32-bit atomicrmw min on the local (addrspace(3))
; global @local_var32 with acq_rel ordering, where each work-item contributes
; its own llvm.amdgcn.workitem.id.x value; the value returned by the atomic is
; stored to %out.
; What the check lines expect, per prefix
;  * GFX7LESS expects a plain ds_min_rtn_i32 on the incoming v0 with no
;    cross-lane reduction.
;  * GFX8 and GFX9 expect a v_min_i32_dpp sequence (row_shr, row_bcast, then a
;    wave_shr move) over a copy of the input in which the lanes enabled by
;    "s_not_b64 exec, exec" are overwritten with the v_bfrev_b32 -2 value, then
;    one ds_min_rtn_i32 guarded by s_and_saveexec, then v_min_i32 of the
;    v_readfirstlane result with v1.
;  * GFX1064 and GFX1032 expect the same overall shape, but with
;    v_permlanex16_b32 plus v_readlane/v_writelane in place of the row_bcast
;    and wave_shr DPP controls.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
3773define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3774;
3775;
3776; GFX7LESS-LABEL: min_i32_varying:
3777; GFX7LESS:       ; %bb.0: ; %entry
3778; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3779; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3780; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3781; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3782; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3783; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3784; GFX7LESS-NEXT:    buffer_wbinvl1
3785; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3786; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3787; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3788; GFX7LESS-NEXT:    s_endpgm
3789;
3790; GFX8-LABEL: min_i32_varying:
3791; GFX8:       ; %bb.0: ; %entry
3792; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3793; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3794; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3795; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3796; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3797; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3798; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3799; GFX8-NEXT:    s_not_b64 exec, exec
3800; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3801; GFX8-NEXT:    s_not_b64 exec, exec
3802; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3803; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3804; GFX8-NEXT:    s_nop 1
3805; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3806; GFX8-NEXT:    s_nop 1
3807; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3808; GFX8-NEXT:    s_nop 1
3809; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3810; GFX8-NEXT:    s_nop 1
3811; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3812; GFX8-NEXT:    s_nop 1
3813; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3814; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3815; GFX8-NEXT:    s_nop 0
3816; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3817; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3818; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3819; GFX8-NEXT:    ; implicit-def: $vgpr0
3820; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3821; GFX8-NEXT:    s_cbranch_execz BB19_2
3822; GFX8-NEXT:  ; %bb.1:
3823; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3824; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3825; GFX8-NEXT:    s_mov_b32 m0, -1
3826; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3827; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3828; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3829; GFX8-NEXT:    buffer_wbinvl1_vol
3830; GFX8-NEXT:  BB19_2:
3831; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3832; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3833; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3834; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3835; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3836; GFX8-NEXT:    s_mov_b32 s2, -1
3837; GFX8-NEXT:    s_nop 0
3838; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3839; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3840; GFX8-NEXT:    s_endpgm
3841;
3842; GFX9-LABEL: min_i32_varying:
3843; GFX9:       ; %bb.0: ; %entry
3844; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3845; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3846; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3847; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3848; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3849; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3850; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3851; GFX9-NEXT:    s_not_b64 exec, exec
3852; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3853; GFX9-NEXT:    s_not_b64 exec, exec
3854; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3855; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3856; GFX9-NEXT:    s_nop 1
3857; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3858; GFX9-NEXT:    s_nop 1
3859; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3860; GFX9-NEXT:    s_nop 1
3861; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3862; GFX9-NEXT:    s_nop 1
3863; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3864; GFX9-NEXT:    s_nop 1
3865; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3866; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3867; GFX9-NEXT:    s_nop 0
3868; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3869; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3870; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3871; GFX9-NEXT:    ; implicit-def: $vgpr0
3872; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3873; GFX9-NEXT:    s_cbranch_execz BB19_2
3874; GFX9-NEXT:  ; %bb.1:
3875; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3876; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3877; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3878; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3879; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3880; GFX9-NEXT:    buffer_wbinvl1_vol
3881; GFX9-NEXT:  BB19_2:
3882; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3883; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3884; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3885; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3886; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3887; GFX9-NEXT:    s_mov_b32 s2, -1
3888; GFX9-NEXT:    s_nop 0
3889; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3890; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3891; GFX9-NEXT:    s_endpgm
3892;
3893; GFX1064-LABEL: min_i32_varying:
3894; GFX1064:       ; %bb.0: ; %entry
3895; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3896; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3897; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3898; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
3899; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3900; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3901; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3902; GFX1064-NEXT:    s_not_b64 exec, exec
3903; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3904; GFX1064-NEXT:    s_not_b64 exec, exec
3905; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3906; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3907; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3908; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3909; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3910; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3911; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3912; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3913; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3914; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3915; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3916; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3917; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3918; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3919; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3920; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3921; GFX1064-NEXT:    s_mov_b32 s2, -1
3922; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3923; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3924; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3925; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3926; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3927; GFX1064-NEXT:    ; implicit-def: $vgpr0
3928; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3929; GFX1064-NEXT:    s_cbranch_execz BB19_2
3930; GFX1064-NEXT:  ; %bb.1:
3931; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3932; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3933; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3934; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3935; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v7
3936; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3937; GFX1064-NEXT:    buffer_gl0_inv
3938; GFX1064-NEXT:    buffer_gl1_inv
3939; GFX1064-NEXT:  BB19_2:
3940; GFX1064-NEXT:    v_nop
3941; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3942; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3943; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3944; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3945; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3946; GFX1064-NEXT:    s_nop 1
3947; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3948; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3949; GFX1064-NEXT:    s_endpgm
3950;
3951; GFX1032-LABEL: min_i32_varying:
3952; GFX1032:       ; %bb.0: ; %entry
3953; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3954; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
3955; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3956; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3957; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3958; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3959; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3960; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3961; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3962; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3963; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3964; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3965; GFX1032-NEXT:    s_mov_b32 s2, -1
3966; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3967; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3968; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3969; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3970; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3971; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3972; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3973; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3974; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3975; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3976; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3977; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3978; GFX1032-NEXT:    ; implicit-def: $vgpr0
3979; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3980; GFX1032-NEXT:    s_cbranch_execz BB19_2
3981; GFX1032-NEXT:  ; %bb.1:
3982; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3983; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3984; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3985; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3986; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v7
3987; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3988; GFX1032-NEXT:    buffer_gl0_inv
3989; GFX1032-NEXT:    buffer_gl1_inv
3990; GFX1032-NEXT:  BB19_2:
3991; GFX1032-NEXT:    v_nop
3992; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3993; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3994; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3995; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3996; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3997; GFX1032-NEXT:    s_nop 1
3998; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3999; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4000; GFX1032-NEXT:    s_endpgm
4001entry:
  ; %lane differs per work-item, so the atomic operand is not uniform; %old is
  ; the value the atomicrmw returns, which is then written to %out.
4002  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4003  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4004  store i32 %old, i32 addrspace(1)* %out
4005  ret void
4006}
4007
; min_i64_constant: signed 64-bit atomicrmw min of the constant 5 on the local
; (addrspace(3)) global @local_var64 with acq_rel ordering; the value returned
; by the atomic is stored to %out.
; Every check prefix below expects a single ds_min_rtn_i64 guarded by
; s_and_saveexec on "v_mbcnt result == 0", followed by v_readfirstlane of both
; result halves and a v_cmp_lt_i64 / v_cndmask select against a per-lane value
; that is chosen between 5 and 0x7fffffffffffffff.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4008define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
4009;
4010;
4011; GFX7LESS-LABEL: min_i64_constant:
4012; GFX7LESS:       ; %bb.0: ; %entry
4013; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4014; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4015; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4016; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4017; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4018; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4019; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
4020; GFX7LESS-NEXT:  ; %bb.1:
4021; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4022; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4023; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4024; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4025; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4026; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4027; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4028; GFX7LESS-NEXT:    buffer_wbinvl1
4029; GFX7LESS-NEXT:  BB20_2:
4030; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4031; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4032; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4033; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
4034; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4035; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4036; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4037; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4038; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4039; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4040; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4041; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4042; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4043; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4044; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4045; GFX7LESS-NEXT:    s_endpgm
4046;
4047; GFX8-LABEL: min_i64_constant:
4048; GFX8:       ; %bb.0: ; %entry
4049; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4050; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4051; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4052; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4053; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4054; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4055; GFX8-NEXT:    s_cbranch_execz BB20_2
4056; GFX8-NEXT:  ; %bb.1:
4057; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4058; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4059; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4060; GFX8-NEXT:    s_mov_b32 m0, -1
4061; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4062; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4063; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4064; GFX8-NEXT:    buffer_wbinvl1_vol
4065; GFX8-NEXT:  BB20_2:
4066; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4067; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4068; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
4069; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4070; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4071; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4072; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4073; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4074; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4075; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4076; GFX8-NEXT:    s_mov_b32 s2, -1
4077; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4078; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4079; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4080; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4081; GFX8-NEXT:    s_endpgm
4082;
4083; GFX9-LABEL: min_i64_constant:
4084; GFX9:       ; %bb.0: ; %entry
4085; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4086; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4087; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4088; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4089; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4090; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4091; GFX9-NEXT:    s_cbranch_execz BB20_2
4092; GFX9-NEXT:  ; %bb.1:
4093; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4094; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4095; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4096; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4097; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4098; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4099; GFX9-NEXT:    buffer_wbinvl1_vol
4100; GFX9-NEXT:  BB20_2:
4101; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4102; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4103; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4104; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4105; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4106; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4107; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4108; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4109; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4110; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4111; GFX9-NEXT:    s_mov_b32 s2, -1
4112; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4113; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4114; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4115; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4116; GFX9-NEXT:    s_endpgm
4117;
4118; GFX1064-LABEL: min_i64_constant:
4119; GFX1064:       ; %bb.0: ; %entry
4120; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4121; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4122; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4123; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4124; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4125; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4126; GFX1064-NEXT:    s_cbranch_execz BB20_2
4127; GFX1064-NEXT:  ; %bb.1:
4128; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4129; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4130; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4131; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4132; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4133; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4134; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4135; GFX1064-NEXT:    buffer_gl0_inv
4136; GFX1064-NEXT:    buffer_gl1_inv
4137; GFX1064-NEXT:  BB20_2:
4138; GFX1064-NEXT:    v_nop
4139; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4140; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
4141; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
4142; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4143; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4144; GFX1064-NEXT:    s_mov_b32 s2, -1
4145; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4146; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4147; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
4148; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
4149; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4150; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4151; GFX1064-NEXT:    s_endpgm
4152;
4153; GFX1032-LABEL: min_i64_constant:
4154; GFX1032:       ; %bb.0: ; %entry
4155; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4156; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4157; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4158; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4159; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4160; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4161; GFX1032-NEXT:    s_cbranch_execz BB20_2
4162; GFX1032-NEXT:  ; %bb.1:
4163; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4164; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4165; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4166; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4167; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4168; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4169; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4170; GFX1032-NEXT:    buffer_gl0_inv
4171; GFX1032-NEXT:    buffer_gl1_inv
4172; GFX1032-NEXT:  BB20_2:
4173; GFX1032-NEXT:    v_nop
4174; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4175; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
4176; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
4177; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4178; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4179; GFX1032-NEXT:    s_mov_b32 s2, -1
4180; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4181; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
4182; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
4183; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
4184; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4185; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4186; GFX1032-NEXT:    s_endpgm
4187entry:
  ; The same constant operand (5) is used by every work-item; %old is the value
  ; the atomicrmw returns, which is then written to the global output pointer.
4188  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4189  store i64 %old, i64 addrspace(1)* %out
4190  ret void
4191}
4192
4193define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; Test case: atomicrmw umax (acq_rel) on the i32 addrspace(3) global
; @local_var32, using the result of llvm.amdgcn.workitem.id.x as the operand
; (hence "varying"), then storing the returned old value to %out.
;
; What the expectations below show, per check prefix:
;  - GFX7LESS expects a plain ds_max_rtn_u32 on v0 with no cross-lane reduction.
;  - GFX8, GFX9, GFX1064 and GFX1032 expect a v_max_u32_dpp reduction, a single
;    ds_max_rtn_u32 inside the block guarded by s_and_saveexec, and a final
;    v_readfirstlane_b32 + v_max_u32_e32 to form the per-lane result.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py after codegen
; changes instead of editing them by hand.
4194;
4195;
4196; GFX7LESS-LABEL: umax_i32_varying:
4197; GFX7LESS:       ; %bb.0: ; %entry
4198; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4199; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4200; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4201; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4202; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4203; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4204; GFX7LESS-NEXT:    buffer_wbinvl1
4205; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4206; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4207; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4208; GFX7LESS-NEXT:    s_endpgm
4209;
4210; GFX8-LABEL: umax_i32_varying:
4211; GFX8:       ; %bb.0: ; %entry
4212; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4213; GFX8-NEXT:    s_mov_b64 s[2:3], exec
4214; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4215; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4216; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4217; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4218; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4219; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4220; GFX8-NEXT:    s_not_b64 exec, exec
4221; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4222; GFX8-NEXT:    s_not_b64 exec, exec
4223; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4224; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4225; GFX8-NEXT:    s_nop 1
4226; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4227; GFX8-NEXT:    s_nop 1
4228; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4229; GFX8-NEXT:    s_nop 1
4230; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4231; GFX8-NEXT:    s_nop 1
4232; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4233; GFX8-NEXT:    s_nop 1
4234; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4235; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4236; GFX8-NEXT:    s_nop 0
4237; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4238; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4239; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4240; GFX8-NEXT:    ; implicit-def: $vgpr0
4241; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4242; GFX8-NEXT:    s_cbranch_execz BB21_2
4243; GFX8-NEXT:  ; %bb.1:
4244; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4245; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4246; GFX8-NEXT:    s_mov_b32 m0, -1
4247; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4248; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4249; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4250; GFX8-NEXT:    buffer_wbinvl1_vol
4251; GFX8-NEXT:  BB21_2:
4252; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4253; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4254; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4255; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4256; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4257; GFX8-NEXT:    s_mov_b32 s2, -1
4258; GFX8-NEXT:    s_nop 0
4259; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4260; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4261; GFX8-NEXT:    s_endpgm
4262;
4263; GFX9-LABEL: umax_i32_varying:
4264; GFX9:       ; %bb.0: ; %entry
4265; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4266; GFX9-NEXT:    s_mov_b64 s[2:3], exec
4267; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4268; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4269; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4270; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4271; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4272; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4273; GFX9-NEXT:    s_not_b64 exec, exec
4274; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4275; GFX9-NEXT:    s_not_b64 exec, exec
4276; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4277; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4278; GFX9-NEXT:    s_nop 1
4279; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4280; GFX9-NEXT:    s_nop 1
4281; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4282; GFX9-NEXT:    s_nop 1
4283; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4284; GFX9-NEXT:    s_nop 1
4285; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4286; GFX9-NEXT:    s_nop 1
4287; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4288; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4289; GFX9-NEXT:    s_nop 0
4290; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4291; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4292; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4293; GFX9-NEXT:    ; implicit-def: $vgpr0
4294; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4295; GFX9-NEXT:    s_cbranch_execz BB21_2
4296; GFX9-NEXT:  ; %bb.1:
4297; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4298; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4299; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4300; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4301; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4302; GFX9-NEXT:    buffer_wbinvl1_vol
4303; GFX9-NEXT:  BB21_2:
4304; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4305; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4306; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4307; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4308; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4309; GFX9-NEXT:    s_mov_b32 s2, -1
4310; GFX9-NEXT:    s_nop 0
4311; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4312; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4313; GFX9-NEXT:    s_endpgm
4314;
4315; GFX1064-LABEL: umax_i32_varying:
4316; GFX1064:       ; %bb.0: ; %entry
4317; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4318; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
4319; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4320; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4321; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4322; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4323; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4324; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
4325; GFX1064-NEXT:    s_not_b64 exec, exec
4326; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4327; GFX1064-NEXT:    s_not_b64 exec, exec
4328; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4329; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4330; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4331; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4332; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4333; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4334; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4335; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4336; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4337; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4338; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4339; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4340; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4341; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4342; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4343; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4344; GFX1064-NEXT:    s_mov_b32 s2, -1
4345; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4346; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4347; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4348; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4349; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4350; GFX1064-NEXT:    ; implicit-def: $vgpr0
4351; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4352; GFX1064-NEXT:    s_cbranch_execz BB21_2
4353; GFX1064-NEXT:  ; %bb.1:
4354; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4355; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4356; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4357; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4358; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v7
4359; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4360; GFX1064-NEXT:    buffer_gl0_inv
4361; GFX1064-NEXT:    buffer_gl1_inv
4362; GFX1064-NEXT:  BB21_2:
4363; GFX1064-NEXT:    v_nop
4364; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4365; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4366; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4367; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4368; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4369; GFX1064-NEXT:    s_nop 1
4370; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4371; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4372; GFX1064-NEXT:    s_endpgm
4373;
4374; GFX1032-LABEL: umax_i32_varying:
4375; GFX1032:       ; %bb.0: ; %entry
4376; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4377; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
4378; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4379; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4380; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
4381; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4382; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
4383; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4384; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4385; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4386; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4387; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4388; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4389; GFX1032-NEXT:    s_mov_b32 s2, -1
4390; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4391; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4392; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4393; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4394; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4395; GFX1032-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4396; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4397; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4398; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4399; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4400; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4401; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4402; GFX1032-NEXT:    ; implicit-def: $vgpr0
4403; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4404; GFX1032-NEXT:    s_cbranch_execz BB21_2
4405; GFX1032-NEXT:  ; %bb.1:
4406; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4407; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4408; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4409; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4410; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v7
4411; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4412; GFX1032-NEXT:    buffer_gl0_inv
4413; GFX1032-NEXT:    buffer_gl1_inv
4414; GFX1032-NEXT:  BB21_2:
4415; GFX1032-NEXT:    v_nop
4416; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4417; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4418; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4419; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4420; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4421; GFX1032-NEXT:    s_nop 1
4422; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4423; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4424; GFX1032-NEXT:    s_endpgm
4425entry:
  ; Operand of the atomic: the workitem id in x.
4426  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Unsigned max into @local_var32; %old receives the value returned by the atomic.
4427  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value so the result of the atomic is used.
4428  store i32 %old, i32 addrspace(1)* %out
4429  ret void
4430}
4431
4432define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; Test case: atomicrmw umax (acq_rel) on the i64 addrspace(3) global
; @local_var64 with the constant operand 5, then storing the returned old value
; to %out.
;
; What the expectations below show, for all five check prefixes: the
; ds_max_rtn_u64 sits in block %bb.1, which is entered through s_and_saveexec
; on the "mbcnt result == 0" compare; after the join the result is read with
; v_readfirstlane_b32 and combined per lane via v_cmp_gt_u64 and v_cndmask_b32.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py after codegen
; changes instead of editing them by hand.
4433;
4434;
4435; GFX7LESS-LABEL: umax_i64_constant:
4436; GFX7LESS:       ; %bb.0: ; %entry
4437; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4438; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4439; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4440; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4441; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4442; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4443; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4444; GFX7LESS-NEXT:  ; %bb.1:
4445; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4446; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4447; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4448; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4449; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4450; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4451; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4452; GFX7LESS-NEXT:    buffer_wbinvl1
4453; GFX7LESS-NEXT:  BB22_2:
4454; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4455; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4456; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4457; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4458; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4459; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4460; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4461; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4462; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4463; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4464; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4465; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4466; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4468; GFX7LESS-NEXT:    s_endpgm
4469;
4470; GFX8-LABEL: umax_i64_constant:
4471; GFX8:       ; %bb.0: ; %entry
4472; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4473; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4474; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4475; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4476; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4477; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4478; GFX8-NEXT:    s_cbranch_execz BB22_2
4479; GFX8-NEXT:  ; %bb.1:
4480; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4481; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4482; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4483; GFX8-NEXT:    s_mov_b32 m0, -1
4484; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4485; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4486; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4487; GFX8-NEXT:    buffer_wbinvl1_vol
4488; GFX8-NEXT:  BB22_2:
4489; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4490; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4491; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4492; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4493; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4494; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4495; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4496; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4497; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4498; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4499; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4500; GFX8-NEXT:    s_mov_b32 s2, -1
4501; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4502; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4503; GFX8-NEXT:    s_endpgm
4504;
4505; GFX9-LABEL: umax_i64_constant:
4506; GFX9:       ; %bb.0: ; %entry
4507; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4508; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4509; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4510; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4511; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4512; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4513; GFX9-NEXT:    s_cbranch_execz BB22_2
4514; GFX9-NEXT:  ; %bb.1:
4515; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4516; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4517; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4518; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4519; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4520; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4521; GFX9-NEXT:    buffer_wbinvl1_vol
4522; GFX9-NEXT:  BB22_2:
4523; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4524; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4525; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4526; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4527; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4528; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4529; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4530; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4531; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4532; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4533; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4534; GFX9-NEXT:    s_mov_b32 s2, -1
4535; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4536; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4537; GFX9-NEXT:    s_endpgm
4538;
4539; GFX1064-LABEL: umax_i64_constant:
4540; GFX1064:       ; %bb.0: ; %entry
4541; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4542; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4543; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4544; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4545; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4546; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4547; GFX1064-NEXT:    s_cbranch_execz BB22_2
4548; GFX1064-NEXT:  ; %bb.1:
4549; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4550; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4551; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4552; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4553; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4554; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4555; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4556; GFX1064-NEXT:    buffer_gl0_inv
4557; GFX1064-NEXT:    buffer_gl1_inv
4558; GFX1064-NEXT:  BB22_2:
4559; GFX1064-NEXT:    v_nop
4560; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4561; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
4562; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
4563; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4564; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4565; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4566; GFX1064-NEXT:    s_mov_b32 s2, -1
4567; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4568; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
4569; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc
4570; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4571; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4572; GFX1064-NEXT:    s_endpgm
4573;
4574; GFX1032-LABEL: umax_i64_constant:
4575; GFX1032:       ; %bb.0: ; %entry
4576; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4577; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4578; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4579; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4580; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4581; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4582; GFX1032-NEXT:    s_cbranch_execz BB22_2
4583; GFX1032-NEXT:  ; %bb.1:
4584; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4585; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4586; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4587; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4588; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4589; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4590; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4591; GFX1032-NEXT:    buffer_gl0_inv
4592; GFX1032-NEXT:    buffer_gl1_inv
4593; GFX1032-NEXT:  BB22_2:
4594; GFX1032-NEXT:    v_nop
4595; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4596; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
4597; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
4598; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4599; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4600; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4601; GFX1032-NEXT:    s_mov_b32 s2, -1
4602; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
4603; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
4604; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc_lo
4605; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4606; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4607; GFX1032-NEXT:    s_endpgm
4608entry:
  ; Unsigned max of the constant 5 into @local_var64; %old receives the value
  ; returned by the atomic.
4609  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the returned value so the result of the atomic is used.
4610  store i64 %old, i64 addrspace(1)* %out
4611  ret void
4612}
4613
4614define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; Test case: atomicrmw umin (acq_rel) on the i32 addrspace(3) global
; @local_var32, using the result of llvm.amdgcn.workitem.id.x as the operand
; (hence "varying"), then storing the returned old value to %out.
;
; What the expectations below show, per check prefix:
;  - GFX7LESS expects a plain ds_min_rtn_u32 on v0 with no cross-lane reduction.
;  - GFX8, GFX9, GFX1064 and GFX1032 expect a v_min_u32_dpp reduction (with -1
;    moved into v1/v2 beforehand), a single ds_min_rtn_u32 inside the block
;    guarded by s_and_saveexec, and a final v_readfirstlane_b32 + v_min_u32_e32
;    to form the per-lane result.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py after codegen
; changes instead of editing them by hand.
4615;
4616;
4617; GFX7LESS-LABEL: umin_i32_varying:
4618; GFX7LESS:       ; %bb.0: ; %entry
4619; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4620; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4621; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4622; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4623; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4624; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4625; GFX7LESS-NEXT:    buffer_wbinvl1
4626; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4627; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4628; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4629; GFX7LESS-NEXT:    s_endpgm
4630;
4631; GFX8-LABEL: umin_i32_varying:
4632; GFX8:       ; %bb.0: ; %entry
4633; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4634; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4635; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4636; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4637; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4638; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4639; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4640; GFX8-NEXT:    s_not_b64 exec, exec
4641; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4642; GFX8-NEXT:    s_not_b64 exec, exec
4643; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4644; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4645; GFX8-NEXT:    s_nop 1
4646; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4647; GFX8-NEXT:    s_nop 1
4648; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4649; GFX8-NEXT:    s_nop 1
4650; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4651; GFX8-NEXT:    s_nop 1
4652; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4653; GFX8-NEXT:    s_nop 1
4654; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4655; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4656; GFX8-NEXT:    s_nop 0
4657; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4658; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4659; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4660; GFX8-NEXT:    ; implicit-def: $vgpr0
4661; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4662; GFX8-NEXT:    s_cbranch_execz BB23_2
4663; GFX8-NEXT:  ; %bb.1:
4664; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4665; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4666; GFX8-NEXT:    s_mov_b32 m0, -1
4667; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4668; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4669; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4670; GFX8-NEXT:    buffer_wbinvl1_vol
4671; GFX8-NEXT:  BB23_2:
4672; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4673; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4674; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4675; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4676; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4677; GFX8-NEXT:    s_mov_b32 s2, -1
4678; GFX8-NEXT:    s_nop 0
4679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4680; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4681; GFX8-NEXT:    s_endpgm
4682;
4683; GFX9-LABEL: umin_i32_varying:
4684; GFX9:       ; %bb.0: ; %entry
4685; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4686; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4687; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4688; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4689; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4690; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4691; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4692; GFX9-NEXT:    s_not_b64 exec, exec
4693; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4694; GFX9-NEXT:    s_not_b64 exec, exec
4695; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4696; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4697; GFX9-NEXT:    s_nop 1
4698; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4699; GFX9-NEXT:    s_nop 1
4700; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4701; GFX9-NEXT:    s_nop 1
4702; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4703; GFX9-NEXT:    s_nop 1
4704; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4705; GFX9-NEXT:    s_nop 1
4706; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4707; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4708; GFX9-NEXT:    s_nop 0
4709; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4710; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4711; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4712; GFX9-NEXT:    ; implicit-def: $vgpr0
4713; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4714; GFX9-NEXT:    s_cbranch_execz BB23_2
4715; GFX9-NEXT:  ; %bb.1:
4716; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4717; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4718; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4719; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4720; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4721; GFX9-NEXT:    buffer_wbinvl1_vol
4722; GFX9-NEXT:  BB23_2:
4723; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4724; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4725; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4726; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4727; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4728; GFX9-NEXT:    s_mov_b32 s2, -1
4729; GFX9-NEXT:    s_nop 0
4730; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4731; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4732; GFX9-NEXT:    s_endpgm
4733;
4734; GFX1064-LABEL: umin_i32_varying:
4735; GFX1064:       ; %bb.0: ; %entry
4736; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4737; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
4738; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4739; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
4740; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4741; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4742; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4743; GFX1064-NEXT:    s_not_b64 exec, exec
4744; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
4745; GFX1064-NEXT:    s_not_b64 exec, exec
4746; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4747; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4748; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4749; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4750; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4751; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4752; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4753; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4754; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4755; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4756; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4757; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4758; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4759; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4760; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4761; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4762; GFX1064-NEXT:    s_mov_b32 s2, -1
4763; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4764; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4765; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4766; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4767; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4768; GFX1064-NEXT:    ; implicit-def: $vgpr0
4769; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4770; GFX1064-NEXT:    s_cbranch_execz BB23_2
4771; GFX1064-NEXT:  ; %bb.1:
4772; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4773; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4774; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4775; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4776; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v7
4777; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4778; GFX1064-NEXT:    buffer_gl0_inv
4779; GFX1064-NEXT:    buffer_gl1_inv
4780; GFX1064-NEXT:  BB23_2:
4781; GFX1064-NEXT:    v_nop
4782; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4783; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4784; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4785; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4786; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4787; GFX1064-NEXT:    s_nop 1
4788; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4789; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4790; GFX1064-NEXT:    s_endpgm
4791;
4792; GFX1032-LABEL: umin_i32_varying:
4793; GFX1032:       ; %bb.0: ; %entry
4794; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4795; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
4796; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4797; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4798; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4799; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4800; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4801; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4802; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
4803; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4804; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4805; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4806; GFX1032-NEXT:    s_mov_b32 s2, -1
4807; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4808; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4809; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4810; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4811; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4812; GFX1032-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4813; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4814; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4815; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4816; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4817; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4818; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4819; GFX1032-NEXT:    ; implicit-def: $vgpr0
4820; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4821; GFX1032-NEXT:    s_cbranch_execz BB23_2
4822; GFX1032-NEXT:  ; %bb.1:
4823; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4824; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4825; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4826; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4827; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v7
4828; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4829; GFX1032-NEXT:    buffer_gl0_inv
4830; GFX1032-NEXT:    buffer_gl1_inv
4831; GFX1032-NEXT:  BB23_2:
4832; GFX1032-NEXT:    v_nop
4833; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4834; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4835; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4836; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4837; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4838; GFX1032-NEXT:    s_nop 1
4839; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4840; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4841; GFX1032-NEXT:    s_endpgm
4842entry:
  ; Operand of the atomic: the workitem id in x.
4843  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Unsigned min into @local_var32; %old receives the value returned by the atomic.
4844  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value so the result of the atomic is used.
4845  store i32 %old, i32 addrspace(1)* %out
4846  ret void
4847}
4848
; umin_i64_constant: atomicrmw umin of the constant 5 on the 64-bit
; addrspace(3) global @local_var64 (acq_rel); the value returned by the atomic
; is stored to %out.
;
; The autogenerated checks below expect the same overall shape for every
; prefix:
;   * v_mbcnt_* followed by a compare against 0 guards the atomic, so
;     ds_min_rtn_u64 is issued only where the mbcnt result is 0;
;   * after exec is restored, the returned 64-bit value is copied to SGPRs
;     with v_readfirstlane_b32;
;   * v_cndmask selects between 5 and -1 under vcc, and the result is combined
;     with the SGPR value through v_cmp_lt_u64 + v_cndmask before the
;     buffer_store_dwordx2 to %out.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
4849define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4850;
4851;
4852; GFX7LESS-LABEL: umin_i64_constant:
4853; GFX7LESS:       ; %bb.0: ; %entry
4854; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4855; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4856; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4857; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4858; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4859; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4860; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4861; GFX7LESS-NEXT:  ; %bb.1:
4862; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4863; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4864; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4865; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4866; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4867; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4868; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4869; GFX7LESS-NEXT:    buffer_wbinvl1
4870; GFX7LESS-NEXT:  BB24_2:
4871; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4872; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4873; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4874; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4875; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4876; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4877; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4878; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4879; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4880; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4881; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4882; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4883; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4884; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4885; GFX7LESS-NEXT:    s_endpgm
4886;
4887; GFX8-LABEL: umin_i64_constant:
4888; GFX8:       ; %bb.0: ; %entry
4889; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4890; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4891; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4892; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4893; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4894; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4895; GFX8-NEXT:    s_cbranch_execz BB24_2
4896; GFX8-NEXT:  ; %bb.1:
4897; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4898; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4899; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4900; GFX8-NEXT:    s_mov_b32 m0, -1
4901; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4902; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4903; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4904; GFX8-NEXT:    buffer_wbinvl1_vol
4905; GFX8-NEXT:  BB24_2:
4906; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4907; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4908; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4909; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4910; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4911; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4912; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4913; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4914; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4915; GFX8-NEXT:    s_mov_b32 s2, -1
4916; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4917; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4918; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4919; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4920; GFX8-NEXT:    s_endpgm
4921;
4922; GFX9-LABEL: umin_i64_constant:
4923; GFX9:       ; %bb.0: ; %entry
4924; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4925; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4926; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4927; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4928; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4929; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4930; GFX9-NEXT:    s_cbranch_execz BB24_2
4931; GFX9-NEXT:  ; %bb.1:
4932; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4933; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4934; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4935; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4936; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4937; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4938; GFX9-NEXT:    buffer_wbinvl1_vol
4939; GFX9-NEXT:  BB24_2:
4940; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4941; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4942; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4943; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4944; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4945; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4946; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4947; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4948; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4949; GFX9-NEXT:    s_mov_b32 s2, -1
4950; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4951; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4952; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4953; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4954; GFX9-NEXT:    s_endpgm
4955;
4956; GFX1064-LABEL: umin_i64_constant:
4957; GFX1064:       ; %bb.0: ; %entry
4958; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4959; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4960; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4961; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4962; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4963; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4964; GFX1064-NEXT:    s_cbranch_execz BB24_2
4965; GFX1064-NEXT:  ; %bb.1:
4966; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4967; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4968; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4969; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4970; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4971; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4972; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4973; GFX1064-NEXT:    buffer_gl0_inv
4974; GFX1064-NEXT:    buffer_gl1_inv
4975; GFX1064-NEXT:  BB24_2:
4976; GFX1064-NEXT:    v_nop
4977; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4978; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
4979; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
4980; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4981; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4982; GFX1064-NEXT:    s_mov_b32 s2, -1
4983; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4984; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4985; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
4986; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
4987; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4988; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4989; GFX1064-NEXT:    s_endpgm
4990;
4991; GFX1032-LABEL: umin_i64_constant:
4992; GFX1032:       ; %bb.0: ; %entry
4993; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4994; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4995; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4996; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4997; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4998; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4999; GFX1032-NEXT:    s_cbranch_execz BB24_2
5000; GFX1032-NEXT:  ; %bb.1:
5001; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5002; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5003; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5004; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5005; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5006; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5007; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5008; GFX1032-NEXT:    buffer_gl0_inv
5009; GFX1032-NEXT:    buffer_gl1_inv
5010; GFX1032-NEXT:  BB24_2:
5011; GFX1032-NEXT:    v_nop
5012; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5013; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
5014; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
5015; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
5016; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5017; GFX1032-NEXT:    s_mov_b32 s2, -1
5018; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5019; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
5020; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
5021; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
5022; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5023; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5024; GFX1032-NEXT:    s_endpgm
5025entry:
  ; The operand is the immediate 5 rather than a per-lane value; %old is the
  ; value the atomicrmw returns, and it is the only thing written to %out.
5026  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
5027  store i64 %old, i64 addrspace(1)* %out
5028  ret void
5029}
5030