1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Workitem-id intrinsic; add_i32_varying uses its result as the atomic addend.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; addrspace(3) globals used as atomic targets. @local_var32 is the target of the
; i32 atomicrmw operations below; @local_var64 is presumably used by i64 variants
; later in the file (not visible in the reviewed portion, to be confirmed).
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16; Kernel under test: atomicrmw add of the constant 5 to @local_var32 (acq_rel); the returned old value is stored to %out.
17; Expected code in every configuration below: the lane whose v_mbcnt result is 0 issues one ds_add_rtn_u32 of
; s_bcnt1(v_cmp_ne_u32 1, 0 mask) * 5, and each lane then rebuilds its own result as
; v_readfirstlane(old) + mbcnt * 5 (v_mad_u32_u24).
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
21; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
30; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
31; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s4, 5
32; GFX7LESS-NEXT:    s_mov_b32 m0, -1
33; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
34; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
35; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
36; GFX7LESS-NEXT:    buffer_wbinvl1
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
39; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
40; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
41; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
42; GFX7LESS-NEXT:    s_mov_b32 s2, -1
43; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
59; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
60; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
61; GFX8-NEXT:    s_mov_b32 m0, -1
62; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
63; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
64; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX8-NEXT:    buffer_wbinvl1_vol
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
68; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
69; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
70; GFX8-NEXT:    s_mov_b32 s3, 0xf000
71; GFX8-NEXT:    s_mov_b32 s2, -1
72; GFX8-NEXT:    s_nop 1
73; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
89; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
90; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
91; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
92; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
93; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
94; GFX9-NEXT:    buffer_wbinvl1_vol
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
97; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
98; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
99; GFX9-NEXT:    s_mov_b32 s3, 0xf000
100; GFX9-NEXT:    s_mov_b32 s2, -1
101; GFX9-NEXT:    s_nop 1
102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
109; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
119; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
120; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
121; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    buffer_gl0_inv
125; GFX1064-NEXT:    buffer_gl1_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    v_nop
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_nop 1
134; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
136; GFX1064-NEXT:    s_endpgm
137;
138; GFX1032-LABEL: add_i32_constant:
139; GFX1032:       ; %bb.0: ; %entry
140; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
141; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
142; GFX1032-NEXT:    ; implicit-def: $vcc_hi
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz BB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
151; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
152; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
153; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
155; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
156; GFX1032-NEXT:    buffer_gl0_inv
157; GFX1032-NEXT:    buffer_gl1_inv
158; GFX1032-NEXT:  BB0_2:
159; GFX1032-NEXT:    v_nop
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_nop 1
166; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
168; GFX1032-NEXT:    s_endpgm
169entry:
170  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
171  store i32 %old, i32 addrspace(1)* %out
172  ret void
173}
174
175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
176; Kernel under test: atomicrmw add of the kernel argument %additive to @local_var32 (acq_rel); the old value is stored to %out.
177; Expected code in every configuration below: the lane whose v_mbcnt result is 0 issues one ds_add_rtn_u32 of
; (loaded argument) * s_bcnt1(v_cmp_ne_u32 1, 0 mask), computed with s_mul_i32, and each lane's result is
; v_readfirstlane(old) + (loaded argument) * mbcnt (v_mul_lo_u32 followed by an add).
178; GFX7LESS-LABEL: add_i32_uniform:
179; GFX7LESS:       ; %bb.0: ; %entry
180; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
181; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
182; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
183; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
184; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
185; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
186; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
187; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
188; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
189; GFX7LESS-NEXT:  ; %bb.1:
190; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
191; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
193; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
194; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
195; GFX7LESS-NEXT:    s_mov_b32 m0, -1
196; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
197; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
198; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
199; GFX7LESS-NEXT:    buffer_wbinvl1
200; GFX7LESS-NEXT:  BB1_2:
201; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
202; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
203; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
205; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
206; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
207; GFX7LESS-NEXT:    s_mov_b32 s6, -1
208; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
209; GFX7LESS-NEXT:    s_endpgm
210;
211; GFX8-LABEL: add_i32_uniform:
212; GFX8:       ; %bb.0: ; %entry
213; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
214; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
215; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
216; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
217; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
218; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
219; GFX8-NEXT:    ; implicit-def: $vgpr1
220; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
221; GFX8-NEXT:    s_cbranch_execz BB1_2
222; GFX8-NEXT:  ; %bb.1:
223; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX8-NEXT:    s_mul_i32 s1, s0, s1
226; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
227; GFX8-NEXT:    v_mov_b32_e32 v2, s1
228; GFX8-NEXT:    s_mov_b32 m0, -1
229; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
230; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; GFX8-NEXT:    buffer_wbinvl1_vol
233; GFX8-NEXT:  BB1_2:
234; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
237; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
238; GFX8-NEXT:    s_mov_b32 s7, 0xf000
239; GFX8-NEXT:    s_mov_b32 s6, -1
240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
241; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
242; GFX8-NEXT:    s_endpgm
243;
244; GFX9-LABEL: add_i32_uniform:
245; GFX9:       ; %bb.0: ; %entry
246; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
247; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
248; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
249; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
250; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
252; GFX9-NEXT:    ; implicit-def: $vgpr1
253; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
254; GFX9-NEXT:    s_cbranch_execz BB1_2
255; GFX9-NEXT:  ; %bb.1:
256; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX9-NEXT:    s_mul_i32 s1, s0, s1
259; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
260; GFX9-NEXT:    v_mov_b32_e32 v2, s1
261; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
263; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    buffer_wbinvl1_vol
265; GFX9-NEXT:  BB1_2:
266; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
267; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
269; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
270; GFX9-NEXT:    s_mov_b32 s7, 0xf000
271; GFX9-NEXT:    s_mov_b32 s6, -1
272; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
273; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
274; GFX9-NEXT:    s_endpgm
275;
276; GFX1064-LABEL: add_i32_uniform:
277; GFX1064:       ; %bb.0: ; %entry
278; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
279; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
280; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
281; GFX1064-NEXT:    ; implicit-def: $vgpr1
282; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
283; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
284; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
285; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
286; GFX1064-NEXT:    s_cbranch_execz BB1_2
287; GFX1064-NEXT:  ; %bb.1:
288; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
289; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
290; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
292; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
293; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
294; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
295; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
296; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
297; GFX1064-NEXT:    buffer_gl0_inv
298; GFX1064-NEXT:    buffer_gl1_inv
299; GFX1064-NEXT:  BB1_2:
300; GFX1064-NEXT:    v_nop
301; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
302; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
304; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
305; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
306; GFX1064-NEXT:    s_mov_b32 s6, -1
307; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
308; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
309; GFX1064-NEXT:    s_endpgm
310;
311; GFX1032-LABEL: add_i32_uniform:
312; GFX1032:       ; %bb.0: ; %entry
313; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
314; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
315; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
316; GFX1032-NEXT:    ; implicit-def: $vcc_hi
317; GFX1032-NEXT:    ; implicit-def: $vgpr1
318; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
319; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
320; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
321; GFX1032-NEXT:    s_cbranch_execz BB1_2
322; GFX1032-NEXT:  ; %bb.1:
323; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
324; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
325; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
327; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
328; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
329; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
330; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
331; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
332; GFX1032-NEXT:    buffer_gl0_inv
333; GFX1032-NEXT:    buffer_gl1_inv
334; GFX1032-NEXT:  BB1_2:
335; GFX1032-NEXT:    v_nop
336; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
337; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
339; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
340; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
341; GFX1032-NEXT:    s_mov_b32 s6, -1
342; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
343; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
344; GFX1032-NEXT:    s_endpgm
345entry:
346  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
347  store i32 %old, i32 addrspace(1)* %out
348  ret void
349}
350
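; Review note on the hand-written checks below: the three -NOT checks are placed ahead of the
; add_i32_varying label check rather than after it, and the DPPCOMB, GFX8MORE32 and GFX8MORE
; prefixes are not passed via -check-prefixes by any of the llc invocations at the top of this
; file, so FileCheck treats the lines using them as plain comments.
351; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
352; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
353; GFX7LESS-NOT: s_bcnt1_i32_b64
354; DPPCOMB: v_add_u32_dpp
355; DPPCOMB: v_add_u32_dpp
356; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
357; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
358; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]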
359define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
360; Kernel under test: atomicrmw add of the workitem id (llvm.amdgcn.workitem.id.x) to @local_var32 (acq_rel); the old value is stored to %out.
361; Expected code differs by configuration:
;  - the first configuration (no -mcpu) keeps a plain per-lane ds_add_rtn_u32 with no v_mbcnt or branch;
;  - tonga and gfx900 combine the addends with v_add_u32_dpp (row_shr:1/2/4/8, then row_bcast:15/31), read the
;    total from lane 63 with v_readlane_b32, and have the lane whose v_mbcnt result is 0 issue the ds_add_rtn_u32;
;  - both gfx1010 configurations use v_permlanex16_b32 plus v_readlane/v_writelane in place of
;    row_bcast/wave_shr, reading the total from lane 63 (wavefrontsize64) or lane 31 (wavefrontsize32).
; In the optimized forms each lane's result is v_readfirstlane(old) plus its shifted partial sum (v1).
362; GFX7LESS-LABEL: add_i32_varying:
363; GFX7LESS:       ; %bb.0: ; %entry
364; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
365; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
366; GFX7LESS-NEXT:    s_mov_b32 m0, -1
367; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
368; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
369; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
370; GFX7LESS-NEXT:    buffer_wbinvl1
371; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
372; GFX7LESS-NEXT:    s_mov_b32 s2, -1
373; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
374; GFX7LESS-NEXT:    s_endpgm
375;
376; GFX8-LABEL: add_i32_varying:
377; GFX8:       ; %bb.0: ; %entry
378; GFX8-NEXT:    v_mov_b32_e32 v2, v0
379; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
380; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
381; GFX8-NEXT:    v_mov_b32_e32 v1, 0
382; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
383; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
384; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
385; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
386; GFX8-NEXT:    s_not_b64 exec, exec
387; GFX8-NEXT:    v_mov_b32_e32 v2, 0
388; GFX8-NEXT:    s_not_b64 exec, exec
389; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
390; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
391; GFX8-NEXT:    s_nop 1
392; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
393; GFX8-NEXT:    s_nop 1
394; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
395; GFX8-NEXT:    s_nop 1
396; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
397; GFX8-NEXT:    s_nop 1
398; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
399; GFX8-NEXT:    s_nop 1
400; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
401; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
402; GFX8-NEXT:    s_nop 0
403; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
404; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
405; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
406; GFX8-NEXT:    ; implicit-def: $vgpr0
407; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
408; GFX8-NEXT:    s_cbranch_execz BB2_2
409; GFX8-NEXT:  ; %bb.1:
410; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
411; GFX8-NEXT:    v_mov_b32_e32 v3, s2
412; GFX8-NEXT:    s_mov_b32 m0, -1
413; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
414; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
415; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
416; GFX8-NEXT:    buffer_wbinvl1_vol
417; GFX8-NEXT:  BB2_2:
418; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
419; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
420; GFX8-NEXT:    v_mov_b32_e32 v0, v1
421; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
422; GFX8-NEXT:    s_mov_b32 s3, 0xf000
423; GFX8-NEXT:    s_mov_b32 s2, -1
424; GFX8-NEXT:    s_nop 0
425; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
427; GFX8-NEXT:    s_endpgm
428;
429; GFX9-LABEL: add_i32_varying:
430; GFX9:       ; %bb.0: ; %entry
431; GFX9-NEXT:    v_mov_b32_e32 v2, v0
432; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
433; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
434; GFX9-NEXT:    v_mov_b32_e32 v1, 0
435; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
436; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
437; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
438; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
439; GFX9-NEXT:    s_not_b64 exec, exec
440; GFX9-NEXT:    v_mov_b32_e32 v2, 0
441; GFX9-NEXT:    s_not_b64 exec, exec
442; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
443; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
444; GFX9-NEXT:    s_nop 1
445; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
446; GFX9-NEXT:    s_nop 1
447; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
448; GFX9-NEXT:    s_nop 1
449; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
450; GFX9-NEXT:    s_nop 1
451; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
452; GFX9-NEXT:    s_nop 1
453; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
454; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
455; GFX9-NEXT:    s_nop 0
456; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
457; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
458; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
459; GFX9-NEXT:    ; implicit-def: $vgpr0
460; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
461; GFX9-NEXT:    s_cbranch_execz BB2_2
462; GFX9-NEXT:  ; %bb.1:
463; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
464; GFX9-NEXT:    v_mov_b32_e32 v3, s2
465; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
466; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
467; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
468; GFX9-NEXT:    buffer_wbinvl1_vol
469; GFX9-NEXT:  BB2_2:
470; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
471; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
472; GFX9-NEXT:    v_mov_b32_e32 v0, v1
473; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
474; GFX9-NEXT:    s_mov_b32 s3, 0xf000
475; GFX9-NEXT:    s_mov_b32 s2, -1
476; GFX9-NEXT:    s_nop 0
477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
479; GFX9-NEXT:    s_endpgm
480;
481; GFX1064-LABEL: add_i32_varying:
482; GFX1064:       ; %bb.0: ; %entry
483; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
484; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
485; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
486; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
487; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
488; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
489; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
490; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
491; GFX1064-NEXT:    s_not_b64 exec, exec
492; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
493; GFX1064-NEXT:    s_not_b64 exec, exec
494; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
495; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
496; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
497; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
498; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
499; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
500; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
501; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
502; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
503; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
504; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
505; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
506; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
507; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
508; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
509; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
510; GFX1064-NEXT:    s_mov_b32 s2, -1
511; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
512; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
513; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
514; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
515; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
516; GFX1064-NEXT:    ; implicit-def: $vgpr0
517; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
518; GFX1064-NEXT:    s_cbranch_execz BB2_2
519; GFX1064-NEXT:  ; %bb.1:
520; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
521; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
522; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
523; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
524; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
525; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
526; GFX1064-NEXT:    buffer_gl0_inv
527; GFX1064-NEXT:    buffer_gl1_inv
528; GFX1064-NEXT:  BB2_2:
529; GFX1064-NEXT:    v_nop
530; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
531; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
532; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
533; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
534; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
535; GFX1064-NEXT:    s_nop 1
536; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
538; GFX1064-NEXT:    s_endpgm
539;
540; GFX1032-LABEL: add_i32_varying:
541; GFX1032:       ; %bb.0: ; %entry
542; GFX1032-NEXT:    ; implicit-def: $vcc_hi
543; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
544; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
545; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
546; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
547; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
548; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
549; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
550; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
551; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
552; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
553; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
554; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
555; GFX1032-NEXT:    s_mov_b32 s2, -1
556; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
557; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
558; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
559; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
560; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
561; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
562; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
563; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
564; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
565; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
566; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
567; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
568; GFX1032-NEXT:    ; implicit-def: $vgpr0
569; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
570; GFX1032-NEXT:    s_cbranch_execz BB2_2
571; GFX1032-NEXT:  ; %bb.1:
572; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
573; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
574; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
575; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
576; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
577; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
578; GFX1032-NEXT:    buffer_gl0_inv
579; GFX1032-NEXT:    buffer_gl1_inv
580; GFX1032-NEXT:  BB2_2:
581; GFX1032-NEXT:    v_nop
582; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
583; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
584; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
585; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
586; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
587; GFX1032-NEXT:    s_nop 1
588; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
589; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
590; GFX1032-NEXT:    s_endpgm
591entry:
592  %lane = call i32 @llvm.amdgcn.workitem.id.x()
593  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
594  store i32 %old, i32 addrspace(1)* %out
595  ret void
596}
597
598define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
599;
600;
601; GFX7LESS-LABEL: add_i32_varying_gfx1032:
602; GFX7LESS:       ; %bb.0: ; %entry
603; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
604; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
605; GFX7LESS-NEXT:    s_mov_b32 m0, -1
606; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
607; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
608; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
609; GFX7LESS-NEXT:    buffer_wbinvl1
610; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
611; GFX7LESS-NEXT:    s_mov_b32 s2, -1
612; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
613; GFX7LESS-NEXT:    s_endpgm
614;
615; GFX8-LABEL: add_i32_varying_gfx1032:
616; GFX8:       ; %bb.0: ; %entry
617; GFX8-NEXT:    v_mov_b32_e32 v2, v0
618; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
619; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
620; GFX8-NEXT:    v_mov_b32_e32 v1, 0
621; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
622; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
623; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
624; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
625; GFX8-NEXT:    s_not_b64 exec, exec
626; GFX8-NEXT:    v_mov_b32_e32 v2, 0
627; GFX8-NEXT:    s_not_b64 exec, exec
628; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
629; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
630; GFX8-NEXT:    s_nop 1
631; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
632; GFX8-NEXT:    s_nop 1
633; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
634; GFX8-NEXT:    s_nop 1
635; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
636; GFX8-NEXT:    s_nop 1
637; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
638; GFX8-NEXT:    s_nop 1
639; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
640; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
641; GFX8-NEXT:    s_nop 0
642; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
643; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
644; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
645; GFX8-NEXT:    ; implicit-def: $vgpr0
646; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
647; GFX8-NEXT:    s_cbranch_execz BB3_2
648; GFX8-NEXT:  ; %bb.1:
649; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
650; GFX8-NEXT:    v_mov_b32_e32 v3, s2
651; GFX8-NEXT:    s_mov_b32 m0, -1
652; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
653; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
654; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
655; GFX8-NEXT:    buffer_wbinvl1_vol
656; GFX8-NEXT:  BB3_2:
657; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
658; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
659; GFX8-NEXT:    v_mov_b32_e32 v0, v1
660; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
661; GFX8-NEXT:    s_mov_b32 s3, 0xf000
662; GFX8-NEXT:    s_mov_b32 s2, -1
663; GFX8-NEXT:    s_nop 0
664; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
666; GFX8-NEXT:    s_endpgm
667;
668; GFX9-LABEL: add_i32_varying_gfx1032:
669; GFX9:       ; %bb.0: ; %entry
670; GFX9-NEXT:    v_mov_b32_e32 v2, v0
671; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
672; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
673; GFX9-NEXT:    v_mov_b32_e32 v1, 0
674; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
675; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
676; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
677; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
678; GFX9-NEXT:    s_not_b64 exec, exec
679; GFX9-NEXT:    v_mov_b32_e32 v2, 0
680; GFX9-NEXT:    s_not_b64 exec, exec
681; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
682; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
683; GFX9-NEXT:    s_nop 1
684; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
685; GFX9-NEXT:    s_nop 1
686; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
687; GFX9-NEXT:    s_nop 1
688; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
689; GFX9-NEXT:    s_nop 1
690; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
691; GFX9-NEXT:    s_nop 1
692; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
693; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
694; GFX9-NEXT:    s_nop 0
695; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
696; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
697; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
698; GFX9-NEXT:    ; implicit-def: $vgpr0
699; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
700; GFX9-NEXT:    s_cbranch_execz BB3_2
701; GFX9-NEXT:  ; %bb.1:
702; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
703; GFX9-NEXT:    v_mov_b32_e32 v3, s2
704; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
705; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
706; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
707; GFX9-NEXT:    buffer_wbinvl1_vol
708; GFX9-NEXT:  BB3_2:
709; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
710; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
711; GFX9-NEXT:    v_mov_b32_e32 v0, v1
712; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
713; GFX9-NEXT:    s_mov_b32 s3, 0xf000
714; GFX9-NEXT:    s_mov_b32 s2, -1
715; GFX9-NEXT:    s_nop 0
716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
718; GFX9-NEXT:    s_endpgm
719;
720; GFX1064-LABEL: add_i32_varying_gfx1032:
721; GFX1064:       ; %bb.0: ; %entry
722; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
723; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
724; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
725; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
726; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
727; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
728; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
729; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
730; GFX1064-NEXT:    s_not_b64 exec, exec
731; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
732; GFX1064-NEXT:    s_not_b64 exec, exec
733; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
734; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
735; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
736; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
737; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
738; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
739; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
740; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
741; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
742; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
743; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
744; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
745; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
746; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
747; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
748; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
749; GFX1064-NEXT:    s_mov_b32 s2, -1
750; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
751; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
752; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
753; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
754; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
755; GFX1064-NEXT:    ; implicit-def: $vgpr0
756; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
757; GFX1064-NEXT:    s_cbranch_execz BB3_2
758; GFX1064-NEXT:  ; %bb.1:
759; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
760; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
761; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
762; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
763; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
764; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
765; GFX1064-NEXT:    buffer_gl0_inv
766; GFX1064-NEXT:    buffer_gl1_inv
767; GFX1064-NEXT:  BB3_2:
768; GFX1064-NEXT:    v_nop
769; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
770; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
771; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
772; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
773; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
774; GFX1064-NEXT:    s_nop 1
775; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
777; GFX1064-NEXT:    s_endpgm
778;
779; GFX1032-LABEL: add_i32_varying_gfx1032:
780; GFX1032:       ; %bb.0: ; %entry
781; GFX1032-NEXT:    ; implicit-def: $vcc_hi
782; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
783; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
784; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
785; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
786; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
787; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
788; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
789; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
790; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
791; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
792; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
793; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
794; GFX1032-NEXT:    s_mov_b32 s2, -1
795; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
796; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
797; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
798; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
799; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
800; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
801; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
802; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
803; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
804; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
805; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
806; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
807; GFX1032-NEXT:    ; implicit-def: $vgpr0
808; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
809; GFX1032-NEXT:    s_cbranch_execz BB3_2
810; GFX1032-NEXT:  ; %bb.1:
811; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
812; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
813; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
814; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
815; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
816; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
817; GFX1032-NEXT:    buffer_gl0_inv
818; GFX1032-NEXT:    buffer_gl1_inv
819; GFX1032-NEXT:  BB3_2:
820; GFX1032-NEXT:    v_nop
821; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
822; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
823; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
824; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
825; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
826; GFX1032-NEXT:    s_nop 1
827; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
829; GFX1032-NEXT:    s_endpgm
830entry:
831  %lane = call i32 @llvm.amdgcn.workitem.id.x()
832  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
833  store i32 %old, i32 addrspace(1)* %out
834  ret void
835}
836
; Kernel under test: every work-item atomically adds its own x work-item id
; (a per-lane value) to the i32 in @local_var32 with acq_rel ordering, then
; stores the value returned by the atomic to %out.
;
; Shape of the expected output below, per RUN configuration:
;   - GFX7LESS run: one ds_add_rtn_u32 is issued with v0 as the data operand;
;     no cross-lane reduction sequence appears.
;   - GFX8 and GFX9 runs: a chain of DPP adds (row_shr:1/2/4/8, then
;     row_bcast:15 and row_bcast:31) combines the lane values, lane 63 is read
;     with v_readlane_b32, and only lanes whose v_mbcnt result is 0 execute
;     ds_add_rtn_u32. Afterwards v_readfirstlane_b32 broadcasts the returned
;     value and each lane adds the shifted scan value held in v1.
;   - GFX1064 and GFX1032 runs: same overall shape, but built from
;     v_permlanex16_b32 plus v_readlane_b32 / v_writelane_b32 instead of
;     row_bcast / wave_shr; the GFX1032 run operates on exec_lo with 32-bit
;     mask instructions.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
;
; NOTE(review): the IR body is identical to @add_i32_varying_gfx1032 above;
; nothing inside this function selects a target, so the name suffix looks like
; a leftover from per-target variants -- confirm before relying on it.
837define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
838;
839;
840; GFX7LESS-LABEL: add_i32_varying_gfx1064:
841; GFX7LESS:       ; %bb.0: ; %entry
842; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
843; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
844; GFX7LESS-NEXT:    s_mov_b32 m0, -1
845; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
846; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
847; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
848; GFX7LESS-NEXT:    buffer_wbinvl1
849; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
850; GFX7LESS-NEXT:    s_mov_b32 s2, -1
851; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
852; GFX7LESS-NEXT:    s_endpgm
853;
854; GFX8-LABEL: add_i32_varying_gfx1064:
855; GFX8:       ; %bb.0: ; %entry
856; GFX8-NEXT:    v_mov_b32_e32 v2, v0
857; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
858; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
859; GFX8-NEXT:    v_mov_b32_e32 v1, 0
860; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
861; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
862; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
863; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
864; GFX8-NEXT:    s_not_b64 exec, exec
865; GFX8-NEXT:    v_mov_b32_e32 v2, 0
866; GFX8-NEXT:    s_not_b64 exec, exec
867; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
868; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
869; GFX8-NEXT:    s_nop 1
870; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
871; GFX8-NEXT:    s_nop 1
872; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
873; GFX8-NEXT:    s_nop 1
874; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
875; GFX8-NEXT:    s_nop 1
876; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
877; GFX8-NEXT:    s_nop 1
878; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
879; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
880; GFX8-NEXT:    s_nop 0
881; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
882; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
883; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
884; GFX8-NEXT:    ; implicit-def: $vgpr0
885; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
886; GFX8-NEXT:    s_cbranch_execz BB4_2
887; GFX8-NEXT:  ; %bb.1:
888; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
889; GFX8-NEXT:    v_mov_b32_e32 v3, s2
890; GFX8-NEXT:    s_mov_b32 m0, -1
891; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
892; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
893; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
894; GFX8-NEXT:    buffer_wbinvl1_vol
895; GFX8-NEXT:  BB4_2:
896; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
897; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
898; GFX8-NEXT:    v_mov_b32_e32 v0, v1
899; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
900; GFX8-NEXT:    s_mov_b32 s3, 0xf000
901; GFX8-NEXT:    s_mov_b32 s2, -1
902; GFX8-NEXT:    s_nop 0
903; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
905; GFX8-NEXT:    s_endpgm
906;
907; GFX9-LABEL: add_i32_varying_gfx1064:
908; GFX9:       ; %bb.0: ; %entry
909; GFX9-NEXT:    v_mov_b32_e32 v2, v0
910; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
911; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
912; GFX9-NEXT:    v_mov_b32_e32 v1, 0
913; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
914; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
915; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
916; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
917; GFX9-NEXT:    s_not_b64 exec, exec
918; GFX9-NEXT:    v_mov_b32_e32 v2, 0
919; GFX9-NEXT:    s_not_b64 exec, exec
920; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
921; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
922; GFX9-NEXT:    s_nop 1
923; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
924; GFX9-NEXT:    s_nop 1
925; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
926; GFX9-NEXT:    s_nop 1
927; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
928; GFX9-NEXT:    s_nop 1
929; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
930; GFX9-NEXT:    s_nop 1
931; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
932; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
933; GFX9-NEXT:    s_nop 0
934; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
935; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
936; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
937; GFX9-NEXT:    ; implicit-def: $vgpr0
938; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
939; GFX9-NEXT:    s_cbranch_execz BB4_2
940; GFX9-NEXT:  ; %bb.1:
941; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
942; GFX9-NEXT:    v_mov_b32_e32 v3, s2
943; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
944; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
945; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
946; GFX9-NEXT:    buffer_wbinvl1_vol
947; GFX9-NEXT:  BB4_2:
948; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
949; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
950; GFX9-NEXT:    v_mov_b32_e32 v0, v1
951; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
952; GFX9-NEXT:    s_mov_b32 s3, 0xf000
953; GFX9-NEXT:    s_mov_b32 s2, -1
954; GFX9-NEXT:    s_nop 0
955; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
957; GFX9-NEXT:    s_endpgm
958;
959; GFX1064-LABEL: add_i32_varying_gfx1064:
960; GFX1064:       ; %bb.0: ; %entry
961; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
962; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
963; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
964; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
965; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
966; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
967; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
968; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
969; GFX1064-NEXT:    s_not_b64 exec, exec
970; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
971; GFX1064-NEXT:    s_not_b64 exec, exec
972; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
973; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
974; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
975; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
976; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
977; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
978; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
979; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
980; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
981; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
982; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
983; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
984; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
985; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
986; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
987; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
988; GFX1064-NEXT:    s_mov_b32 s2, -1
989; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
990; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
991; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
992; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
993; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
994; GFX1064-NEXT:    ; implicit-def: $vgpr0
995; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
996; GFX1064-NEXT:    s_cbranch_execz BB4_2
997; GFX1064-NEXT:  ; %bb.1:
998; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
999; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
1000; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1001; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1002; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
1003; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1004; GFX1064-NEXT:    buffer_gl0_inv
1005; GFX1064-NEXT:    buffer_gl1_inv
1006; GFX1064-NEXT:  BB4_2:
1007; GFX1064-NEXT:    v_nop
1008; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1009; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1010; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
1011; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1012; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1013; GFX1064-NEXT:    s_nop 1
1014; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1016; GFX1064-NEXT:    s_endpgm
1017;
1018; GFX1032-LABEL: add_i32_varying_gfx1064:
1019; GFX1032:       ; %bb.0: ; %entry
1020; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1021; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1022; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1023; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1024; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1025; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1026; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
1027; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1028; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1029; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1030; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1031; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1032; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1033; GFX1032-NEXT:    s_mov_b32 s2, -1
1034; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1035; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1036; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1037; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
1038; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
1039; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1040; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
1041; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
1042; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
1043; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
1044; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1045; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1046; GFX1032-NEXT:    ; implicit-def: $vgpr0
1047; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1048; GFX1032-NEXT:    s_cbranch_execz BB4_2
1049; GFX1032-NEXT:  ; %bb.1:
1050; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1051; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
1052; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1053; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1054; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
1055; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1056; GFX1032-NEXT:    buffer_gl0_inv
1057; GFX1032-NEXT:    buffer_gl1_inv
1058; GFX1032-NEXT:  BB4_2:
1059; GFX1032-NEXT:    v_nop
1060; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1061; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1062; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1063; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1064; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1065; GFX1032-NEXT:    s_nop 1
1066; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1067; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1068; GFX1032-NEXT:    s_endpgm
1069entry:
  ; Per-lane addend: the x work-item id of the executing lane.
1070  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old receives the value @local_var32 held before this lane's add.
1071  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Every lane writes its %old to the same %out address.
1072  store i32 %old, i32 addrspace(1)* %out
1073  ret void
1074}
1075
; Kernel under test: atomically adds the constant 5 to the i64 in
; @local_var64 with acq_rel ordering and stores the value returned by the
; atomic to %out.
;
; The expected output for all five RUN configurations (GFX7LESS included)
; has the same three-step shape:
;   1. v_cmp_ne_u32 1, 0 materialises a lane mask in SGPRs and v_mbcnt_lo
;      (plus v_mbcnt_hi on the 64-bit-mask runs) derives a per-lane count
;      from it.
;   2. Lanes whose count is 0 enter %bb.1, where s_bcnt1 counts the set bits
;      of that mask and the count is multiplied by 5 (v_mul_u32_u24 /
;      v_mul_hi_u32_u24) to form the 64-bit operand of one ds_add_rtn_u64.
;   3. After exec is restored, v_readfirstlane_b32 broadcasts the returned
;      pair and each lane adds 5 times its own count: v_mad_u64_u32 on the
;      GFX8 and newer runs, a mul / add / addc sequence on the GFX7LESS run.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
1076define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1077;
1078;
1079; GFX7LESS-LABEL: add_i64_constant:
1080; GFX7LESS:       ; %bb.0: ; %entry
1081; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1082; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1083; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1084; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1085; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1086; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1087; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1088; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1089; GFX7LESS-NEXT:  ; %bb.1:
1090; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1091; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1092; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1093; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1094; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1095; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1096; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1097; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; GFX7LESS-NEXT:    buffer_wbinvl1
1099; GFX7LESS-NEXT:  BB5_2:
1100; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1101; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1102; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1103; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1104; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1105; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1106; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1107; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1108; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1109; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1110; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1112; GFX7LESS-NEXT:    s_endpgm
1113;
1114; GFX8-LABEL: add_i64_constant:
1115; GFX8:       ; %bb.0: ; %entry
1116; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1117; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1118; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1119; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1120; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1121; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1122; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1123; GFX8-NEXT:    s_cbranch_execz BB5_2
1124; GFX8-NEXT:  ; %bb.1:
1125; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1126; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1127; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1128; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1129; GFX8-NEXT:    s_mov_b32 m0, -1
1130; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1131; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1132; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1133; GFX8-NEXT:    buffer_wbinvl1_vol
1134; GFX8-NEXT:  BB5_2:
1135; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1136; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1137; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1138; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1139; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1140; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1141; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1142; GFX8-NEXT:    s_mov_b32 s2, -1
1143; GFX8-NEXT:    s_nop 2
1144; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1146; GFX8-NEXT:    s_endpgm
1147;
1148; GFX9-LABEL: add_i64_constant:
1149; GFX9:       ; %bb.0: ; %entry
1150; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1151; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1152; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1153; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1154; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1155; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1156; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1157; GFX9-NEXT:    s_cbranch_execz BB5_2
1158; GFX9-NEXT:  ; %bb.1:
1159; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1160; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1161; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1162; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1163; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1164; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1165; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1166; GFX9-NEXT:    buffer_wbinvl1_vol
1167; GFX9-NEXT:  BB5_2:
1168; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1169; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1170; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1171; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1172; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1173; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1174; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1175; GFX9-NEXT:    s_mov_b32 s2, -1
1176; GFX9-NEXT:    s_nop 2
1177; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1179; GFX9-NEXT:    s_endpgm
1180;
1181; GFX1064-LABEL: add_i64_constant:
1182; GFX1064:       ; %bb.0: ; %entry
1183; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1184; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1185; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1186; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1187; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1188; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1189; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1190; GFX1064-NEXT:    s_cbranch_execz BB5_2
1191; GFX1064-NEXT:  ; %bb.1:
1192; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1193; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1194; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1195; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1196; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1197; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1198; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1199; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1200; GFX1064-NEXT:    buffer_gl0_inv
1201; GFX1064-NEXT:    buffer_gl1_inv
1202; GFX1064-NEXT:  BB5_2:
1203; GFX1064-NEXT:    v_nop
1204; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1205; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1206; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1207; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1208; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1209; GFX1064-NEXT:    s_mov_b32 s2, -1
1210; GFX1064-NEXT:    s_nop 2
1211; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1213; GFX1064-NEXT:    s_endpgm
1214;
1215; GFX1032-LABEL: add_i64_constant:
1216; GFX1032:       ; %bb.0: ; %entry
1217; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1218; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
1219; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1220; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1221; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1222; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1223; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1224; GFX1032-NEXT:    s_cbranch_execz BB5_2
1225; GFX1032-NEXT:  ; %bb.1:
1226; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1227; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1228; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1229; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1230; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1231; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1232; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1233; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1234; GFX1032-NEXT:    buffer_gl0_inv
1235; GFX1032-NEXT:    buffer_gl1_inv
1236; GFX1032-NEXT:  BB5_2:
1237; GFX1032-NEXT:    v_nop
1238; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1239; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1240; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1241; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1242; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1243; GFX1032-NEXT:    s_mov_b32 s2, -1
1244; GFX1032-NEXT:    s_nop 2
1245; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1247; GFX1032-NEXT:    s_endpgm
1248entry:
  ; Same constant addend (5) in every lane; %old receives the value
  ; @local_var64 held before the add.
1249  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Every lane writes its %old to the same %out address.
1250  store i64 %old, i64 addrspace(1)* %out
1251  ret void
1252}
1253
1254define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1255;
1256;
1257; GFX7LESS-LABEL: add_i64_uniform:
1258; GFX7LESS:       ; %bb.0: ; %entry
1259; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1260; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1261; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1262; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1263; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1264; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1265; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1266; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1267; GFX7LESS-NEXT:  ; %bb.1:
1268; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1269; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1270; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1272; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1273; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1274; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1275; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1276; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1277; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1278; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1279; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1280; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1281; GFX7LESS-NEXT:    buffer_wbinvl1
1282; GFX7LESS-NEXT:  BB6_2:
1283; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1284; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1285; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1286; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1288; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1289; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1290; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1291; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1292; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1293; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1294; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1295; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1296; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1297; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1298; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1299; GFX7LESS-NEXT:    s_endpgm
1300;
1301; GFX8-LABEL: add_i64_uniform:
1302; GFX8:       ; %bb.0: ; %entry
1303; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1304; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1305; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1306; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1307; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1308; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1309; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1310; GFX8-NEXT:    s_cbranch_execz BB6_2
1311; GFX8-NEXT:  ; %bb.1:
1312; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1313; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1314; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1316; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1317; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1318; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1319; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1320; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1321; GFX8-NEXT:    s_mov_b32 m0, -1
1322; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1323; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1324; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1325; GFX8-NEXT:    buffer_wbinvl1_vol
1326; GFX8-NEXT:  BB6_2:
1327; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1328; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX8-NEXT:    s_mov_b32 s4, s0
1330; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1331; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1332; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1333; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1334; GFX8-NEXT:    s_mov_b32 s5, s1
1335; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1336; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1337; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1338; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1339; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1340; GFX8-NEXT:    s_mov_b32 s6, -1
1341; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1342; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1343; GFX8-NEXT:    s_endpgm
1344;
1345; GFX9-LABEL: add_i64_uniform:
1346; GFX9:       ; %bb.0: ; %entry
1347; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1348; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1349; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1350; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1351; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1352; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1353; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1354; GFX9-NEXT:    s_cbranch_execz BB6_2
1355; GFX9-NEXT:  ; %bb.1:
1356; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1357; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1359; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1360; GFX9-NEXT:    s_add_i32 s8, s8, s7
1361; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1362; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1363; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1364; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1365; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1366; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1367; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1368; GFX9-NEXT:    buffer_wbinvl1_vol
1369; GFX9-NEXT:  BB6_2:
1370; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1371; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1373; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1374; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1375; GFX9-NEXT:    s_mov_b32 s4, s0
1376; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1377; GFX9-NEXT:    s_mov_b32 s5, s1
1378; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1379; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1380; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1381; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1382; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1383; GFX9-NEXT:    s_mov_b32 s6, -1
1384; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1385; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1386; GFX9-NEXT:    s_endpgm
1387;
1388; GFX1064-LABEL: add_i64_uniform:
1389; GFX1064:       ; %bb.0: ; %entry
1390; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1391; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1392; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1393; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1394; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1395; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1396; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1397; GFX1064-NEXT:    s_cbranch_execz BB6_2
1398; GFX1064-NEXT:  ; %bb.1:
1399; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1400; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1401; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1403; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1404; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1405; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1406; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1407; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1408; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1409; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1410; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1411; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1412; GFX1064-NEXT:    buffer_gl0_inv
1413; GFX1064-NEXT:    buffer_gl1_inv
1414; GFX1064-NEXT:  BB6_2:
1415; GFX1064-NEXT:    v_nop
1416; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1417; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1419; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1420; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1421; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
1422; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
1423; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1424; GFX1064-NEXT:    s_mov_b32 s2, -1
1425; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1426; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s4, v0
1427; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc
1428; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1429; GFX1064-NEXT:    s_endpgm
1430;
1431; GFX1032-LABEL: add_i64_uniform:
1432; GFX1032:       ; %bb.0: ; %entry
1433; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1434; GFX1032-NEXT:    v_cmp_ne_u32_e64 s5, 1, 0
1435; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1436; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1437; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1438; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1439; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1440; GFX1032-NEXT:    s_cbranch_execz BB6_2
1441; GFX1032-NEXT:  ; %bb.1:
1442; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1443; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1444; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1446; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1447; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1448; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1449; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1450; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1451; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1452; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1453; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1454; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1455; GFX1032-NEXT:    buffer_gl0_inv
1456; GFX1032-NEXT:    buffer_gl1_inv
1457; GFX1032-NEXT:  BB6_2:
1458; GFX1032-NEXT:    v_nop
1459; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1460; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1461; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1462; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1463; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1464; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
1465; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
1466; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1467; GFX1032-NEXT:    s_mov_b32 s2, -1
1468; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1469; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s4, v0
1470; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
1471; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1472; GFX1032-NEXT:    s_endpgm
1473entry:
1474  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1475  store i64 %old, i64 addrspace(1)* %out
1476  ret void
1477}
1478
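; NOTE(review): none of the RUN lines at the top of this file enables a GCN
; check prefix, so the three -NOT directives below appear to be inert. The
; per-target -NEXT chains of the following function are what is actually
; verified -- confirm before relying on these lines.
1479; GCN-NOT: v_mbcnt_lo_u32_b32
1480; GCN-NOT: v_mbcnt_hi_u32_b32
1481; GCN-NOT: s_bcnt1_i32_b64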
; add_i64_varying: every lane adds its zero-extended workitem id (x) to the
; addrspace(3) global @local_var64 with an acq_rel atomicrmw and stores the
; value returned by the atomic to %out.
; For every check prefix the expected output is a single ds_add_rtn_u64 that
; takes v[0:1] directly as its data operand; none of the expected sequences
; contains a v_mbcnt, s_bcnt1 or s_and_saveexec instruction.
1482define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1483;
1484;
1485; GFX7LESS-LABEL: add_i64_varying:
1486; GFX7LESS:       ; %bb.0: ; %entry
1487; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1488; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1489; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1490; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1491; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1492; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1493; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1494; GFX7LESS-NEXT:    buffer_wbinvl1
1495; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1496; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1497; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1498; GFX7LESS-NEXT:    s_endpgm
1499;
1500; GFX8-LABEL: add_i64_varying:
1501; GFX8:       ; %bb.0: ; %entry
1502; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1503; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1504; GFX8-NEXT:    s_mov_b32 m0, -1
1505; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1506; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1507; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1508; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1509; GFX8-NEXT:    buffer_wbinvl1_vol
1510; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1511; GFX8-NEXT:    s_mov_b32 s2, -1
1512; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1513; GFX8-NEXT:    s_endpgm
1514;
1515; GFX9-LABEL: add_i64_varying:
1516; GFX9:       ; %bb.0: ; %entry
1517; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1518; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1519; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1520; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1521; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1522; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1523; GFX9-NEXT:    buffer_wbinvl1_vol
1524; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1525; GFX9-NEXT:    s_mov_b32 s2, -1
1526; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1527; GFX9-NEXT:    s_endpgm
1528;
1529; GFX1064-LABEL: add_i64_varying:
1530; GFX1064:       ; %bb.0: ; %entry
1531; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1532; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1533; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1534; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1535; GFX1064-NEXT:    s_mov_b32 s2, -1
1536; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1537; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1538; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1539; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1540; GFX1064-NEXT:    buffer_gl0_inv
1541; GFX1064-NEXT:    buffer_gl1_inv
1542; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1543; GFX1064-NEXT:    s_endpgm
1544;
1545; GFX1032-LABEL: add_i64_varying:
1546; GFX1032:       ; %bb.0: ; %entry
1547; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1548; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1549; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1550; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1551; GFX1032-NEXT:    s_mov_b32 s2, -1
1552; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1553; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1554; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1555; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1556; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1557; GFX1032-NEXT:    buffer_gl0_inv
1558; GFX1032-NEXT:    buffer_gl1_inv
1559; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1560; GFX1032-NEXT:    s_endpgm
1561entry:
  ; Per-lane operand: the workitem id in x, zero-extended to i64.
1562  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1563  %zext = zext i32 %lane to i64
  ; %old receives the value returned by the acq_rel atomic add; it is then
  ; written to the addrspace(1) pointer %out.
1564  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1565  store i64 %old, i64 addrspace(1)* %out
1566  ret void
1567}
1568
; sub_i32_constant: atomically subtracts the constant 5 from the addrspace(3)
; global @local_var32 (acq_rel) and stores the value returned by the atomic
; to %out.
; Instruction shape pinned by the checks below, for every prefix:
;   - v_cmp_ne_u32 1, 0 produces a lane mask, v_mbcnt_* over that mask yields
;     v0, and only lanes with v0 == 0 run the guarded block
;     (s_and_saveexec + s_cbranch_execz);
;   - inside the guarded block s_bcnt1 counts the bits of the same mask, the
;     count is multiplied by 5 (v_mul_u32_u24) and one ds_sub_rtn_u32 is
;     issued with that product;
;   - once exec is restored, v_readfirstlane fetches the ds result and every
;     lane subtracts 5 * v0 from it before the buffer_store_dword.
; The wavefrontsize32 run uses the 32-bit forms (s_bcnt1_i32_b32,
; s_and_saveexec_b32, exec_lo) and has no v_mbcnt_hi step.
1569define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1570;
1571;
1572; GFX7LESS-LABEL: sub_i32_constant:
1573; GFX7LESS:       ; %bb.0: ; %entry
1574; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1575; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1576; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1577; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1578; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1579; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1580; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1581; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1582; GFX7LESS-NEXT:  ; %bb.1:
1583; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1584; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1585; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s4, 5
1586; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1587; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1588; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1589; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1590; GFX7LESS-NEXT:    buffer_wbinvl1
1591; GFX7LESS-NEXT:  BB8_2:
1592; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1593; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1594; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1595; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1596; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1597; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1598; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1600; GFX7LESS-NEXT:    s_endpgm
1601;
1602; GFX8-LABEL: sub_i32_constant:
1603; GFX8:       ; %bb.0: ; %entry
1604; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1605; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1606; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1607; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1608; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1609; GFX8-NEXT:    ; implicit-def: $vgpr1
1610; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1611; GFX8-NEXT:    s_cbranch_execz BB8_2
1612; GFX8-NEXT:  ; %bb.1:
1613; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1614; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1615; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1616; GFX8-NEXT:    s_mov_b32 m0, -1
1617; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1618; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1619; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1620; GFX8-NEXT:    buffer_wbinvl1_vol
1621; GFX8-NEXT:  BB8_2:
1622; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1623; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1624; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1625; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1626; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1627; GFX8-NEXT:    s_mov_b32 s2, -1
1628; GFX8-NEXT:    s_nop 0
1629; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1630; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1631; GFX8-NEXT:    s_endpgm
1632;
1633; GFX9-LABEL: sub_i32_constant:
1634; GFX9:       ; %bb.0: ; %entry
1635; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1636; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1637; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1638; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1639; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1640; GFX9-NEXT:    ; implicit-def: $vgpr1
1641; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1642; GFX9-NEXT:    s_cbranch_execz BB8_2
1643; GFX9-NEXT:  ; %bb.1:
1644; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1645; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1646; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1647; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1648; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1649; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1650; GFX9-NEXT:    buffer_wbinvl1_vol
1651; GFX9-NEXT:  BB8_2:
1652; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1653; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1654; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1655; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1656; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1657; GFX9-NEXT:    s_mov_b32 s2, -1
1658; GFX9-NEXT:    s_nop 0
1659; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1660; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1661; GFX9-NEXT:    s_endpgm
1662;
1663; GFX1064-LABEL: sub_i32_constant:
1664; GFX1064:       ; %bb.0: ; %entry
1665; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1666; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1667; GFX1064-NEXT:    ; implicit-def: $vgpr1
1668; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1669; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1670; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1671; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1672; GFX1064-NEXT:    s_cbranch_execz BB8_2
1673; GFX1064-NEXT:  ; %bb.1:
1674; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1675; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1676; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1677; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1678; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1679; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1680; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1681; GFX1064-NEXT:    buffer_gl0_inv
1682; GFX1064-NEXT:    buffer_gl1_inv
1683; GFX1064-NEXT:  BB8_2:
1684; GFX1064-NEXT:    v_nop
1685; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1686; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1687; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1688; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1689; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1690; GFX1064-NEXT:    s_mov_b32 s2, -1
1691; GFX1064-NEXT:    s_nop 0
1692; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1694; GFX1064-NEXT:    s_endpgm
1695;
1696; GFX1032-LABEL: sub_i32_constant:
1697; GFX1032:       ; %bb.0: ; %entry
1698; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1699; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
1700; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1701; GFX1032-NEXT:    ; implicit-def: $vgpr1
1702; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1703; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1704; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1705; GFX1032-NEXT:    s_cbranch_execz BB8_2
1706; GFX1032-NEXT:  ; %bb.1:
1707; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1708; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1709; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1710; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1711; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1712; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1713; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1714; GFX1032-NEXT:    buffer_gl0_inv
1715; GFX1032-NEXT:    buffer_gl1_inv
1716; GFX1032-NEXT:  BB8_2:
1717; GFX1032-NEXT:    v_nop
1718; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1719; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1720; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1721; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1722; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1723; GFX1032-NEXT:    s_mov_b32 s2, -1
1724; GFX1032-NEXT:    s_nop 0
1725; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1727; GFX1032-NEXT:    s_endpgm
1728entry:
  ; The operand is the immediate 5, identical for every lane; %old is the
  ; value returned by the acq_rel atomic subtract.
1729  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1730  store i32 %old, i32 addrspace(1)* %out
1731  ret void
1732}
1733
; sub_i32_uniform: atomically subtracts the kernel argument %subitive from
; the addrspace(3) global @local_var32 (acq_rel) and stores the value
; returned by the atomic to %out.
; Instruction shape pinned by the checks below, for every prefix:
;   - %subitive is loaded into an SGPR with s_load_dword next to the
;     s_load_dwordx2 of %out;
;   - v_mbcnt_* over the v_cmp_ne_u32 1, 0 mask yields v0, and only lanes
;     with v0 == 0 run the guarded block (s_and_saveexec + s_cbranch_execz);
;   - inside the guarded block the argument is multiplied by the s_bcnt1
;     result (s_mul_i32) and one ds_sub_rtn_u32 is issued with that product;
;   - once exec is restored, every lane computes argument * v0 (v_mul_lo_u32)
;     and subtracts it from the v_readfirstlane of the ds result before the
;     buffer_store_dword.
; The wavefrontsize32 run uses the 32-bit forms (s_bcnt1_i32_b32,
; s_and_saveexec_b32, exec_lo) and has no v_mbcnt_hi step.
1734define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1735;
1736;
1737; GFX7LESS-LABEL: sub_i32_uniform:
1738; GFX7LESS:       ; %bb.0: ; %entry
1739; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1740; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
1741; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1742; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1743; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1744; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1745; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1746; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1747; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1748; GFX7LESS-NEXT:  ; %bb.1:
1749; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1750; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1751; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
1752; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1753; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1754; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1755; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1756; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1757; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1758; GFX7LESS-NEXT:    buffer_wbinvl1
1759; GFX7LESS-NEXT:  BB9_2:
1760; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1761; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1762; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1763; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1764; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1765; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1766; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1767; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1768; GFX7LESS-NEXT:    s_endpgm
1769;
1770; GFX8-LABEL: sub_i32_uniform:
1771; GFX8:       ; %bb.0: ; %entry
1772; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1773; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1774; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1775; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1776; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1777; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1778; GFX8-NEXT:    ; implicit-def: $vgpr1
1779; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1780; GFX8-NEXT:    s_cbranch_execz BB9_2
1781; GFX8-NEXT:  ; %bb.1:
1782; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
1783; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1785; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1786; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1787; GFX8-NEXT:    s_mov_b32 m0, -1
1788; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1789; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1790; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1791; GFX8-NEXT:    buffer_wbinvl1_vol
1792; GFX8-NEXT:  BB9_2:
1793; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1794; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1796; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1797; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1798; GFX8-NEXT:    s_mov_b32 s6, -1
1799; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1800; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1801; GFX8-NEXT:    s_endpgm
1802;
1803; GFX9-LABEL: sub_i32_uniform:
1804; GFX9:       ; %bb.0: ; %entry
1805; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1806; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
1807; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1808; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1809; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1810; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1811; GFX9-NEXT:    ; implicit-def: $vgpr1
1812; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1813; GFX9-NEXT:    s_cbranch_execz BB9_2
1814; GFX9-NEXT:  ; %bb.1:
1815; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
1816; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1817; GFX9-NEXT:    s_mul_i32 s1, s0, s1
1818; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1819; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1820; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1821; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1822; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1823; GFX9-NEXT:    buffer_wbinvl1_vol
1824; GFX9-NEXT:  BB9_2:
1825; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1827; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
1828; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1829; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1830; GFX9-NEXT:    s_mov_b32 s6, -1
1831; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1832; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1833; GFX9-NEXT:    s_endpgm
1834;
1835; GFX1064-LABEL: sub_i32_uniform:
1836; GFX1064:       ; %bb.0: ; %entry
1837; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1838; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1839; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
1840; GFX1064-NEXT:    ; implicit-def: $vgpr1
1841; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1842; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1843; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1844; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1845; GFX1064-NEXT:    s_cbranch_execz BB9_2
1846; GFX1064-NEXT:  ; %bb.1:
1847; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1848; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1849; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1850; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
1851; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
1852; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1853; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1854; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1855; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1856; GFX1064-NEXT:    buffer_gl0_inv
1857; GFX1064-NEXT:    buffer_gl1_inv
1858; GFX1064-NEXT:  BB9_2:
1859; GFX1064-NEXT:    v_nop
1860; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
1861; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
1863; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1864; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1865; GFX1064-NEXT:    s_mov_b32 s6, -1
1866; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1867; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1868; GFX1064-NEXT:    s_endpgm
1869;
1870; GFX1032-LABEL: sub_i32_uniform:
1871; GFX1032:       ; %bb.0: ; %entry
1872; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1873; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
1874; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
1875; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1876; GFX1032-NEXT:    ; implicit-def: $vgpr1
1877; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1878; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1879; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1880; GFX1032-NEXT:    s_cbranch_execz BB9_2
1881; GFX1032-NEXT:  ; %bb.1:
1882; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1883; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1884; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1885; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
1886; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
1887; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1888; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1889; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1890; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1891; GFX1032-NEXT:    buffer_gl0_inv
1892; GFX1032-NEXT:    buffer_gl1_inv
1893; GFX1032-NEXT:  BB9_2:
1894; GFX1032-NEXT:    v_nop
1895; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1896; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
1898; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1899; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1900; GFX1032-NEXT:    s_mov_b32 s6, -1
1901; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1902; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1903; GFX1032-NEXT:    s_endpgm
1904entry:
  ; The operand is the i32 kernel argument %subitive; %old is the value
  ; returned by the acq_rel atomic subtract.
1905  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1906  store i32 %old, i32 addrspace(1)* %out
1907  ret void
1908}
1909
1910; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
1911; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
1912; GFX7LESS-NOT: s_bcnt1_i32_b64
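; NOTE(review): none of the RUN lines at the top of this file enables the
; DPPCOMB, GFX8MORE32 or GFX8MORE check prefixes, so the five directives below
; appear to be inert. The per-target -NEXT chains of the following function
; are what is actually verified -- confirm before relying on these lines.
1913; DPPCOMB: v_add_u32_dpp
1914; DPPCOMB: v_add_u32_dpp
1915; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
1916; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
1917; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]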
1918define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1919;
1920;
1921; GFX7LESS-LABEL: sub_i32_varying:
1922; GFX7LESS:       ; %bb.0: ; %entry
1923; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1924; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1925; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1926; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1927; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1928; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1929; GFX7LESS-NEXT:    buffer_wbinvl1
1930; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1931; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1932; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1933; GFX7LESS-NEXT:    s_endpgm
1934;
1935; GFX8-LABEL: sub_i32_varying:
1936; GFX8:       ; %bb.0: ; %entry
1937; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1938; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1939; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1940; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1941; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1942; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1943; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1944; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1945; GFX8-NEXT:    s_not_b64 exec, exec
1946; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1947; GFX8-NEXT:    s_not_b64 exec, exec
1948; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1949; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1950; GFX8-NEXT:    s_nop 1
1951; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1952; GFX8-NEXT:    s_nop 1
1953; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1954; GFX8-NEXT:    s_nop 1
1955; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1956; GFX8-NEXT:    s_nop 1
1957; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1958; GFX8-NEXT:    s_nop 1
1959; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1960; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
1961; GFX8-NEXT:    s_nop 0
1962; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1963; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1964; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1965; GFX8-NEXT:    ; implicit-def: $vgpr0
1966; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1967; GFX8-NEXT:    s_cbranch_execz BB10_2
1968; GFX8-NEXT:  ; %bb.1:
1969; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1970; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1971; GFX8-NEXT:    s_mov_b32 m0, -1
1972; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1973; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1974; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1975; GFX8-NEXT:    buffer_wbinvl1_vol
1976; GFX8-NEXT:  BB10_2:
1977; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1978; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1979; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1980; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1981; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1982; GFX8-NEXT:    s_mov_b32 s2, -1
1983; GFX8-NEXT:    s_nop 0
1984; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1986; GFX8-NEXT:    s_endpgm
1987;
1988; GFX9-LABEL: sub_i32_varying:
1989; GFX9:       ; %bb.0: ; %entry
1990; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1991; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1992; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1993; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1994; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1995; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1996; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1997; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1998; GFX9-NEXT:    s_not_b64 exec, exec
1999; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2000; GFX9-NEXT:    s_not_b64 exec, exec
2001; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2002; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2003; GFX9-NEXT:    s_nop 1
2004; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2005; GFX9-NEXT:    s_nop 1
2006; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2007; GFX9-NEXT:    s_nop 1
2008; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2009; GFX9-NEXT:    s_nop 1
2010; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2011; GFX9-NEXT:    s_nop 1
2012; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2013; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2014; GFX9-NEXT:    s_nop 0
2015; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2016; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2017; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2018; GFX9-NEXT:    ; implicit-def: $vgpr0
2019; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2020; GFX9-NEXT:    s_cbranch_execz BB10_2
2021; GFX9-NEXT:  ; %bb.1:
2022; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2023; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2024; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2025; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2026; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2027; GFX9-NEXT:    buffer_wbinvl1_vol
2028; GFX9-NEXT:  BB10_2:
2029; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2030; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2031; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2032; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2033; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2034; GFX9-NEXT:    s_mov_b32 s2, -1
2035; GFX9-NEXT:    s_nop 0
2036; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2038; GFX9-NEXT:    s_endpgm
2039;
2040; GFX1064-LABEL: sub_i32_varying:
2041; GFX1064:       ; %bb.0: ; %entry
2042; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2043; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2044; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2045; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2046; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2047; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2048; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2049; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2050; GFX1064-NEXT:    s_not_b64 exec, exec
2051; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2052; GFX1064-NEXT:    s_not_b64 exec, exec
2053; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2054; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2055; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2056; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2057; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2058; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2059; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2060; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2061; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2062; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2063; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2064; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2065; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2066; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2067; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2068; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2069; GFX1064-NEXT:    s_mov_b32 s2, -1
2070; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2071; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2072; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2073; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2074; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2075; GFX1064-NEXT:    ; implicit-def: $vgpr0
2076; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2077; GFX1064-NEXT:    s_cbranch_execz BB10_2
2078; GFX1064-NEXT:  ; %bb.1:
2079; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2080; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2081; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2082; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2083; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2084; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2085; GFX1064-NEXT:    buffer_gl0_inv
2086; GFX1064-NEXT:    buffer_gl1_inv
2087; GFX1064-NEXT:  BB10_2:
2088; GFX1064-NEXT:    v_nop
2089; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2090; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2091; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2092; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2093; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2094; GFX1064-NEXT:    s_nop 1
2095; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2097; GFX1064-NEXT:    s_endpgm
2098;
2099; GFX1032-LABEL: sub_i32_varying:
2100; GFX1032:       ; %bb.0: ; %entry
2101; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2102; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2103; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2104; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2105; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2106; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2107; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
2108; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2109; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2110; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2111; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2112; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2113; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2114; GFX1032-NEXT:    s_mov_b32 s2, -1
2115; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2116; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2117; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2118; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2119; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2120; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2121; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2122; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2123; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2124; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2125; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2126; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2127; GFX1032-NEXT:    ; implicit-def: $vgpr0
2128; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2129; GFX1032-NEXT:    s_cbranch_execz BB10_2
2130; GFX1032-NEXT:  ; %bb.1:
2131; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2132; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2133; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2134; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2135; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2136; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2137; GFX1032-NEXT:    buffer_gl0_inv
2138; GFX1032-NEXT:    buffer_gl1_inv
2139; GFX1032-NEXT:  BB10_2:
2140; GFX1032-NEXT:    v_nop
2141; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2142; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2143; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2144; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2145; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2146; GFX1032-NEXT:    s_nop 1
2147; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2149; GFX1032-NEXT:    s_endpgm
2150entry:
2151  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2152  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2153  store i32 %old, i32 addrspace(1)* %out
2154  ret void
2155}
2156
; sub_i64_constant: every lane executes an acq_rel `atomicrmw sub` of the
; constant 5 on the 64-bit addrspace(3) global @local_var64 and stores the
; returned old value through %out.
;
; What the autogenerated checks below pin down for each llc configuration:
;   * v_mbcnt_* numbers the active lanes, and only the lane numbered 0 enters
;     the block that issues the single ds_sub_rtn_u64;
;   * the value handed to that DS instruction is 5 times the s_bcnt1 count of
;     the lane mask (low/high halves via v_mul_u32_u24 / v_mul_hi_u32_u24);
;   * once exec is restored, v_readfirstlane picks up the returned value and
;     each lane subtracts 5 times its own mbcnt index with a 64-bit
;     subtract / subtract-with-borrow pair before the store.
2157define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2158;
2159;
2160; GFX7LESS-LABEL: sub_i64_constant:
2161; GFX7LESS:       ; %bb.0: ; %entry
2162; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2163; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2164; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2165; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2166; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2167; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2168; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2169; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2170; GFX7LESS-NEXT:  ; %bb.1:
2171; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2172; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2173; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2174; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2175; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2176; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2177; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2178; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2179; GFX7LESS-NEXT:    buffer_wbinvl1
2180; GFX7LESS-NEXT:  BB11_2:
2181; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2182; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2183; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2184; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2185; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2186; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2187; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2188; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2189; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2190; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2191; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2192; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2193; GFX7LESS-NEXT:    s_endpgm
2194;
2195; GFX8-LABEL: sub_i64_constant:
2196; GFX8:       ; %bb.0: ; %entry
2197; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2198; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2199; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2200; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2201; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2202; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2203; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2204; GFX8-NEXT:    s_cbranch_execz BB11_2
2205; GFX8-NEXT:  ; %bb.1:
2206; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2207; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2208; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2209; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2210; GFX8-NEXT:    s_mov_b32 m0, -1
2211; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2212; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2213; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2214; GFX8-NEXT:    buffer_wbinvl1_vol
2215; GFX8-NEXT:  BB11_2:
2216; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2217; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2218; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2219; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2220; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2221; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2222; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2223; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2224; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2225; GFX8-NEXT:    s_mov_b32 s2, -1
2226; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2227; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2228; GFX8-NEXT:    s_endpgm
2229;
2230; GFX9-LABEL: sub_i64_constant:
2231; GFX9:       ; %bb.0: ; %entry
2232; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2233; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2234; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2235; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2236; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2237; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2238; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2239; GFX9-NEXT:    s_cbranch_execz BB11_2
2240; GFX9-NEXT:  ; %bb.1:
2241; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2242; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2243; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2244; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2245; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2246; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2247; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2248; GFX9-NEXT:    buffer_wbinvl1_vol
2249; GFX9-NEXT:  BB11_2:
2250; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2251; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2252; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2253; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2254; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2255; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2256; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2257; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2258; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2259; GFX9-NEXT:    s_mov_b32 s2, -1
2260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2262; GFX9-NEXT:    s_endpgm
2263;
2264; GFX1064-LABEL: sub_i64_constant:
2265; GFX1064:       ; %bb.0: ; %entry
2266; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2267; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2268; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2269; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2270; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2271; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2272; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2273; GFX1064-NEXT:    s_cbranch_execz BB11_2
2274; GFX1064-NEXT:  ; %bb.1:
2275; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2276; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2277; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2278; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2279; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2280; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2281; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2282; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2283; GFX1064-NEXT:    buffer_gl0_inv
2284; GFX1064-NEXT:    buffer_gl1_inv
2285; GFX1064-NEXT:  BB11_2:
2286; GFX1064-NEXT:    v_nop
2287; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2288; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2289; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2290; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2291; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2292; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2293; GFX1064-NEXT:    s_mov_b32 s2, -1
2294; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2295; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2296; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2297; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2298; GFX1064-NEXT:    s_endpgm
2299;
2300; GFX1032-LABEL: sub_i64_constant:
2301; GFX1032:       ; %bb.0: ; %entry
2302; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2303; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
2304; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2305; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2306; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2307; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2308; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2309; GFX1032-NEXT:    s_cbranch_execz BB11_2
2310; GFX1032-NEXT:  ; %bb.1:
2311; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2312; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2313; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2314; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2315; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2316; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2317; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2318; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2319; GFX1032-NEXT:    buffer_gl0_inv
2320; GFX1032-NEXT:    buffer_gl1_inv
2321; GFX1032-NEXT:  BB11_2:
2322; GFX1032-NEXT:    v_nop
2323; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2324; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2325; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2326; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2327; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2328; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2329; GFX1032-NEXT:    s_mov_b32 s2, -1
2330; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2331; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2332; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2333; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2334; GFX1032-NEXT:    s_endpgm
2335entry:
  ; The operand is the literal 5, so every lane subtracts the same value.
2336  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing the returned value keeps the atomic's result live, which is why the
  ; checks above use the returning form (ds_sub_rtn_u64).
2337  store i64 %old, i64 addrspace(1)* %out
2338  ret void
2339}
2340
; sub_i64_uniform: every lane executes an acq_rel `atomicrmw sub` of the i64
; kernel argument %subitive on the 64-bit addrspace(3) global @local_var64 and
; stores the returned old value through %out.
;
; What the autogenerated checks below pin down for each llc configuration:
;   * v_mbcnt_* numbers the active lanes, and only the lane numbered 0 enters
;     the block that issues the single ds_sub_rtn_u64;
;   * the value handed to that DS instruction is a 64-bit product of the
;     loaded argument (s[2:3]) and the s_bcnt1 count of the lane mask, built
;     from 32-bit multiply / multiply-high instructions plus an add;
;   * once exec is restored, v_readfirstlane picks up the returned value and
;     each lane subtracts the argument multiplied by its own mbcnt index with
;     a 64-bit subtract / subtract-with-borrow pair before the store.
2341define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2342;
2343;
2344; GFX7LESS-LABEL: sub_i64_uniform:
2345; GFX7LESS:       ; %bb.0: ; %entry
2346; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2347; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2348; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2349; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2350; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2351; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2352; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2353; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2354; GFX7LESS-NEXT:  ; %bb.1:
2355; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2356; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2357; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2359; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2360; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2361; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2362; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2363; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2364; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2365; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2366; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2367; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2368; GFX7LESS-NEXT:    buffer_wbinvl1
2369; GFX7LESS-NEXT:  BB12_2:
2370; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2371; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2372; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2373; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2375; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2376; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2377; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2378; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2379; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2380; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2381; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2382; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2383; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2384; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2385; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2386; GFX7LESS-NEXT:    s_endpgm
2387;
2388; GFX8-LABEL: sub_i64_uniform:
2389; GFX8:       ; %bb.0: ; %entry
2390; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2391; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2392; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2393; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2394; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2395; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2396; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2397; GFX8-NEXT:    s_cbranch_execz BB12_2
2398; GFX8-NEXT:  ; %bb.1:
2399; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2400; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2403; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2404; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2405; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2406; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2407; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2408; GFX8-NEXT:    s_mov_b32 m0, -1
2409; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2410; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2411; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2412; GFX8-NEXT:    buffer_wbinvl1_vol
2413; GFX8-NEXT:  BB12_2:
2414; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2415; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2416; GFX8-NEXT:    s_mov_b32 s4, s0
2417; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2418; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2419; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2420; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2421; GFX8-NEXT:    s_mov_b32 s5, s1
2422; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2423; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2424; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2425; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2426; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2427; GFX8-NEXT:    s_mov_b32 s6, -1
2428; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2429; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2430; GFX8-NEXT:    s_endpgm
2431;
2432; GFX9-LABEL: sub_i64_uniform:
2433; GFX9:       ; %bb.0: ; %entry
2434; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2435; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2436; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2437; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2438; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2439; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2440; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2441; GFX9-NEXT:    s_cbranch_execz BB12_2
2442; GFX9-NEXT:  ; %bb.1:
2443; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2445; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2446; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2447; GFX9-NEXT:    s_add_i32 s8, s8, s7
2448; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2449; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2450; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2451; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2452; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2453; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2454; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2455; GFX9-NEXT:    buffer_wbinvl1_vol
2456; GFX9-NEXT:  BB12_2:
2457; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2458; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2459; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2460; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2461; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2462; GFX9-NEXT:    s_mov_b32 s4, s0
2463; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2464; GFX9-NEXT:    s_mov_b32 s5, s1
2465; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2466; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2467; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2468; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2469; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2470; GFX9-NEXT:    s_mov_b32 s6, -1
2471; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2472; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2473; GFX9-NEXT:    s_endpgm
2474;
2475; GFX1064-LABEL: sub_i64_uniform:
2476; GFX1064:       ; %bb.0: ; %entry
2477; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2478; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2479; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2480; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2481; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2482; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2483; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2484; GFX1064-NEXT:    s_cbranch_execz BB12_2
2485; GFX1064-NEXT:  ; %bb.1:
2486; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2487; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2488; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2489; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2490; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2491; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2492; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2493; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2494; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2495; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2496; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2497; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2498; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2499; GFX1064-NEXT:    buffer_gl0_inv
2500; GFX1064-NEXT:    buffer_gl1_inv
2501; GFX1064-NEXT:  BB12_2:
2502; GFX1064-NEXT:    v_nop
2503; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2504; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2506; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2507; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2508; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
2509; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
2510; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2511; GFX1064-NEXT:    s_mov_b32 s2, -1
2512; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2513; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s4, v0
2514; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
2515; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2516; GFX1064-NEXT:    s_endpgm
2517;
2518; GFX1032-LABEL: sub_i64_uniform:
2519; GFX1032:       ; %bb.0: ; %entry
2520; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2521; GFX1032-NEXT:    v_cmp_ne_u32_e64 s5, 1, 0
2522; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2523; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2524; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2525; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2526; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2527; GFX1032-NEXT:    s_cbranch_execz BB12_2
2528; GFX1032-NEXT:  ; %bb.1:
2529; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2530; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2531; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2533; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2534; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2535; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2536; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2537; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2538; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2539; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2540; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2541; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2542; GFX1032-NEXT:    buffer_gl0_inv
2543; GFX1032-NEXT:    buffer_gl1_inv
2544; GFX1032-NEXT:  BB12_2:
2545; GFX1032-NEXT:    v_nop
2546; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2547; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2548; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2549; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2550; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2551; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
2552; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
2553; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2554; GFX1032-NEXT:    s_mov_b32 s2, -1
2555; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2556; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s4, v0
2557; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
2558; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2559; GFX1032-NEXT:    s_endpgm
2560entry:
  ; The operand is a kernel argument, so every lane subtracts the same value.
2561  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  ; Storing the returned value keeps the atomic's result live, which is why the
  ; checks above use the returning form (ds_sub_rtn_u64).
2562  store i64 %old, i64 addrspace(1)* %out
2563  ret void
2564}
2565
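; The GCN prefix is not enabled by any FileCheck invocation in this file's
; RUN lines, so the three -NOT directives below are never evaluated. The
; autogenerated -NEXT chains for sub_i64_varying are what actually rule out
; a lane-counting sequence in the output.
2566; GCN-NOT: v_mbcnt_lo_u32_b32
2567; GCN-NOT: v_mbcnt_hi_u32_b32
2568; GCN-NOT: s_bcnt1_i32_b64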
; sub_i64_varying: every lane executes an acq_rel `atomicrmw sub` on the
; 64-bit addrspace(3) global @local_var64 with a per-lane operand (its
; workitem id zero-extended to i64) and stores the returned old value
; through %out.
;
; What the autogenerated checks below pin down for each llc configuration:
;   * the output contains no v_mbcnt_*, s_bcnt1_* or exec-masking sequence;
;   * a single ds_sub_rtn_u64 is issued directly on v[0:1], where v1 is set
;     to 0 and v0 is used as it arrives at kernel entry;
;   * the returned pair v[0:1] is stored as-is, with no per-lane fix-up.
; In other words, the checks record that this 64-bit varying case is emitted
; as a plain per-lane atomic.
2569define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2570;
2571;
2572; GFX7LESS-LABEL: sub_i64_varying:
2573; GFX7LESS:       ; %bb.0: ; %entry
2574; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2575; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2576; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2577; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2578; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2579; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2580; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2581; GFX7LESS-NEXT:    buffer_wbinvl1
2582; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2583; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2584; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2585; GFX7LESS-NEXT:    s_endpgm
2586;
2587; GFX8-LABEL: sub_i64_varying:
2588; GFX8:       ; %bb.0: ; %entry
2589; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2590; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2591; GFX8-NEXT:    s_mov_b32 m0, -1
2592; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2593; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2594; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2595; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2596; GFX8-NEXT:    buffer_wbinvl1_vol
2597; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2598; GFX8-NEXT:    s_mov_b32 s2, -1
2599; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2600; GFX8-NEXT:    s_endpgm
2601;
2602; GFX9-LABEL: sub_i64_varying:
2603; GFX9:       ; %bb.0: ; %entry
2604; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2605; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2606; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2607; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2608; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2609; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2610; GFX9-NEXT:    buffer_wbinvl1_vol
2611; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2612; GFX9-NEXT:    s_mov_b32 s2, -1
2613; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2614; GFX9-NEXT:    s_endpgm
2615;
2616; GFX1064-LABEL: sub_i64_varying:
2617; GFX1064:       ; %bb.0: ; %entry
2618; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2619; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2620; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2621; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2622; GFX1064-NEXT:    s_mov_b32 s2, -1
2623; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2624; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2625; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2626; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2627; GFX1064-NEXT:    buffer_gl0_inv
2628; GFX1064-NEXT:    buffer_gl1_inv
2629; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2630; GFX1064-NEXT:    s_endpgm
2631;
2632; GFX1032-LABEL: sub_i64_varying:
2633; GFX1032:       ; %bb.0: ; %entry
2634; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2635; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2636; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2637; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2638; GFX1032-NEXT:    s_mov_b32 s2, -1
2639; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2640; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2641; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2642; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2643; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2644; GFX1032-NEXT:    buffer_gl0_inv
2645; GFX1032-NEXT:    buffer_gl1_inv
2646; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2647; GFX1032-NEXT:    s_endpgm
2648entry:
  ; Per-lane operand: the workitem id, widened to i64 with a zero high half.
2649  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2650  %zext = zext i32 %lane to i64
2651  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Storing the returned value keeps the atomic's result live, which is why the
  ; checks above use the returning form (ds_sub_rtn_u64).
2652  store i64 %old, i64 addrspace(1)* %out
2653  ret void
2654}
2655
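; The GFX8MORE32 and GFX8MORE prefixes are not enabled by any FileCheck
; invocation in this file's RUN lines, so the three directives below are
; never evaluated; only the autogenerated per-target checks apply.
2656; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
2657; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
2658; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]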
2659define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2660;
2661;
2662; GFX7LESS-LABEL: and_i32_varying:
2663; GFX7LESS:       ; %bb.0: ; %entry
2664; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2665; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2666; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2667; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2668; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2669; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2670; GFX7LESS-NEXT:    buffer_wbinvl1
2671; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2672; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2673; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2674; GFX7LESS-NEXT:    s_endpgm
2675;
2676; GFX8-LABEL: and_i32_varying:
2677; GFX8:       ; %bb.0: ; %entry
2678; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2679; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2680; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
2681; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
2682; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2683; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2684; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2685; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2686; GFX8-NEXT:    s_not_b64 exec, exec
2687; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2688; GFX8-NEXT:    s_not_b64 exec, exec
2689; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2690; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2691; GFX8-NEXT:    s_nop 1
2692; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2693; GFX8-NEXT:    s_nop 1
2694; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2695; GFX8-NEXT:    s_nop 1
2696; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2697; GFX8-NEXT:    s_nop 1
2698; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2699; GFX8-NEXT:    s_nop 1
2700; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2701; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2702; GFX8-NEXT:    s_nop 0
2703; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2704; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2705; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2706; GFX8-NEXT:    ; implicit-def: $vgpr0
2707; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2708; GFX8-NEXT:    s_cbranch_execz BB14_2
2709; GFX8-NEXT:  ; %bb.1:
2710; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2711; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2712; GFX8-NEXT:    s_mov_b32 m0, -1
2713; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2714; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2715; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2716; GFX8-NEXT:    buffer_wbinvl1_vol
2717; GFX8-NEXT:  BB14_2:
2718; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2719; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2720; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2721; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2722; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2723; GFX8-NEXT:    s_mov_b32 s2, -1
2724; GFX8-NEXT:    s_nop 0
2725; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2727; GFX8-NEXT:    s_endpgm
2728;
2729; GFX9-LABEL: and_i32_varying:
2730; GFX9:       ; %bb.0: ; %entry
2731; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2732; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2733; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
2734; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
2735; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2736; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2737; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2738; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2739; GFX9-NEXT:    s_not_b64 exec, exec
2740; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2741; GFX9-NEXT:    s_not_b64 exec, exec
2742; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2743; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2744; GFX9-NEXT:    s_nop 1
2745; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2746; GFX9-NEXT:    s_nop 1
2747; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2748; GFX9-NEXT:    s_nop 1
2749; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2750; GFX9-NEXT:    s_nop 1
2751; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2752; GFX9-NEXT:    s_nop 1
2753; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2754; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2755; GFX9-NEXT:    s_nop 0
2756; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2757; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2758; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2759; GFX9-NEXT:    ; implicit-def: $vgpr0
2760; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2761; GFX9-NEXT:    s_cbranch_execz BB14_2
2762; GFX9-NEXT:  ; %bb.1:
2763; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2764; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2765; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2766; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2767; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2768; GFX9-NEXT:    buffer_wbinvl1_vol
2769; GFX9-NEXT:  BB14_2:
2770; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2771; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2772; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2773; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2774; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2775; GFX9-NEXT:    s_mov_b32 s2, -1
2776; GFX9-NEXT:    s_nop 0
2777; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2778; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2779; GFX9-NEXT:    s_endpgm
2780;
2781; GFX1064-LABEL: and_i32_varying:
2782; GFX1064:       ; %bb.0: ; %entry
2783; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2784; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2785; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2786; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
2787; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
2788; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2789; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2790; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2791; GFX1064-NEXT:    s_not_b64 exec, exec
2792; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
2793; GFX1064-NEXT:    s_not_b64 exec, exec
2794; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2795; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2796; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2797; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2798; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2799; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2800; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2801; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2802; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2803; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2804; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2805; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2806; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2807; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2808; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2809; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2810; GFX1064-NEXT:    s_mov_b32 s2, -1
2811; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2812; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2813; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2814; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2815; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
2816; GFX1064-NEXT:    ; implicit-def: $vgpr0
2817; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2818; GFX1064-NEXT:    s_cbranch_execz BB14_2
2819; GFX1064-NEXT:  ; %bb.1:
2820; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2821; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2822; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2823; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2824; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v7
2825; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2826; GFX1064-NEXT:    buffer_gl0_inv
2827; GFX1064-NEXT:    buffer_gl1_inv
2828; GFX1064-NEXT:  BB14_2:
2829; GFX1064-NEXT:    v_nop
2830; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2831; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2832; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2833; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2834; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2835; GFX1064-NEXT:    s_nop 1
2836; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2838; GFX1064-NEXT:    s_endpgm
2839;
2840; GFX1032-LABEL: and_i32_varying:
2841; GFX1032:       ; %bb.0: ; %entry
2842; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2843; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
2844; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2845; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2846; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
2847; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2848; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2849; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2850; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2851; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
2852; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2853; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2854; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2855; GFX1032-NEXT:    s_mov_b32 s2, -1
2856; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2857; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2858; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2859; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2860; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2861; GFX1032-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2862; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2863; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2864; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2865; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2866; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2867; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
2868; GFX1032-NEXT:    ; implicit-def: $vgpr0
2869; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2870; GFX1032-NEXT:    s_cbranch_execz BB14_2
2871; GFX1032-NEXT:  ; %bb.1:
2872; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2873; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2874; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2875; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2876; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v7
2877; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2878; GFX1032-NEXT:    buffer_gl0_inv
2879; GFX1032-NEXT:    buffer_gl1_inv
2880; GFX1032-NEXT:  BB14_2:
2881; GFX1032-NEXT:    v_nop
2882; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2883; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2884; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2885; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2886; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2887; GFX1032-NEXT:    s_nop 1
2888; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2889; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2890; GFX1032-NEXT:    s_endpgm
2891entry:
2892  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2893  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2894  store i32 %old, i32 addrspace(1)* %out
2895  ret void
2896}
2897
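; NOTE: the GFX8MORE and GFX8MORE32 prefixes used by the next three lines are not
; listed in -check-prefixes of the RUN lines at the top of this file, so FileCheck never evaluates them.
2898; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
2899; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
2900; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]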
2901define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2902; Varying-operand OR: every lane passes its own workitem id to one atomicrmw or
2903; on @local_var32 and stores the returned old value to %out.
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; What the expected output shows per run configuration:
;  - the GFX7LESS checks expect a plain ds_or_rtn_b32 on v0, with no DPP sequence;
;  - the tonga and gfx900 checks expect a v_or_b32_dpp row_shr/row_bcast sequence, one
;    ds_or_rtn_b32 inside the block guarded by the "mbcnt result == 0" compare, and a
;    v_readfirstlane_b32 + v_or_b32 recombination afterwards;
;  - the gfx1010 checks (wave64 and wave32) expect v_permlanex16_b32 plus
;    v_readlane_b32/v_writelane_b32 where the older targets use row_bcast/wave_shr.
2904; GFX7LESS-LABEL: or_i32_varying:
2905; GFX7LESS:       ; %bb.0: ; %entry
2906; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2907; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2908; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2909; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2910; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2911; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2912; GFX7LESS-NEXT:    buffer_wbinvl1
2913; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2914; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2915; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2916; GFX7LESS-NEXT:    s_endpgm
2917;
2918; GFX8-LABEL: or_i32_varying:
2919; GFX8:       ; %bb.0: ; %entry
2920; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2921; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2922; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2923; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2924; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2925; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2926; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2927; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2928; GFX8-NEXT:    s_not_b64 exec, exec
2929; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2930; GFX8-NEXT:    s_not_b64 exec, exec
2931; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2932; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2933; GFX8-NEXT:    s_nop 1
2934; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2935; GFX8-NEXT:    s_nop 1
2936; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2937; GFX8-NEXT:    s_nop 1
2938; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2939; GFX8-NEXT:    s_nop 1
2940; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2941; GFX8-NEXT:    s_nop 1
2942; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2943; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2944; GFX8-NEXT:    s_nop 0
2945; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2946; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2947; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2948; GFX8-NEXT:    ; implicit-def: $vgpr0
2949; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2950; GFX8-NEXT:    s_cbranch_execz BB15_2
2951; GFX8-NEXT:  ; %bb.1:
2952; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2953; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2954; GFX8-NEXT:    s_mov_b32 m0, -1
2955; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2956; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2957; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2958; GFX8-NEXT:    buffer_wbinvl1_vol
2959; GFX8-NEXT:  BB15_2:
2960; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2961; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2962; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2963; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2964; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2965; GFX8-NEXT:    s_mov_b32 s2, -1
2966; GFX8-NEXT:    s_nop 0
2967; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2968; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2969; GFX8-NEXT:    s_endpgm
2970;
2971; GFX9-LABEL: or_i32_varying:
2972; GFX9:       ; %bb.0: ; %entry
2973; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2974; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2975; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2976; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2977; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2978; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2979; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2980; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2981; GFX9-NEXT:    s_not_b64 exec, exec
2982; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2983; GFX9-NEXT:    s_not_b64 exec, exec
2984; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2985; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2986; GFX9-NEXT:    s_nop 1
2987; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2988; GFX9-NEXT:    s_nop 1
2989; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2990; GFX9-NEXT:    s_nop 1
2991; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2992; GFX9-NEXT:    s_nop 1
2993; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2994; GFX9-NEXT:    s_nop 1
2995; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2996; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2997; GFX9-NEXT:    s_nop 0
2998; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2999; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3000; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3001; GFX9-NEXT:    ; implicit-def: $vgpr0
3002; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3003; GFX9-NEXT:    s_cbranch_execz BB15_2
3004; GFX9-NEXT:  ; %bb.1:
3005; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3006; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3007; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3008; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3009; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3010; GFX9-NEXT:    buffer_wbinvl1_vol
3011; GFX9-NEXT:  BB15_2:
3012; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3013; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3014; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3015; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3016; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3017; GFX9-NEXT:    s_mov_b32 s2, -1
3018; GFX9-NEXT:    s_nop 0
3019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3020; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3021; GFX9-NEXT:    s_endpgm
3022;
3023; GFX1064-LABEL: or_i32_varying:
3024; GFX1064:       ; %bb.0: ; %entry
3025; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3026; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3027; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3028; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3029; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3030; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3031; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3032; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3033; GFX1064-NEXT:    s_not_b64 exec, exec
3034; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3035; GFX1064-NEXT:    s_not_b64 exec, exec
3036; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3037; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3038; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3039; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3040; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3041; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3042; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3043; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3044; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3045; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3046; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3047; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3048; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3049; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3050; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3051; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3052; GFX1064-NEXT:    s_mov_b32 s2, -1
3053; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3054; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3055; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3056; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3057; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3058; GFX1064-NEXT:    ; implicit-def: $vgpr0
3059; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3060; GFX1064-NEXT:    s_cbranch_execz BB15_2
3061; GFX1064-NEXT:  ; %bb.1:
3062; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3063; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3064; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3065; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3066; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v7
3067; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3068; GFX1064-NEXT:    buffer_gl0_inv
3069; GFX1064-NEXT:    buffer_gl1_inv
3070; GFX1064-NEXT:  BB15_2:
3071; GFX1064-NEXT:    v_nop
3072; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3073; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3074; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3075; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3076; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3077; GFX1064-NEXT:    s_nop 1
3078; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3079; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3080; GFX1064-NEXT:    s_endpgm
3081;
3082; GFX1032-LABEL: or_i32_varying:
3083; GFX1032:       ; %bb.0: ; %entry
3084; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3085; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3086; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3087; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3088; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3089; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3090; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3091; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3092; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3093; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3094; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3095; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3096; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3097; GFX1032-NEXT:    s_mov_b32 s2, -1
3098; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3099; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3100; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3101; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3102; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3103; GFX1032-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3104; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3105; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3106; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3107; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3108; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3109; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3110; GFX1032-NEXT:    ; implicit-def: $vgpr0
3111; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3112; GFX1032-NEXT:    s_cbranch_execz BB15_2
3113; GFX1032-NEXT:  ; %bb.1:
3114; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3115; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3116; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3117; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3118; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v7
3119; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3120; GFX1032-NEXT:    buffer_gl0_inv
3121; GFX1032-NEXT:    buffer_gl1_inv
3122; GFX1032-NEXT:  BB15_2:
3123; GFX1032-NEXT:    v_nop
3124; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3125; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3126; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3127; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3128; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3129; GFX1032-NEXT:    s_nop 1
3130; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3131; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3132; GFX1032-NEXT:    s_endpgm
3133entry:
3134  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; per-lane value, so the atomic operand is not uniform
3135  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; OR into the addrspace(3) global; %old is the value before the update
3136  store i32 %old, i32 addrspace(1)* %out ; write the pre-update value to the addrspace(1) output pointer
3137  ret void
3138}
3139
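; NOTE: the GFX8MORE and GFX8MORE32 prefixes used by the next three lines are not
; listed in -check-prefixes of the RUN lines at the top of this file, so FileCheck never evaluates them.
3140; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3141; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3142; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]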
3143define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3144; Varying-operand XOR: every lane passes its own workitem id to one atomicrmw xor
3145; on @local_var32 and stores the returned old value to %out.
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
; What the expected output shows per run configuration:
;  - the GFX7LESS checks expect a plain ds_xor_rtn_b32 on v0, with no DPP sequence;
;  - the tonga and gfx900 checks expect a v_xor_b32_dpp row_shr/row_bcast sequence, one
;    ds_xor_rtn_b32 inside the block guarded by the "mbcnt result == 0" compare, and a
;    v_readfirstlane_b32 + v_xor_b32 recombination afterwards;
;  - the gfx1010 checks (wave64 and wave32) expect v_permlanex16_b32 plus
;    v_readlane_b32/v_writelane_b32 where the older targets use row_bcast/wave_shr.
3146; GFX7LESS-LABEL: xor_i32_varying:
3147; GFX7LESS:       ; %bb.0: ; %entry
3148; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3149; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3150; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3151; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3152; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3153; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3154; GFX7LESS-NEXT:    buffer_wbinvl1
3155; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3156; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3157; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3158; GFX7LESS-NEXT:    s_endpgm
3159;
3160; GFX8-LABEL: xor_i32_varying:
3161; GFX8:       ; %bb.0: ; %entry
3162; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3163; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3164; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3165; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3166; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3167; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3168; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3169; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3170; GFX8-NEXT:    s_not_b64 exec, exec
3171; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3172; GFX8-NEXT:    s_not_b64 exec, exec
3173; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3174; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3175; GFX8-NEXT:    s_nop 1
3176; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3177; GFX8-NEXT:    s_nop 1
3178; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3179; GFX8-NEXT:    s_nop 1
3180; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3181; GFX8-NEXT:    s_nop 1
3182; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3183; GFX8-NEXT:    s_nop 1
3184; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3185; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3186; GFX8-NEXT:    s_nop 0
3187; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3188; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3189; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3190; GFX8-NEXT:    ; implicit-def: $vgpr0
3191; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3192; GFX8-NEXT:    s_cbranch_execz BB16_2
3193; GFX8-NEXT:  ; %bb.1:
3194; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3195; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3196; GFX8-NEXT:    s_mov_b32 m0, -1
3197; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3198; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3199; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3200; GFX8-NEXT:    buffer_wbinvl1_vol
3201; GFX8-NEXT:  BB16_2:
3202; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3203; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3204; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3205; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3206; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3207; GFX8-NEXT:    s_mov_b32 s2, -1
3208; GFX8-NEXT:    s_nop 0
3209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3211; GFX8-NEXT:    s_endpgm
3212;
3213; GFX9-LABEL: xor_i32_varying:
3214; GFX9:       ; %bb.0: ; %entry
3215; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3216; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3217; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3218; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3219; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3220; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3221; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3222; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3223; GFX9-NEXT:    s_not_b64 exec, exec
3224; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3225; GFX9-NEXT:    s_not_b64 exec, exec
3226; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3227; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3228; GFX9-NEXT:    s_nop 1
3229; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3230; GFX9-NEXT:    s_nop 1
3231; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3232; GFX9-NEXT:    s_nop 1
3233; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3234; GFX9-NEXT:    s_nop 1
3235; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3236; GFX9-NEXT:    s_nop 1
3237; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3238; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3239; GFX9-NEXT:    s_nop 0
3240; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3241; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3242; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3243; GFX9-NEXT:    ; implicit-def: $vgpr0
3244; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3245; GFX9-NEXT:    s_cbranch_execz BB16_2
3246; GFX9-NEXT:  ; %bb.1:
3247; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3248; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3249; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3250; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3251; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3252; GFX9-NEXT:    buffer_wbinvl1_vol
3253; GFX9-NEXT:  BB16_2:
3254; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3255; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3256; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3257; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3258; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3259; GFX9-NEXT:    s_mov_b32 s2, -1
3260; GFX9-NEXT:    s_nop 0
3261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3262; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3263; GFX9-NEXT:    s_endpgm
3264;
3265; GFX1064-LABEL: xor_i32_varying:
3266; GFX1064:       ; %bb.0: ; %entry
3267; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3268; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3269; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3270; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3271; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3272; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3273; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3274; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3275; GFX1064-NEXT:    s_not_b64 exec, exec
3276; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3277; GFX1064-NEXT:    s_not_b64 exec, exec
3278; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3279; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3280; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3281; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3282; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3283; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3284; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3285; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3286; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3287; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3288; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3289; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3290; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3291; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3292; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3293; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3294; GFX1064-NEXT:    s_mov_b32 s2, -1
3295; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3296; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3297; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3298; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3299; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3300; GFX1064-NEXT:    ; implicit-def: $vgpr0
3301; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3302; GFX1064-NEXT:    s_cbranch_execz BB16_2
3303; GFX1064-NEXT:  ; %bb.1:
3304; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3305; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3306; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3307; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3308; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3309; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3310; GFX1064-NEXT:    buffer_gl0_inv
3311; GFX1064-NEXT:    buffer_gl1_inv
3312; GFX1064-NEXT:  BB16_2:
3313; GFX1064-NEXT:    v_nop
3314; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3315; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3316; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3317; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3318; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3319; GFX1064-NEXT:    s_nop 1
3320; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3321; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3322; GFX1064-NEXT:    s_endpgm
3323;
3324; GFX1032-LABEL: xor_i32_varying:
3325; GFX1032:       ; %bb.0: ; %entry
3326; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3327; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3328; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3329; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3330; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3331; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3332; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3333; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3334; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3335; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3336; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3337; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3338; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3339; GFX1032-NEXT:    s_mov_b32 s2, -1
3340; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3341; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3342; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3343; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3344; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3345; GFX1032-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3346; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3347; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3348; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3349; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3350; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3351; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3352; GFX1032-NEXT:    ; implicit-def: $vgpr0
3353; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3354; GFX1032-NEXT:    s_cbranch_execz BB16_2
3355; GFX1032-NEXT:  ; %bb.1:
3356; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3357; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3358; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3359; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3360; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3361; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3362; GFX1032-NEXT:    buffer_gl0_inv
3363; GFX1032-NEXT:    buffer_gl1_inv
3364; GFX1032-NEXT:  BB16_2:
3365; GFX1032-NEXT:    v_nop
3366; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3367; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3368; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3369; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3370; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3371; GFX1032-NEXT:    s_nop 1
3372; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3373; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3374; GFX1032-NEXT:    s_endpgm
3375entry:
3376  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; per-lane value, so the atomic operand is not uniform
3377  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; XOR into the addrspace(3) global; %old is the value before the update
3378  store i32 %old, i32 addrspace(1)* %out ; write the pre-update value to the addrspace(1) output pointer
3379  ret void
3380}
3381
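; NOTE: the GFX8MORE and GFX8MORE32 prefixes used by the next three lines are not
; listed in -check-prefixes of the RUN lines at the top of this file, so FileCheck never evaluates them.
3382; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3383; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3384; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]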
3385define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3386;
3387;
3388; GFX7LESS-LABEL: max_i32_varying:
3389; GFX7LESS:       ; %bb.0: ; %entry
3390; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3391; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3392; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3393; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3394; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3395; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3396; GFX7LESS-NEXT:    buffer_wbinvl1
3397; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3398; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3399; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3400; GFX7LESS-NEXT:    s_endpgm
3401;
3402; GFX8-LABEL: max_i32_varying:
3403; GFX8:       ; %bb.0: ; %entry
3404; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3405; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3406; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3407; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3408; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3409; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3410; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3411; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3412; GFX8-NEXT:    s_not_b64 exec, exec
3413; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3414; GFX8-NEXT:    s_not_b64 exec, exec
3415; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3416; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3417; GFX8-NEXT:    s_nop 1
3418; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3419; GFX8-NEXT:    s_nop 1
3420; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3421; GFX8-NEXT:    s_nop 1
3422; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3423; GFX8-NEXT:    s_nop 1
3424; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3425; GFX8-NEXT:    s_nop 1
3426; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3427; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3428; GFX8-NEXT:    s_nop 0
3429; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3430; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3431; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3432; GFX8-NEXT:    ; implicit-def: $vgpr0
3433; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3434; GFX8-NEXT:    s_cbranch_execz BB17_2
3435; GFX8-NEXT:  ; %bb.1:
3436; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3437; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3438; GFX8-NEXT:    s_mov_b32 m0, -1
3439; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3440; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3441; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3442; GFX8-NEXT:    buffer_wbinvl1_vol
3443; GFX8-NEXT:  BB17_2:
3444; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3445; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3446; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3447; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3448; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3449; GFX8-NEXT:    s_mov_b32 s2, -1
3450; GFX8-NEXT:    s_nop 0
3451; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3452; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3453; GFX8-NEXT:    s_endpgm
3454;
3455; GFX9-LABEL: max_i32_varying:
3456; GFX9:       ; %bb.0: ; %entry
3457; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3458; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3459; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3460; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3461; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3462; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3463; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3464; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3465; GFX9-NEXT:    s_not_b64 exec, exec
3466; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3467; GFX9-NEXT:    s_not_b64 exec, exec
3468; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3469; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3470; GFX9-NEXT:    s_nop 1
3471; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3472; GFX9-NEXT:    s_nop 1
3473; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3474; GFX9-NEXT:    s_nop 1
3475; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3476; GFX9-NEXT:    s_nop 1
3477; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3478; GFX9-NEXT:    s_nop 1
3479; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3480; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3481; GFX9-NEXT:    s_nop 0
3482; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3483; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3484; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3485; GFX9-NEXT:    ; implicit-def: $vgpr0
3486; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3487; GFX9-NEXT:    s_cbranch_execz BB17_2
3488; GFX9-NEXT:  ; %bb.1:
3489; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3490; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3491; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3492; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3493; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3494; GFX9-NEXT:    buffer_wbinvl1_vol
3495; GFX9-NEXT:  BB17_2:
3496; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3497; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3498; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3499; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3500; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3501; GFX9-NEXT:    s_mov_b32 s2, -1
3502; GFX9-NEXT:    s_nop 0
3503; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3504; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3505; GFX9-NEXT:    s_endpgm
3506;
3507; GFX1064-LABEL: max_i32_varying:
3508; GFX1064:       ; %bb.0: ; %entry
3509; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3510; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3511; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3512; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
3513; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
3514; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3515; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3516; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3517; GFX1064-NEXT:    s_not_b64 exec, exec
3518; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3519; GFX1064-NEXT:    s_not_b64 exec, exec
3520; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3521; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3522; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3523; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3524; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3525; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3526; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3527; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3528; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3529; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3530; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3531; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3532; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3533; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3534; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3535; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3536; GFX1064-NEXT:    s_mov_b32 s2, -1
3537; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3538; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3539; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3540; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3541; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3542; GFX1064-NEXT:    ; implicit-def: $vgpr0
3543; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3544; GFX1064-NEXT:    s_cbranch_execz BB17_2
3545; GFX1064-NEXT:  ; %bb.1:
3546; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3547; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3548; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3549; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3550; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v7
3551; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3552; GFX1064-NEXT:    buffer_gl0_inv
3553; GFX1064-NEXT:    buffer_gl1_inv
3554; GFX1064-NEXT:  BB17_2:
3555; GFX1064-NEXT:    v_nop
3556; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3557; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3558; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3559; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3560; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3561; GFX1064-NEXT:    s_nop 1
3562; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3563; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3564; GFX1064-NEXT:    s_endpgm
3565;
3566; GFX1032-LABEL: max_i32_varying:
3567; GFX1032:       ; %bb.0: ; %entry
3568; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3569; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3570; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3571; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3572; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
3573; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3574; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3575; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3576; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3577; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3578; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3579; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3580; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3581; GFX1032-NEXT:    s_mov_b32 s2, -1
3582; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3583; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3584; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3585; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3586; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3587; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3588; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3589; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3590; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3591; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3592; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3593; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3594; GFX1032-NEXT:    ; implicit-def: $vgpr0
3595; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3596; GFX1032-NEXT:    s_cbranch_execz BB17_2
3597; GFX1032-NEXT:  ; %bb.1:
3598; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3599; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3600; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3601; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3602; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v7
3603; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3604; GFX1032-NEXT:    buffer_gl0_inv
3605; GFX1032-NEXT:    buffer_gl1_inv
3606; GFX1032-NEXT:  BB17_2:
3607; GFX1032-NEXT:    v_nop
3608; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3609; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3610; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3611; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3612; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3613; GFX1032-NEXT:    s_nop 1
3614; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3615; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3616; GFX1032-NEXT:    s_endpgm
3617entry:
3618  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3619  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3620  store i32 %old, i32 addrspace(1)* %out
3621  ret void
3622}
3623
3624define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3625; Test: `atomicrmw max` of the i64 constant 5 on the addrspace(3) global @local_var64 (acq_rel); the returned old value is stored to %out.
3626; Every prefix expects one ds_max_rtn_i64 executed only in lanes whose v_mbcnt count is 0, then the result rebuilt with v_readfirstlane, v_cmp_gt_i64 and v_cndmask.
3627; GFX7LESS-LABEL: max_i64_constant:
3628; GFX7LESS:       ; %bb.0: ; %entry
3629; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3630; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3631; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3632; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
3633; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3634; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3635; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3636; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3637; GFX7LESS-NEXT:  ; %bb.1:
3638; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3639; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3640; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3641; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3642; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3643; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3644; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3645; GFX7LESS-NEXT:    buffer_wbinvl1
3646; GFX7LESS-NEXT:  BB18_2:
3647; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3648; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3649; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3650; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3651; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3652; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3653; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3654; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3655; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3656; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3657; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3658; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3659; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3660; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3661; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3662; GFX7LESS-NEXT:    s_endpgm
3663;
3664; GFX8-LABEL: max_i64_constant:
3665; GFX8:       ; %bb.0: ; %entry
3666; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3667; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3668; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3669; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3670; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3671; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3672; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3673; GFX8-NEXT:    s_cbranch_execz BB18_2
3674; GFX8-NEXT:  ; %bb.1:
3675; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3676; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3677; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3678; GFX8-NEXT:    s_mov_b32 m0, -1
3679; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3680; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3681; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3682; GFX8-NEXT:    buffer_wbinvl1_vol
3683; GFX8-NEXT:  BB18_2:
3684; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3685; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3686; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3687; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3688; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3689; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3690; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3691; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3692; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3693; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3694; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3695; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3696; GFX8-NEXT:    s_mov_b32 s2, -1
3697; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3698; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3699; GFX8-NEXT:    s_endpgm
3700;
3701; GFX9-LABEL: max_i64_constant:
3702; GFX9:       ; %bb.0: ; %entry
3703; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3704; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3705; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3706; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3707; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3708; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3709; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3710; GFX9-NEXT:    s_cbranch_execz BB18_2
3711; GFX9-NEXT:  ; %bb.1:
3712; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3713; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3714; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3715; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3716; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3717; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3718; GFX9-NEXT:    buffer_wbinvl1_vol
3719; GFX9-NEXT:  BB18_2:
3720; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3721; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3722; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3723; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3724; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3725; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3726; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3727; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3728; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3729; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3730; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3731; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3732; GFX9-NEXT:    s_mov_b32 s2, -1
3733; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3734; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3735; GFX9-NEXT:    s_endpgm
3736;
3737; GFX1064-LABEL: max_i64_constant:
3738; GFX1064:       ; %bb.0: ; %entry
3739; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3740; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3741; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3742; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3743; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3744; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3745; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3746; GFX1064-NEXT:    s_cbranch_execz BB18_2
3747; GFX1064-NEXT:  ; %bb.1:
3748; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3749; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3750; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3751; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3752; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3753; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3754; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3755; GFX1064-NEXT:    buffer_gl0_inv
3756; GFX1064-NEXT:    buffer_gl1_inv
3757; GFX1064-NEXT:  BB18_2:
3758; GFX1064-NEXT:    v_nop
3759; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3760; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
3761; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
3762; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3763; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3764; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3765; GFX1064-NEXT:    s_mov_b32 s2, -1
3766; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3767; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
3768; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
3769; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3771; GFX1064-NEXT:    s_endpgm
3772;
3773; GFX1032-LABEL: max_i64_constant:
3774; GFX1032:       ; %bb.0: ; %entry
3775; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3776; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3777; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3778; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3779; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3780; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3781; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3782; GFX1032-NEXT:    s_cbranch_execz BB18_2
3783; GFX1032-NEXT:  ; %bb.1:
3784; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3785; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3786; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3787; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3788; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3789; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3790; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3791; GFX1032-NEXT:    buffer_gl0_inv
3792; GFX1032-NEXT:    buffer_gl1_inv
3793; GFX1032-NEXT:  BB18_2:
3794; GFX1032-NEXT:    v_nop
3795; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3796; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
3797; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
3798; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3799; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3800; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3801; GFX1032-NEXT:    s_mov_b32 s2, -1
3802; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1]
3803; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
3804; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
3805; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3807; GFX1032-NEXT:    s_endpgm
3808entry:
3809  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3810  store i64 %old, i64 addrspace(1)* %out
3811  ret void
3812}
3813
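; NOTE: Legacy hand-written checks. No RUN line at the top of this file enables
; the GFX8MORE32 or GFX8MORE prefix, so FileCheck does not evaluate the three
; lines below; @min_i32_varying is covered by its autogenerated assertions.
3814; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3815; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3816; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]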
3817define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3818; Test: `atomicrmw min` on the addrspace(3) global @local_var32 (acq_rel) whose operand is llvm.amdgcn.workitem.id.x; the returned old value is stored to %out.
3819; GFX7LESS expects a plain ds_min_rtn_i32; the other prefixes expect a v_min_i32_dpp reduction, one ds_min_rtn_i32 under s_and_saveexec, then v_readfirstlane and v_min_i32.
3820; GFX7LESS-LABEL: min_i32_varying:
3821; GFX7LESS:       ; %bb.0: ; %entry
3822; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3823; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3824; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3825; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3826; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3827; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3828; GFX7LESS-NEXT:    buffer_wbinvl1
3829; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3830; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3831; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3832; GFX7LESS-NEXT:    s_endpgm
3833;
3834; GFX8-LABEL: min_i32_varying:
3835; GFX8:       ; %bb.0: ; %entry
3836; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3837; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3838; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3839; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3840; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3841; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3842; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3843; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3844; GFX8-NEXT:    s_not_b64 exec, exec
3845; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3846; GFX8-NEXT:    s_not_b64 exec, exec
3847; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3848; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3849; GFX8-NEXT:    s_nop 1
3850; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3851; GFX8-NEXT:    s_nop 1
3852; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3853; GFX8-NEXT:    s_nop 1
3854; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3855; GFX8-NEXT:    s_nop 1
3856; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3857; GFX8-NEXT:    s_nop 1
3858; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3859; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3860; GFX8-NEXT:    s_nop 0
3861; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3862; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3863; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3864; GFX8-NEXT:    ; implicit-def: $vgpr0
3865; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3866; GFX8-NEXT:    s_cbranch_execz BB19_2
3867; GFX8-NEXT:  ; %bb.1:
3868; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3869; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3870; GFX8-NEXT:    s_mov_b32 m0, -1
3871; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3872; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3873; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3874; GFX8-NEXT:    buffer_wbinvl1_vol
3875; GFX8-NEXT:  BB19_2:
3876; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3877; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3878; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3879; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3880; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3881; GFX8-NEXT:    s_mov_b32 s2, -1
3882; GFX8-NEXT:    s_nop 0
3883; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3884; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3885; GFX8-NEXT:    s_endpgm
3886;
3887; GFX9-LABEL: min_i32_varying:
3888; GFX9:       ; %bb.0: ; %entry
3889; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3890; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3891; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3892; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3893; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3894; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3895; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3896; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3897; GFX9-NEXT:    s_not_b64 exec, exec
3898; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3899; GFX9-NEXT:    s_not_b64 exec, exec
3900; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3901; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3902; GFX9-NEXT:    s_nop 1
3903; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3904; GFX9-NEXT:    s_nop 1
3905; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3906; GFX9-NEXT:    s_nop 1
3907; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3908; GFX9-NEXT:    s_nop 1
3909; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3910; GFX9-NEXT:    s_nop 1
3911; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3912; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3913; GFX9-NEXT:    s_nop 0
3914; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3915; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3916; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3917; GFX9-NEXT:    ; implicit-def: $vgpr0
3918; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3919; GFX9-NEXT:    s_cbranch_execz BB19_2
3920; GFX9-NEXT:  ; %bb.1:
3921; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3922; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3923; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3924; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3925; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3926; GFX9-NEXT:    buffer_wbinvl1_vol
3927; GFX9-NEXT:  BB19_2:
3928; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3929; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3930; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3931; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3932; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3933; GFX9-NEXT:    s_mov_b32 s2, -1
3934; GFX9-NEXT:    s_nop 0
3935; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3936; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3937; GFX9-NEXT:    s_endpgm
3938;
3939; GFX1064-LABEL: min_i32_varying:
3940; GFX1064:       ; %bb.0: ; %entry
3941; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3942; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3943; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3944; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
3945; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
3946; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3947; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3948; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3949; GFX1064-NEXT:    s_not_b64 exec, exec
3950; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3951; GFX1064-NEXT:    s_not_b64 exec, exec
3952; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3953; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3954; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3955; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3956; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3957; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3958; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3959; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3960; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3961; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3962; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3963; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3964; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3965; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3966; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3967; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3968; GFX1064-NEXT:    s_mov_b32 s2, -1
3969; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3970; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3971; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3972; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3973; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3974; GFX1064-NEXT:    ; implicit-def: $vgpr0
3975; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3976; GFX1064-NEXT:    s_cbranch_execz BB19_2
3977; GFX1064-NEXT:  ; %bb.1:
3978; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3979; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3980; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3981; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3982; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v7
3983; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3984; GFX1064-NEXT:    buffer_gl0_inv
3985; GFX1064-NEXT:    buffer_gl1_inv
3986; GFX1064-NEXT:  BB19_2:
3987; GFX1064-NEXT:    v_nop
3988; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3989; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3990; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3991; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3992; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3993; GFX1064-NEXT:    s_nop 1
3994; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3995; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3996; GFX1064-NEXT:    s_endpgm
3997;
3998; GFX1032-LABEL: min_i32_varying:
3999; GFX1032:       ; %bb.0: ; %entry
4000; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4001; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4002; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4003; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4004; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4005; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4006; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
4007; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4008; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4009; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4010; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4011; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4012; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4013; GFX1032-NEXT:    s_mov_b32 s2, -1
4014; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4015; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4016; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4017; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4018; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4019; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4020; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4021; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4022; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4023; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4024; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4025; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4026; GFX1032-NEXT:    ; implicit-def: $vgpr0
4027; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4028; GFX1032-NEXT:    s_cbranch_execz BB19_2
4029; GFX1032-NEXT:  ; %bb.1:
4030; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4031; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4032; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4033; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4034; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v7
4035; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4036; GFX1032-NEXT:    buffer_gl0_inv
4037; GFX1032-NEXT:    buffer_gl1_inv
4038; GFX1032-NEXT:  BB19_2:
4039; GFX1032-NEXT:    v_nop
4040; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4041; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4042; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4043; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
4044; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4045; GFX1032-NEXT:    s_nop 1
4046; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4047; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4048; GFX1032-NEXT:    s_endpgm
4049entry:
4050  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4051  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4052  store i32 %old, i32 addrspace(1)* %out
4053  ret void
4054}
4055
4056define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
4057;
4058;
4059; GFX7LESS-LABEL: min_i64_constant:
4060; GFX7LESS:       ; %bb.0: ; %entry
4061; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4062; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4063; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4064; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
4065; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4066; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4067; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4068; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
4069; GFX7LESS-NEXT:  ; %bb.1:
4070; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4071; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4072; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4073; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4074; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4075; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4076; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4077; GFX7LESS-NEXT:    buffer_wbinvl1
4078; GFX7LESS-NEXT:  BB20_2:
4079; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4080; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4081; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4082; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
4083; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4084; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4085; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4086; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4087; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4088; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4089; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4090; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4091; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4092; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4093; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4094; GFX7LESS-NEXT:    s_endpgm
4095;
4096; GFX8-LABEL: min_i64_constant:
4097; GFX8:       ; %bb.0: ; %entry
4098; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4099; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4100; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4101; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4102; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4103; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4104; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4105; GFX8-NEXT:    s_cbranch_execz BB20_2
4106; GFX8-NEXT:  ; %bb.1:
4107; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4108; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4109; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4110; GFX8-NEXT:    s_mov_b32 m0, -1
4111; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4112; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4113; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4114; GFX8-NEXT:    buffer_wbinvl1_vol
4115; GFX8-NEXT:  BB20_2:
4116; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4117; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4118; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
4119; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4120; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4121; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4122; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4123; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4124; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4125; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4126; GFX8-NEXT:    s_mov_b32 s2, -1
4127; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4128; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4130; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4131; GFX8-NEXT:    s_endpgm
4132;
4133; GFX9-LABEL: min_i64_constant:
4134; GFX9:       ; %bb.0: ; %entry
4135; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4136; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4137; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4138; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4139; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4140; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4141; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4142; GFX9-NEXT:    s_cbranch_execz BB20_2
4143; GFX9-NEXT:  ; %bb.1:
4144; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4145; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4146; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4147; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4148; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4149; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4150; GFX9-NEXT:    buffer_wbinvl1_vol
4151; GFX9-NEXT:  BB20_2:
4152; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4153; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4154; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4155; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4156; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4157; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4158; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4159; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4160; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4161; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4162; GFX9-NEXT:    s_mov_b32 s2, -1
4163; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4164; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4165; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4166; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4167; GFX9-NEXT:    s_endpgm
4168;
4169; GFX1064-LABEL: min_i64_constant:
4170; GFX1064:       ; %bb.0: ; %entry
4171; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4172; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4173; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4174; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
4175; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4176; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4177; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4178; GFX1064-NEXT:    s_cbranch_execz BB20_2
4179; GFX1064-NEXT:  ; %bb.1:
4180; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4181; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4182; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4183; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4184; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4185; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4186; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4187; GFX1064-NEXT:    buffer_gl0_inv
4188; GFX1064-NEXT:    buffer_gl1_inv
4189; GFX1064-NEXT:  BB20_2:
4190; GFX1064-NEXT:    v_nop
4191; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4192; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
4193; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
4194; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4195; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4196; GFX1064-NEXT:    s_mov_b32 s2, -1
4197; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4198; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4199; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
4200; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
4201; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4202; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4203; GFX1064-NEXT:    s_endpgm
4204;
4205; GFX1032-LABEL: min_i64_constant:
4206; GFX1032:       ; %bb.0: ; %entry
4207; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4208; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4209; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4210; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4211; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4212; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4213; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4214; GFX1032-NEXT:    s_cbranch_execz BB20_2
4215; GFX1032-NEXT:  ; %bb.1:
4216; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4217; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4218; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4219; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4220; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4221; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4222; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4223; GFX1032-NEXT:    buffer_gl0_inv
4224; GFX1032-NEXT:    buffer_gl1_inv
4225; GFX1032-NEXT:  BB20_2:
4226; GFX1032-NEXT:    v_nop
4227; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4228; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
4229; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
4230; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4231; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4232; GFX1032-NEXT:    s_mov_b32 s2, -1
4233; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4234; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
4235; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
4236; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
4237; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4238; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4239; GFX1032-NEXT:    s_endpgm
4240entry:
4241  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4242  store i64 %old, i64 addrspace(1)* %out
4243  ret void
4244}
4245
4246; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
4247; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
4248; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
4249define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
4250; Test kernel: atomicrmw umax (acq_rel) on the addrspace(3) global @local_var32 with workitem.id.x as the operand; the returned old value is stored to %out. GFX7LESS expects a plain ds_max_rtn_u32, the other checked targets expect a v_max_u32_dpp lane reduction before the lanes whose mbcnt result is 0 issue the atomic.
4251; NOTE(review): the GFX8MORE / GFX8MORE32 patterns just above this define use prefixes that are not named by the -check-prefixes options visible at the top of the file, so they look inactive -- TODO confirm.
4252; GFX7LESS-LABEL: umax_i32_varying:
4253; GFX7LESS:       ; %bb.0: ; %entry
4254; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4255; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4256; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4257; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4258; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4259; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4260; GFX7LESS-NEXT:    buffer_wbinvl1
4261; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4262; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4263; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4264; GFX7LESS-NEXT:    s_endpgm
4265;
4266; GFX8-LABEL: umax_i32_varying:
4267; GFX8:       ; %bb.0: ; %entry
4268; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4269; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4270; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4271; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4272; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4273; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4274; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4275; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4276; GFX8-NEXT:    s_not_b64 exec, exec
4277; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4278; GFX8-NEXT:    s_not_b64 exec, exec
4279; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4280; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4281; GFX8-NEXT:    s_nop 1
4282; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4283; GFX8-NEXT:    s_nop 1
4284; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4285; GFX8-NEXT:    s_nop 1
4286; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4287; GFX8-NEXT:    s_nop 1
4288; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4289; GFX8-NEXT:    s_nop 1
4290; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4291; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4292; GFX8-NEXT:    s_nop 0
4293; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4294; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4295; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4296; GFX8-NEXT:    ; implicit-def: $vgpr0
4297; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4298; GFX8-NEXT:    s_cbranch_execz BB21_2
4299; GFX8-NEXT:  ; %bb.1:
4300; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4301; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4302; GFX8-NEXT:    s_mov_b32 m0, -1
4303; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4304; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4305; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4306; GFX8-NEXT:    buffer_wbinvl1_vol
4307; GFX8-NEXT:  BB21_2:
4308; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4309; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4310; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4311; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4312; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4313; GFX8-NEXT:    s_mov_b32 s2, -1
4314; GFX8-NEXT:    s_nop 0
4315; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4316; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4317; GFX8-NEXT:    s_endpgm
4318;
4319; GFX9-LABEL: umax_i32_varying:
4320; GFX9:       ; %bb.0: ; %entry
4321; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4322; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4323; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4324; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4325; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4326; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4327; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4328; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4329; GFX9-NEXT:    s_not_b64 exec, exec
4330; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4331; GFX9-NEXT:    s_not_b64 exec, exec
4332; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4333; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4334; GFX9-NEXT:    s_nop 1
4335; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4336; GFX9-NEXT:    s_nop 1
4337; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4338; GFX9-NEXT:    s_nop 1
4339; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4340; GFX9-NEXT:    s_nop 1
4341; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4342; GFX9-NEXT:    s_nop 1
4343; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4344; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4345; GFX9-NEXT:    s_nop 0
4346; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4347; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4348; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4349; GFX9-NEXT:    ; implicit-def: $vgpr0
4350; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4351; GFX9-NEXT:    s_cbranch_execz BB21_2
4352; GFX9-NEXT:  ; %bb.1:
4353; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4354; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4355; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4356; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4357; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4358; GFX9-NEXT:    buffer_wbinvl1_vol
4359; GFX9-NEXT:  BB21_2:
4360; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4361; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4362; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4363; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4364; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4365; GFX9-NEXT:    s_mov_b32 s2, -1
4366; GFX9-NEXT:    s_nop 0
4367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4368; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4369; GFX9-NEXT:    s_endpgm
4370;
4371; GFX1064-LABEL: umax_i32_varying:
4372; GFX1064:       ; %bb.0: ; %entry
4373; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4374; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4375; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4376; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4377; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4378; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4379; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4380; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
4381; GFX1064-NEXT:    s_not_b64 exec, exec
4382; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4383; GFX1064-NEXT:    s_not_b64 exec, exec
4384; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4385; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4386; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4387; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4388; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4389; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4390; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4391; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4392; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4393; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4394; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4395; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4396; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4397; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4398; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4399; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4400; GFX1064-NEXT:    s_mov_b32 s2, -1
4401; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4402; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4403; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4404; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4405; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4406; GFX1064-NEXT:    ; implicit-def: $vgpr0
4407; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4408; GFX1064-NEXT:    s_cbranch_execz BB21_2
4409; GFX1064-NEXT:  ; %bb.1:
4410; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4411; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4412; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4413; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4414; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v7
4415; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4416; GFX1064-NEXT:    buffer_gl0_inv
4417; GFX1064-NEXT:    buffer_gl1_inv
4418; GFX1064-NEXT:  BB21_2:
4419; GFX1064-NEXT:    v_nop
4420; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4421; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4422; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4423; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4424; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4425; GFX1064-NEXT:    s_nop 1
4426; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4427; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4428; GFX1064-NEXT:    s_endpgm
4429;
4430; GFX1032-LABEL: umax_i32_varying:
4431; GFX1032:       ; %bb.0: ; %entry
4432; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4433; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4434; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4435; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4436; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4437; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4438; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4439; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4440; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4441; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4442; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4443; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4444; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4445; GFX1032-NEXT:    s_mov_b32 s2, -1
4446; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4447; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4448; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4449; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4450; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4451; GFX1032-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4452; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4453; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4454; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4455; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4456; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4457; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4458; GFX1032-NEXT:    ; implicit-def: $vgpr0
4459; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4460; GFX1032-NEXT:    s_cbranch_execz BB21_2
4461; GFX1032-NEXT:  ; %bb.1:
4462; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4463; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4464; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4465; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4466; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v7
4467; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4468; GFX1032-NEXT:    buffer_gl0_inv
4469; GFX1032-NEXT:    buffer_gl1_inv
4470; GFX1032-NEXT:  BB21_2:
4471; GFX1032-NEXT:    v_nop
4472; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4473; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4474; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4475; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4476; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4477; GFX1032-NEXT:    s_nop 1
4478; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4479; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4480; GFX1032-NEXT:    s_endpgm
4481entry:
4482  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4483  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4484  store i32 %old, i32 addrspace(1)* %out
4485  ret void
4486}
4487
4488define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4489; Test kernel: atomicrmw umax (acq_rel) of the constant i64 5 on the addrspace(3) global @local_var64; the returned old value is stored to %out.
4490; In every checked target only the lanes whose mbcnt result is 0 execute ds_max_rtn_u64; the result is then read back with v_readfirstlane_b32 and combined per lane through v_cndmask_b32 and v_cmp_gt_u64 before the store.
4491; GFX7LESS-LABEL: umax_i64_constant:
4492; GFX7LESS:       ; %bb.0: ; %entry
4493; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4494; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4495; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4496; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
4497; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4498; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4499; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4500; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4501; GFX7LESS-NEXT:  ; %bb.1:
4502; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4503; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4504; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4505; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4506; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4507; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4508; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4509; GFX7LESS-NEXT:    buffer_wbinvl1
4510; GFX7LESS-NEXT:  BB22_2:
4511; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4512; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4513; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4514; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4515; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4516; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4517; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4518; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4519; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4520; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4521; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4522; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4523; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4524; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4525; GFX7LESS-NEXT:    s_endpgm
4526;
4527; GFX8-LABEL: umax_i64_constant:
4528; GFX8:       ; %bb.0: ; %entry
4529; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4530; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4531; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4532; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4533; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4534; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4535; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4536; GFX8-NEXT:    s_cbranch_execz BB22_2
4537; GFX8-NEXT:  ; %bb.1:
4538; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4539; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4540; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4541; GFX8-NEXT:    s_mov_b32 m0, -1
4542; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4543; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4544; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4545; GFX8-NEXT:    buffer_wbinvl1_vol
4546; GFX8-NEXT:  BB22_2:
4547; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4548; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4549; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4550; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4551; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4552; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4553; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4554; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4555; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4556; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4557; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4558; GFX8-NEXT:    s_mov_b32 s2, -1
4559; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4560; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4561; GFX8-NEXT:    s_endpgm
4562;
4563; GFX9-LABEL: umax_i64_constant:
4564; GFX9:       ; %bb.0: ; %entry
4565; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4566; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4567; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4568; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4569; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4570; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4571; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4572; GFX9-NEXT:    s_cbranch_execz BB22_2
4573; GFX9-NEXT:  ; %bb.1:
4574; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4575; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4576; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4577; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4578; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4579; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4580; GFX9-NEXT:    buffer_wbinvl1_vol
4581; GFX9-NEXT:  BB22_2:
4582; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4583; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4584; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4585; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4586; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4587; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4588; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4589; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4590; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4591; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4592; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4593; GFX9-NEXT:    s_mov_b32 s2, -1
4594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4595; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4596; GFX9-NEXT:    s_endpgm
4597;
4598; GFX1064-LABEL: umax_i64_constant:
4599; GFX1064:       ; %bb.0: ; %entry
4600; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4601; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4602; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4603; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
4604; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4605; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4606; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4607; GFX1064-NEXT:    s_cbranch_execz BB22_2
4608; GFX1064-NEXT:  ; %bb.1:
4609; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4610; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4611; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4612; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4613; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4614; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4615; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4616; GFX1064-NEXT:    buffer_gl0_inv
4617; GFX1064-NEXT:    buffer_gl1_inv
4618; GFX1064-NEXT:  BB22_2:
4619; GFX1064-NEXT:    v_nop
4620; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4621; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
4622; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
4623; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4624; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4625; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4626; GFX1064-NEXT:    s_mov_b32 s2, -1
4627; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4628; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
4629; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc
4630; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4631; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4632; GFX1064-NEXT:    s_endpgm
4633;
4634; GFX1032-LABEL: umax_i64_constant:
4635; GFX1032:       ; %bb.0: ; %entry
4636; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4637; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4638; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4639; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4640; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4641; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4642; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4643; GFX1032-NEXT:    s_cbranch_execz BB22_2
4644; GFX1032-NEXT:  ; %bb.1:
4645; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4646; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4647; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4648; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4649; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4650; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4651; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4652; GFX1032-NEXT:    buffer_gl0_inv
4653; GFX1032-NEXT:    buffer_gl1_inv
4654; GFX1032-NEXT:  BB22_2:
4655; GFX1032-NEXT:    v_nop
4656; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4657; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
4658; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
4659; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4660; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4661; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4662; GFX1032-NEXT:    s_mov_b32 s2, -1
4663; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
4664; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
4665; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc_lo
4666; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4667; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4668; GFX1032-NEXT:    s_endpgm
4669entry:
4670  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4671  store i64 %old, i64 addrspace(1)* %out
4672  ret void
4673}
4674
4675; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
4676; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
4677; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
4678define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4679;
4680;
4681; GFX7LESS-LABEL: umin_i32_varying:
4682; GFX7LESS:       ; %bb.0: ; %entry
4683; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4684; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4685; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4686; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4687; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4688; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4689; GFX7LESS-NEXT:    buffer_wbinvl1
4690; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4691; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4692; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4693; GFX7LESS-NEXT:    s_endpgm
4694;
4695; GFX8-LABEL: umin_i32_varying:
4696; GFX8:       ; %bb.0: ; %entry
4697; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4698; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4699; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
4700; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
4701; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4702; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4703; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4704; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4705; GFX8-NEXT:    s_not_b64 exec, exec
4706; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4707; GFX8-NEXT:    s_not_b64 exec, exec
4708; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4709; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4710; GFX8-NEXT:    s_nop 1
4711; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4712; GFX8-NEXT:    s_nop 1
4713; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4714; GFX8-NEXT:    s_nop 1
4715; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4716; GFX8-NEXT:    s_nop 1
4717; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4718; GFX8-NEXT:    s_nop 1
4719; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4720; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4721; GFX8-NEXT:    s_nop 0
4722; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4723; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4724; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4725; GFX8-NEXT:    ; implicit-def: $vgpr0
4726; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4727; GFX8-NEXT:    s_cbranch_execz BB23_2
4728; GFX8-NEXT:  ; %bb.1:
4729; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4730; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4731; GFX8-NEXT:    s_mov_b32 m0, -1
4732; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4733; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4734; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4735; GFX8-NEXT:    buffer_wbinvl1_vol
4736; GFX8-NEXT:  BB23_2:
4737; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4738; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4739; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4740; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4741; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4742; GFX8-NEXT:    s_mov_b32 s2, -1
4743; GFX8-NEXT:    s_nop 0
4744; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4745; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4746; GFX8-NEXT:    s_endpgm
4747;
4748; GFX9-LABEL: umin_i32_varying:
4749; GFX9:       ; %bb.0: ; %entry
4750; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4751; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4752; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
4753; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
4754; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4755; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4756; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4757; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4758; GFX9-NEXT:    s_not_b64 exec, exec
4759; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4760; GFX9-NEXT:    s_not_b64 exec, exec
4761; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4762; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4763; GFX9-NEXT:    s_nop 1
4764; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4765; GFX9-NEXT:    s_nop 1
4766; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4767; GFX9-NEXT:    s_nop 1
4768; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4769; GFX9-NEXT:    s_nop 1
4770; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4771; GFX9-NEXT:    s_nop 1
4772; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4773; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4774; GFX9-NEXT:    s_nop 0
4775; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4776; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4777; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4778; GFX9-NEXT:    ; implicit-def: $vgpr0
4779; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4780; GFX9-NEXT:    s_cbranch_execz BB23_2
4781; GFX9-NEXT:  ; %bb.1:
4782; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4783; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4784; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4785; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4786; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4787; GFX9-NEXT:    buffer_wbinvl1_vol
4788; GFX9-NEXT:  BB23_2:
4789; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4790; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4791; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4792; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4793; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4794; GFX9-NEXT:    s_mov_b32 s2, -1
4795; GFX9-NEXT:    s_nop 0
4796; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4797; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4798; GFX9-NEXT:    s_endpgm
4799;
4800; GFX1064-LABEL: umin_i32_varying:
4801; GFX1064:       ; %bb.0: ; %entry
4802; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4803; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4804; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4805; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4806; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
4807; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4808; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4809; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4810; GFX1064-NEXT:    s_not_b64 exec, exec
4811; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
4812; GFX1064-NEXT:    s_not_b64 exec, exec
4813; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4814; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4815; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4816; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4817; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4818; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4819; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4820; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4821; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4822; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4823; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4824; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4825; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4826; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4827; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4828; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4829; GFX1064-NEXT:    s_mov_b32 s2, -1
4830; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4831; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4832; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4833; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4834; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4835; GFX1064-NEXT:    ; implicit-def: $vgpr0
4836; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4837; GFX1064-NEXT:    s_cbranch_execz BB23_2
4838; GFX1064-NEXT:  ; %bb.1:
4839; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4840; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4841; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4842; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4843; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v7
4844; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4845; GFX1064-NEXT:    buffer_gl0_inv
4846; GFX1064-NEXT:    buffer_gl1_inv
4847; GFX1064-NEXT:  BB23_2:
4848; GFX1064-NEXT:    v_nop
4849; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4850; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4851; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4852; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4853; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4854; GFX1064-NEXT:    s_nop 1
4855; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4856; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4857; GFX1064-NEXT:    s_endpgm
4858;
4859; GFX1032-LABEL: umin_i32_varying:
4860; GFX1032:       ; %bb.0: ; %entry
4861; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4862; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4863; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4864; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4865; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4866; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4867; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4868; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4869; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4870; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
4871; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4872; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4873; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4874; GFX1032-NEXT:    s_mov_b32 s2, -1
4875; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4876; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4877; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4878; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4879; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4880; GFX1032-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4881; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4882; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4883; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4884; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4885; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4886; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4887; GFX1032-NEXT:    ; implicit-def: $vgpr0
4888; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4889; GFX1032-NEXT:    s_cbranch_execz BB23_2
4890; GFX1032-NEXT:  ; %bb.1:
4891; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4892; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4893; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4894; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4895; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v7
4896; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4897; GFX1032-NEXT:    buffer_gl0_inv
4898; GFX1032-NEXT:    buffer_gl1_inv
4899; GFX1032-NEXT:  BB23_2:
4900; GFX1032-NEXT:    v_nop
4901; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4902; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4903; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4904; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4905; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4906; GFX1032-NEXT:    s_nop 1
4907; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4908; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4909; GFX1032-NEXT:    s_endpgm
4910entry:
4911  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4912  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4913  store i32 %old, i32 addrspace(1)* %out
4914  ret void
4915}
4916
; umin_i64_constant: atomicrmw umin (acq_rel) of the constant 5 on the 64-bit
; addrspace(3) global @local_var64; the returned old value is stored to %out.
;
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; For every prefix the expected code has this shape:
;   * an mbcnt sequence followed by a compare against 0 and s_and_saveexec, so
;     that the block containing the DS atomic runs only where that count is 0;
;   * exactly one ds_min_rtn_u64 with the operand pair {5, 0};
;   * after exec is restored, v_readfirstlane of the returned pair, then an
;     unsigned 64-bit compare/select against a per-lane value that is -1 where
;     the earlier compare was true and 5 elsewhere.
4917define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4918;
4919;
4920; GFX7LESS-LABEL: umin_i64_constant:
4921; GFX7LESS:       ; %bb.0: ; %entry
4922; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4923; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4924; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4925; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
4926; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4927; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4928; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4929; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4930; GFX7LESS-NEXT:  ; %bb.1:
4931; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4932; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4933; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4934; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4935; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4936; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4937; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4938; GFX7LESS-NEXT:    buffer_wbinvl1
4939; GFX7LESS-NEXT:  BB24_2:
4940; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4941; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4942; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4943; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4944; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4945; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4946; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4947; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4948; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4949; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4950; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4951; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4952; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4953; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4954; GFX7LESS-NEXT:    s_endpgm
4955;
4956; GFX8-LABEL: umin_i64_constant:
4957; GFX8:       ; %bb.0: ; %entry
4958; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4959; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4960; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4961; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4962; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4963; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4964; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4965; GFX8-NEXT:    s_cbranch_execz BB24_2
4966; GFX8-NEXT:  ; %bb.1:
4967; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4968; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4969; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4970; GFX8-NEXT:    s_mov_b32 m0, -1
4971; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4972; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4973; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4974; GFX8-NEXT:    buffer_wbinvl1_vol
4975; GFX8-NEXT:  BB24_2:
4976; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4977; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4978; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4979; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4980; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4981; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4982; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4983; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4984; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4985; GFX8-NEXT:    s_mov_b32 s2, -1
4986; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4987; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4988; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4989; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4990; GFX8-NEXT:    s_endpgm
4991;
4992; GFX9-LABEL: umin_i64_constant:
4993; GFX9:       ; %bb.0: ; %entry
4994; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4995; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4996; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4997; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
4998; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4999; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5000; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5001; GFX9-NEXT:    s_cbranch_execz BB24_2
5002; GFX9-NEXT:  ; %bb.1:
5003; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5004; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5005; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5006; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5007; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5008; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5009; GFX9-NEXT:    buffer_wbinvl1_vol
5010; GFX9-NEXT:  BB24_2:
5011; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5012; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5013; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5014; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5015; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5016; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5017; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5018; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5019; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5020; GFX9-NEXT:    s_mov_b32 s2, -1
5021; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5022; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5023; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5024; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5025; GFX9-NEXT:    s_endpgm
5026;
5027; GFX1064-LABEL: umin_i64_constant:
5028; GFX1064:       ; %bb.0: ; %entry
5029; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
5030; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5031; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
5032; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
5033; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5034; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5035; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5036; GFX1064-NEXT:    s_cbranch_execz BB24_2
5037; GFX1064-NEXT:  ; %bb.1:
5038; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5039; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5040; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5041; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5042; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5043; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5044; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5045; GFX1064-NEXT:    buffer_gl0_inv
5046; GFX1064-NEXT:    buffer_gl1_inv
5047; GFX1064-NEXT:  BB24_2:
5048; GFX1064-NEXT:    v_nop
5049; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5050; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
5051; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
5052; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5053; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5054; GFX1064-NEXT:    s_mov_b32 s2, -1
5055; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5056; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5057; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
5058; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
5059; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5060; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5061; GFX1064-NEXT:    s_endpgm
5062;
5063; GFX1032-LABEL: umin_i64_constant:
5064; GFX1032:       ; %bb.0: ; %entry
5065; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5066; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
5067; GFX1032-NEXT:    ; implicit-def: $vcc_hi
5068; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
5069; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5070; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5071; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5072; GFX1032-NEXT:    s_cbranch_execz BB24_2
5073; GFX1032-NEXT:  ; %bb.1:
5074; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5075; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5076; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5077; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5078; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5079; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5080; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5081; GFX1032-NEXT:    buffer_gl0_inv
5082; GFX1032-NEXT:    buffer_gl1_inv
5083; GFX1032-NEXT:  BB24_2:
5084; GFX1032-NEXT:    v_nop
5085; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5086; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
5087; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
5088; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
5089; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5090; GFX1032-NEXT:    s_mov_b32 s2, -1
5091; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5092; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
5093; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
5094; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
5095; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5096; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5097; GFX1032-NEXT:    s_endpgm
5098entry:
  ; Unsigned 64-bit min of the value at @local_var64 and the constant 5 with
  ; acq_rel ordering; %old is the value atomicrmw returns (the pre-update value).
5099  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the returned value through the addrspace(1) kernel argument so the
  ; atomic's result is used (the checks expect the *_rtn_* form of the DS op).
5100  store i64 %old, i64 addrspace(1)* %out
5101  ret void
5102}
5103