1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsic called by @add_i32_varying to obtain an addend that differs per lane.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; addrspace(3) global that the i32 atomicrmw tests in this file operate on.
10@local_var32 = addrspace(3) global i32 undef, align 4
; NOTE(review): @local_var64 is not referenced in the visible part of this file; presumably used by i64 tests further down - confirm.
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16; Atomically adds the constant 5 to @local_var32 with acq_rel ordering and stores the returned old value to %out.
17; Expected code on every target: only the lane whose v_mbcnt count is 0 issues one ds_add_rtn_u32 of (s_bcnt1 of the lane mask) * 5; each lane then rebuilds its own result as v_readfirstlane + mbcnt * 5 via v_mad_u32_u24.
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
21; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
27; GFX7LESS-NEXT:    ; mask branch BB0_2
28; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
29; GFX7LESS-NEXT:  BB0_1:
30; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
32; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s4, 5
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
37; GFX7LESS-NEXT:    buffer_wbinvl1
38; GFX7LESS-NEXT:  BB0_2:
39; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
46; GFX7LESS-NEXT:    s_endpgm
47;
48; GFX8-LABEL: add_i32_constant:
49; GFX8:       ; %bb.0: ; %entry
50; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
51; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
52; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
53; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
54; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
55; GFX8-NEXT:    ; implicit-def: $vgpr1
56; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
57; GFX8-NEXT:    ; mask branch BB0_2
58; GFX8-NEXT:    s_cbranch_execz BB0_2
59; GFX8-NEXT:  BB0_1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
61; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
62; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
63; GFX8-NEXT:    s_mov_b32 m0, -1
64; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
65; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
66; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
67; GFX8-NEXT:    buffer_wbinvl1_vol
68; GFX8-NEXT:  BB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
70; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
71; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
72; GFX8-NEXT:    s_mov_b32 s3, 0xf000
73; GFX8-NEXT:    s_mov_b32 s2, -1
74; GFX8-NEXT:    s_nop 1
75; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
88; GFX9-NEXT:    ; mask branch BB0_2
89; GFX9-NEXT:    s_cbranch_execz BB0_2
90; GFX9-NEXT:  BB0_1:
91; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
92; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
93; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
94; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
96; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
97; GFX9-NEXT:    buffer_wbinvl1_vol
98; GFX9-NEXT:  BB0_2:
99; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
107; GFX9-NEXT:    s_endpgm
108;
109; GFX1064-LABEL: add_i32_constant:
110; GFX1064:       ; %bb.0: ; %entry
111; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
112; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
113; GFX1064-NEXT:    ; implicit-def: $vgpr1
114; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
115; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
116; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
117; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
118; GFX1064-NEXT:    ; mask branch BB0_2
119; GFX1064-NEXT:    s_cbranch_execz BB0_2
120; GFX1064-NEXT:  BB0_1:
121; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
122; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
123; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
124; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
125; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
126; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
127; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
128; GFX1064-NEXT:    buffer_gl0_inv
129; GFX1064-NEXT:    buffer_gl1_inv
130; GFX1064-NEXT:  BB0_2:
131; GFX1064-NEXT:    v_nop
132; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
133; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
134; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
135; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
136; GFX1064-NEXT:    s_mov_b32 s2, -1
137; GFX1064-NEXT:    s_nop 1
138; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
140; GFX1064-NEXT:    s_endpgm
141;
142; GFX1032-LABEL: add_i32_constant:
143; GFX1032:       ; %bb.0: ; %entry
144; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
145; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
146; GFX1032-NEXT:    ; implicit-def: $vcc_hi
147; GFX1032-NEXT:    ; implicit-def: $vgpr1
148; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
149; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
150; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
151; GFX1032-NEXT:    ; mask branch BB0_2
152; GFX1032-NEXT:    s_cbranch_execz BB0_2
153; GFX1032-NEXT:  BB0_1:
154; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
155; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
156; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
157; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
158; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
159; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
160; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
161; GFX1032-NEXT:    buffer_gl0_inv
162; GFX1032-NEXT:    buffer_gl1_inv
163; GFX1032-NEXT:  BB0_2:
164; GFX1032-NEXT:    v_nop
165; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
166; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
167; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
168; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
169; GFX1032-NEXT:    s_mov_b32 s2, -1
170; GFX1032-NEXT:    s_nop 1
171; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
173; GFX1032-NEXT:    s_endpgm
174entry:
175  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
176  store i32 %old, i32 addrspace(1)* %out
177  ret void
178}
179
180define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
181; Atomically adds the kernel argument %additive to @local_var32 with acq_rel ordering and stores the returned old value to %out.
182; Expected code on every target: the lane whose v_mbcnt count is 0 issues one ds_add_rtn_u32 of %additive * (s_bcnt1 of the lane mask), computed with s_mul_i32; each lane then rebuilds its result as v_readfirstlane + %additive * mbcnt using v_mul_lo_u32 and an add.
183; GFX7LESS-LABEL: add_i32_uniform:
184; GFX7LESS:       ; %bb.0: ; %entry
185; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
186; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
187; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
188; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
189; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
190; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
191; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
192; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
193; GFX7LESS-NEXT:    ; mask branch BB1_2
194; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
195; GFX7LESS-NEXT:  BB1_1:
196; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
197; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
199; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
200; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
201; GFX7LESS-NEXT:    s_mov_b32 m0, -1
202; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
203; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
204; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
205; GFX7LESS-NEXT:    buffer_wbinvl1
206; GFX7LESS-NEXT:  BB1_2:
207; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
208; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
209; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
211; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
212; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
213; GFX7LESS-NEXT:    s_mov_b32 s6, -1
214; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
215; GFX7LESS-NEXT:    s_endpgm
216;
217; GFX8-LABEL: add_i32_uniform:
218; GFX8:       ; %bb.0: ; %entry
219; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
220; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
221; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
222; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
223; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
224; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
225; GFX8-NEXT:    ; implicit-def: $vgpr1
226; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
227; GFX8-NEXT:    ; mask branch BB1_2
228; GFX8-NEXT:    s_cbranch_execz BB1_2
229; GFX8-NEXT:  BB1_1:
230; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
231; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX8-NEXT:    s_mul_i32 s1, s0, s1
233; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
234; GFX8-NEXT:    v_mov_b32_e32 v2, s1
235; GFX8-NEXT:    s_mov_b32 m0, -1
236; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
237; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
238; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
239; GFX8-NEXT:    buffer_wbinvl1_vol
240; GFX8-NEXT:  BB1_2:
241; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
242; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
244; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
245; GFX8-NEXT:    s_mov_b32 s7, 0xf000
246; GFX8-NEXT:    s_mov_b32 s6, -1
247; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
248; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
249; GFX8-NEXT:    s_endpgm
250;
251; GFX9-LABEL: add_i32_uniform:
252; GFX9:       ; %bb.0: ; %entry
253; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
254; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
255; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
256; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
257; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
258; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
259; GFX9-NEXT:    ; implicit-def: $vgpr1
260; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
261; GFX9-NEXT:    ; mask branch BB1_2
262; GFX9-NEXT:    s_cbranch_execz BB1_2
263; GFX9-NEXT:  BB1_1:
264; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
265; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX9-NEXT:    s_mul_i32 s1, s0, s1
267; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
268; GFX9-NEXT:    v_mov_b32_e32 v2, s1
269; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
270; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
271; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GFX9-NEXT:    buffer_wbinvl1_vol
273; GFX9-NEXT:  BB1_2:
274; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
277; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
278; GFX9-NEXT:    s_mov_b32 s7, 0xf000
279; GFX9-NEXT:    s_mov_b32 s6, -1
280; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
281; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
282; GFX9-NEXT:    s_endpgm
283;
284; GFX1064-LABEL: add_i32_uniform:
285; GFX1064:       ; %bb.0: ; %entry
286; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
287; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
288; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
289; GFX1064-NEXT:    ; implicit-def: $vgpr1
290; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
291; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
292; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
293; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
294; GFX1064-NEXT:    ; mask branch BB1_2
295; GFX1064-NEXT:    s_cbranch_execz BB1_2
296; GFX1064-NEXT:  BB1_1:
297; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
298; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
299; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
300; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
301; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
302; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
303; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
304; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
305; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
306; GFX1064-NEXT:    buffer_gl0_inv
307; GFX1064-NEXT:    buffer_gl1_inv
308; GFX1064-NEXT:  BB1_2:
309; GFX1064-NEXT:    v_nop
310; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
311; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
313; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
314; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
315; GFX1064-NEXT:    s_mov_b32 s6, -1
316; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
317; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
318; GFX1064-NEXT:    s_endpgm
319;
320; GFX1032-LABEL: add_i32_uniform:
321; GFX1032:       ; %bb.0: ; %entry
322; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
323; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
324; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
325; GFX1032-NEXT:    ; implicit-def: $vcc_hi
326; GFX1032-NEXT:    ; implicit-def: $vgpr1
327; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
328; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
329; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
330; GFX1032-NEXT:    ; mask branch BB1_2
331; GFX1032-NEXT:    s_cbranch_execz BB1_2
332; GFX1032-NEXT:  BB1_1:
333; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
334; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
335; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
337; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
338; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
339; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
340; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
341; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
342; GFX1032-NEXT:    buffer_gl0_inv
343; GFX1032-NEXT:    buffer_gl1_inv
344; GFX1032-NEXT:  BB1_2:
345; GFX1032-NEXT:    v_nop
346; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
347; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
349; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
350; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
351; GFX1032-NEXT:    s_mov_b32 s6, -1
352; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
353; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
354; GFX1032-NEXT:    s_endpgm
355entry:
356  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
357  store i32 %old, i32 addrspace(1)* %out
358  ret void
359}
360
361; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
362; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
363; GFX7LESS-NOT: s_bcnt1_i32_b64
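; NOTE(review): no RUN line at the top of this file enables the DPPCOMB, GFX8MORE32 or GFX8MORE prefixes, so the five directives below are never evaluated by FileCheck.
364; DPPCOMB: v_add_u32_dpp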
365; DPPCOMB: v_add_u32_dpp
366; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
367; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
368; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
369define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
370; Atomically adds the per-lane value returned by @llvm.amdgcn.workitem.id.x to @local_var32 (acq_rel) and stores the returned old value to %out.
371; Expected code: the GFX7LESS run emits a plain per-lane ds_add_rtn_u32; the other runs combine the addend across lanes with v_add*_dpp steps (plus v_permlanex16_b32 and v_readlane/v_writelane on the gfx1010 runs) so that only the lane whose v_mbcnt count is 0 issues one ds_add_rtn_u32, then add v_readfirstlane of its result to each lane's shifted scan value.
372; GFX7LESS-LABEL: add_i32_varying:
373; GFX7LESS:       ; %bb.0: ; %entry
374; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
375; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
376; GFX7LESS-NEXT:    s_mov_b32 m0, -1
377; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
378; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
379; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
380; GFX7LESS-NEXT:    buffer_wbinvl1
381; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
382; GFX7LESS-NEXT:    s_mov_b32 s2, -1
383; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
384; GFX7LESS-NEXT:    s_endpgm
385;
386; GFX8-LABEL: add_i32_varying:
387; GFX8:       ; %bb.0: ; %entry
388; GFX8-NEXT:    v_mov_b32_e32 v2, v0
389; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
390; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
391; GFX8-NEXT:    v_mov_b32_e32 v1, 0
392; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
393; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
394; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
395; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
396; GFX8-NEXT:    s_not_b64 exec, exec
397; GFX8-NEXT:    v_mov_b32_e32 v2, 0
398; GFX8-NEXT:    s_not_b64 exec, exec
399; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
400; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
401; GFX8-NEXT:    s_nop 1
402; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
403; GFX8-NEXT:    s_nop 1
404; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
405; GFX8-NEXT:    s_nop 1
406; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
407; GFX8-NEXT:    s_nop 1
408; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
409; GFX8-NEXT:    s_nop 1
410; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
411; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
412; GFX8-NEXT:    s_nop 0
413; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
414; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
415; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
416; GFX8-NEXT:    ; implicit-def: $vgpr0
417; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
418; GFX8-NEXT:    ; mask branch BB2_2
419; GFX8-NEXT:    s_cbranch_execz BB2_2
420; GFX8-NEXT:  BB2_1:
421; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
422; GFX8-NEXT:    v_mov_b32_e32 v3, s2
423; GFX8-NEXT:    s_mov_b32 m0, -1
424; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
425; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
426; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
427; GFX8-NEXT:    buffer_wbinvl1_vol
428; GFX8-NEXT:  BB2_2:
429; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
430; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
431; GFX8-NEXT:    v_mov_b32_e32 v0, v1
432; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
433; GFX8-NEXT:    s_mov_b32 s3, 0xf000
434; GFX8-NEXT:    s_mov_b32 s2, -1
435; GFX8-NEXT:    s_nop 0
436; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
438; GFX8-NEXT:    s_endpgm
439;
440; GFX9-LABEL: add_i32_varying:
441; GFX9:       ; %bb.0: ; %entry
442; GFX9-NEXT:    v_mov_b32_e32 v2, v0
443; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
444; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
445; GFX9-NEXT:    v_mov_b32_e32 v1, 0
446; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
447; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
448; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
449; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
450; GFX9-NEXT:    s_not_b64 exec, exec
451; GFX9-NEXT:    v_mov_b32_e32 v2, 0
452; GFX9-NEXT:    s_not_b64 exec, exec
453; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
454; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
455; GFX9-NEXT:    s_nop 1
456; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
457; GFX9-NEXT:    s_nop 1
458; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
459; GFX9-NEXT:    s_nop 1
460; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
461; GFX9-NEXT:    s_nop 1
462; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
463; GFX9-NEXT:    s_nop 1
464; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
465; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
466; GFX9-NEXT:    s_nop 0
467; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
468; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
469; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
470; GFX9-NEXT:    ; implicit-def: $vgpr0
471; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
472; GFX9-NEXT:    ; mask branch BB2_2
473; GFX9-NEXT:    s_cbranch_execz BB2_2
474; GFX9-NEXT:  BB2_1:
475; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
476; GFX9-NEXT:    v_mov_b32_e32 v3, s2
477; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
478; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
479; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
480; GFX9-NEXT:    buffer_wbinvl1_vol
481; GFX9-NEXT:  BB2_2:
482; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
483; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
484; GFX9-NEXT:    v_mov_b32_e32 v0, v1
485; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
486; GFX9-NEXT:    s_mov_b32 s3, 0xf000
487; GFX9-NEXT:    s_mov_b32 s2, -1
488; GFX9-NEXT:    s_nop 0
489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
491; GFX9-NEXT:    s_endpgm
492;
493; GFX1064-LABEL: add_i32_varying:
494; GFX1064:       ; %bb.0: ; %entry
495; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
496; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
497; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
498; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
499; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
500; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
501; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
502; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
503; GFX1064-NEXT:    s_not_b64 exec, exec
504; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
505; GFX1064-NEXT:    s_not_b64 exec, exec
506; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
507; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
508; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
509; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
510; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
511; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
512; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
513; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
514; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
515; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
516; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
517; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
518; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
519; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
520; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
521; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
522; GFX1064-NEXT:    s_mov_b32 s2, -1
523; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
524; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
525; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
526; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
527; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
528; GFX1064-NEXT:    ; implicit-def: $vgpr0
529; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
530; GFX1064-NEXT:    ; mask branch BB2_2
531; GFX1064-NEXT:    s_cbranch_execz BB2_2
532; GFX1064-NEXT:  BB2_1:
533; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
534; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
535; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
536; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
537; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
538; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
539; GFX1064-NEXT:    buffer_gl0_inv
540; GFX1064-NEXT:    buffer_gl1_inv
541; GFX1064-NEXT:  BB2_2:
542; GFX1064-NEXT:    v_nop
543; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
544; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
545; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
546; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
547; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
548; GFX1064-NEXT:    s_nop 1
549; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
551; GFX1064-NEXT:    s_endpgm
552;
553; GFX1032-LABEL: add_i32_varying:
554; GFX1032:       ; %bb.0: ; %entry
555; GFX1032-NEXT:    ; implicit-def: $vcc_hi
556; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
557; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
558; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
559; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
560; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
561; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
562; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
563; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
564; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
565; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
566; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
567; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
568; GFX1032-NEXT:    s_mov_b32 s2, -1
569; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
570; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
571; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
572; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
573; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
574; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
575; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
576; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
577; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
578; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
579; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
580; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
581; GFX1032-NEXT:    ; implicit-def: $vgpr0
582; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
583; GFX1032-NEXT:    ; mask branch BB2_2
584; GFX1032-NEXT:    s_cbranch_execz BB2_2
585; GFX1032-NEXT:  BB2_1:
586; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
587; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
588; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
589; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
590; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
591; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
592; GFX1032-NEXT:    buffer_gl0_inv
593; GFX1032-NEXT:    buffer_gl1_inv
594; GFX1032-NEXT:  BB2_2:
595; GFX1032-NEXT:    v_nop
596; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
597; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
598; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
599; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
600; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
601; GFX1032-NEXT:    s_nop 1
602; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
604; GFX1032-NEXT:    s_endpgm
605entry:
606  %lane = call i32 @llvm.amdgcn.workitem.id.x()
607  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
608  store i32 %old, i32 addrspace(1)* %out
609  ret void
610}
611
612define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
613;
614;
615; GFX7LESS-LABEL: add_i32_varying_gfx1032:
616; GFX7LESS:       ; %bb.0: ; %entry
617; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
618; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
619; GFX7LESS-NEXT:    s_mov_b32 m0, -1
620; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
621; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
622; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
623; GFX7LESS-NEXT:    buffer_wbinvl1
624; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
625; GFX7LESS-NEXT:    s_mov_b32 s2, -1
626; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
627; GFX7LESS-NEXT:    s_endpgm
628;
629; GFX8-LABEL: add_i32_varying_gfx1032:
630; GFX8:       ; %bb.0: ; %entry
631; GFX8-NEXT:    v_mov_b32_e32 v2, v0
632; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
633; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
634; GFX8-NEXT:    v_mov_b32_e32 v1, 0
635; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
636; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
637; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
638; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
639; GFX8-NEXT:    s_not_b64 exec, exec
640; GFX8-NEXT:    v_mov_b32_e32 v2, 0
641; GFX8-NEXT:    s_not_b64 exec, exec
642; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
643; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
644; GFX8-NEXT:    s_nop 1
645; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
646; GFX8-NEXT:    s_nop 1
647; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
648; GFX8-NEXT:    s_nop 1
649; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
650; GFX8-NEXT:    s_nop 1
651; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
652; GFX8-NEXT:    s_nop 1
653; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
654; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
655; GFX8-NEXT:    s_nop 0
656; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
657; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
658; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
659; GFX8-NEXT:    ; implicit-def: $vgpr0
660; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
661; GFX8-NEXT:    ; mask branch BB3_2
662; GFX8-NEXT:    s_cbranch_execz BB3_2
663; GFX8-NEXT:  BB3_1:
664; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
665; GFX8-NEXT:    v_mov_b32_e32 v3, s2
666; GFX8-NEXT:    s_mov_b32 m0, -1
667; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
668; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
669; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
670; GFX8-NEXT:    buffer_wbinvl1_vol
671; GFX8-NEXT:  BB3_2:
672; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
673; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
674; GFX8-NEXT:    v_mov_b32_e32 v0, v1
675; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
676; GFX8-NEXT:    s_mov_b32 s3, 0xf000
677; GFX8-NEXT:    s_mov_b32 s2, -1
678; GFX8-NEXT:    s_nop 0
679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
681; GFX8-NEXT:    s_endpgm
682;
683; GFX9-LABEL: add_i32_varying_gfx1032:
684; GFX9:       ; %bb.0: ; %entry
685; GFX9-NEXT:    v_mov_b32_e32 v2, v0
686; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
687; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
688; GFX9-NEXT:    v_mov_b32_e32 v1, 0
689; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
690; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
691; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
692; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
693; GFX9-NEXT:    s_not_b64 exec, exec
694; GFX9-NEXT:    v_mov_b32_e32 v2, 0
695; GFX9-NEXT:    s_not_b64 exec, exec
696; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
697; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
698; GFX9-NEXT:    s_nop 1
699; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
700; GFX9-NEXT:    s_nop 1
701; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
702; GFX9-NEXT:    s_nop 1
703; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
704; GFX9-NEXT:    s_nop 1
705; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
706; GFX9-NEXT:    s_nop 1
707; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
708; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
709; GFX9-NEXT:    s_nop 0
710; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
711; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
712; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
713; GFX9-NEXT:    ; implicit-def: $vgpr0
714; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
715; GFX9-NEXT:    ; mask branch BB3_2
716; GFX9-NEXT:    s_cbranch_execz BB3_2
717; GFX9-NEXT:  BB3_1:
718; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
719; GFX9-NEXT:    v_mov_b32_e32 v3, s2
720; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
721; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
722; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
723; GFX9-NEXT:    buffer_wbinvl1_vol
724; GFX9-NEXT:  BB3_2:
725; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
726; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
727; GFX9-NEXT:    v_mov_b32_e32 v0, v1
728; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
729; GFX9-NEXT:    s_mov_b32 s3, 0xf000
730; GFX9-NEXT:    s_mov_b32 s2, -1
731; GFX9-NEXT:    s_nop 0
732; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
734; GFX9-NEXT:    s_endpgm
735;
736; GFX1064-LABEL: add_i32_varying_gfx1032:
737; GFX1064:       ; %bb.0: ; %entry
738; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
739; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
740; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
741; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
742; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
743; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
744; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
745; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
746; GFX1064-NEXT:    s_not_b64 exec, exec
747; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
748; GFX1064-NEXT:    s_not_b64 exec, exec
749; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
750; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
751; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
752; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
753; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
754; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
755; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
756; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
757; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
758; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
759; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
760; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
761; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
762; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
763; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
764; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
765; GFX1064-NEXT:    s_mov_b32 s2, -1
766; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
767; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
768; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
769; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
770; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
771; GFX1064-NEXT:    ; implicit-def: $vgpr0
772; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
773; GFX1064-NEXT:    ; mask branch BB3_2
774; GFX1064-NEXT:    s_cbranch_execz BB3_2
775; GFX1064-NEXT:  BB3_1:
776; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
777; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
778; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
779; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
780; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
781; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
782; GFX1064-NEXT:    buffer_gl0_inv
783; GFX1064-NEXT:    buffer_gl1_inv
784; GFX1064-NEXT:  BB3_2:
785; GFX1064-NEXT:    v_nop
786; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
787; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
788; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
789; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
790; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
791; GFX1064-NEXT:    s_nop 1
792; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
794; GFX1064-NEXT:    s_endpgm
795;
796; GFX1032-LABEL: add_i32_varying_gfx1032:
797; GFX1032:       ; %bb.0: ; %entry
798; GFX1032-NEXT:    ; implicit-def: $vcc_hi
799; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
800; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
801; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
802; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
803; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
804; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
805; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
806; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
807; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
808; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
809; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
810; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
811; GFX1032-NEXT:    s_mov_b32 s2, -1
812; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
813; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
814; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
815; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
816; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
817; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
818; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
819; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
820; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
821; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
822; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
823; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
824; GFX1032-NEXT:    ; implicit-def: $vgpr0
825; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
826; GFX1032-NEXT:    ; mask branch BB3_2
827; GFX1032-NEXT:    s_cbranch_execz BB3_2
828; GFX1032-NEXT:  BB3_1:
829; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
830; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
831; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
832; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
833; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
834; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
835; GFX1032-NEXT:    buffer_gl0_inv
836; GFX1032-NEXT:    buffer_gl1_inv
837; GFX1032-NEXT:  BB3_2:
838; GFX1032-NEXT:    v_nop
839; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
840; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
841; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
842; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
843; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
844; GFX1032-NEXT:    s_nop 1
845; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
847; GFX1032-NEXT:    s_endpgm
848entry:
849  %lane = call i32 @llvm.amdgcn.workitem.id.x()
850  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
851  store i32 %old, i32 addrspace(1)* %out
852  ret void
853}
854
; Test: i32 atomic add on the addrspace(3) global @local_var32 where the added
; value is the workitem id, i.e. it differs from lane to lane. The value
; returned by the atomic is stored to %out.
;
; Despite the _gfx1064 suffix, the function is checked under all five prefixes.
; What the assertions below expect:
;  - GFX7LESS checks: a direct ds_add_rtn_u32 of v0, with no cross-lane
;    combining beforehand.
;  - All other prefixes: the copy of the input (v2) is set to 0 between the
;    pair of s_not exec instructions, the lanes are combined with v_add DPP
;    steps (row_shr plus row_bcast on the tonga/gfx900 runs, row_shr plus
;    v_permlanex16 / v_readlane / v_writelane on the gfx1010 runs), a single
;    ds_add_rtn_u32 is issued under s_and_saveexec for the lane whose v_mbcnt
;    result is 0, and every lane then adds its own shifted partial (v1) to the
;    v_readfirstlane'd return value before the store.
855define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
856;
857;
858; GFX7LESS-LABEL: add_i32_varying_gfx1064:
859; GFX7LESS:       ; %bb.0: ; %entry
860; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
861; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
862; GFX7LESS-NEXT:    s_mov_b32 m0, -1
863; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
864; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
865; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
866; GFX7LESS-NEXT:    buffer_wbinvl1
867; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
868; GFX7LESS-NEXT:    s_mov_b32 s2, -1
869; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
870; GFX7LESS-NEXT:    s_endpgm
871;
872; GFX8-LABEL: add_i32_varying_gfx1064:
873; GFX8:       ; %bb.0: ; %entry
874; GFX8-NEXT:    v_mov_b32_e32 v2, v0
875; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
876; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
877; GFX8-NEXT:    v_mov_b32_e32 v1, 0
878; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
879; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
880; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
881; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
882; GFX8-NEXT:    s_not_b64 exec, exec
883; GFX8-NEXT:    v_mov_b32_e32 v2, 0
884; GFX8-NEXT:    s_not_b64 exec, exec
885; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
886; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
887; GFX8-NEXT:    s_nop 1
888; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
889; GFX8-NEXT:    s_nop 1
890; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
891; GFX8-NEXT:    s_nop 1
892; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
893; GFX8-NEXT:    s_nop 1
894; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
895; GFX8-NEXT:    s_nop 1
896; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
897; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
898; GFX8-NEXT:    s_nop 0
899; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
900; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
901; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
902; GFX8-NEXT:    ; implicit-def: $vgpr0
903; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
904; GFX8-NEXT:    ; mask branch BB4_2
905; GFX8-NEXT:    s_cbranch_execz BB4_2
906; GFX8-NEXT:  BB4_1:
907; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
908; GFX8-NEXT:    v_mov_b32_e32 v3, s2
909; GFX8-NEXT:    s_mov_b32 m0, -1
910; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
911; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
912; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
913; GFX8-NEXT:    buffer_wbinvl1_vol
914; GFX8-NEXT:  BB4_2:
915; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
916; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
917; GFX8-NEXT:    v_mov_b32_e32 v0, v1
918; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
919; GFX8-NEXT:    s_mov_b32 s3, 0xf000
920; GFX8-NEXT:    s_mov_b32 s2, -1
921; GFX8-NEXT:    s_nop 0
922; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
924; GFX8-NEXT:    s_endpgm
925;
926; GFX9-LABEL: add_i32_varying_gfx1064:
927; GFX9:       ; %bb.0: ; %entry
928; GFX9-NEXT:    v_mov_b32_e32 v2, v0
929; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
930; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
931; GFX9-NEXT:    v_mov_b32_e32 v1, 0
932; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
933; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
934; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
935; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
936; GFX9-NEXT:    s_not_b64 exec, exec
937; GFX9-NEXT:    v_mov_b32_e32 v2, 0
938; GFX9-NEXT:    s_not_b64 exec, exec
939; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
940; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
941; GFX9-NEXT:    s_nop 1
942; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
943; GFX9-NEXT:    s_nop 1
944; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
945; GFX9-NEXT:    s_nop 1
946; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
947; GFX9-NEXT:    s_nop 1
948; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
949; GFX9-NEXT:    s_nop 1
950; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
951; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
952; GFX9-NEXT:    s_nop 0
953; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
954; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
955; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
956; GFX9-NEXT:    ; implicit-def: $vgpr0
957; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
958; GFX9-NEXT:    ; mask branch BB4_2
959; GFX9-NEXT:    s_cbranch_execz BB4_2
960; GFX9-NEXT:  BB4_1:
961; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
962; GFX9-NEXT:    v_mov_b32_e32 v3, s2
963; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
964; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
965; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
966; GFX9-NEXT:    buffer_wbinvl1_vol
967; GFX9-NEXT:  BB4_2:
968; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
969; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
970; GFX9-NEXT:    v_mov_b32_e32 v0, v1
971; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
972; GFX9-NEXT:    s_mov_b32 s3, 0xf000
973; GFX9-NEXT:    s_mov_b32 s2, -1
974; GFX9-NEXT:    s_nop 0
975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
976; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
977; GFX9-NEXT:    s_endpgm
978;
979; GFX1064-LABEL: add_i32_varying_gfx1064:
980; GFX1064:       ; %bb.0: ; %entry
981; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
982; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
983; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
984; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
985; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
986; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
987; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
988; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
989; GFX1064-NEXT:    s_not_b64 exec, exec
990; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
991; GFX1064-NEXT:    s_not_b64 exec, exec
992; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
993; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
994; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
995; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
996; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
997; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
998; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
999; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1000; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
1001; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
1002; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1003; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
1004; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
1005; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
1006; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
1007; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
1008; GFX1064-NEXT:    s_mov_b32 s2, -1
1009; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
1010; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
1011; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
1012; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1013; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1014; GFX1064-NEXT:    ; implicit-def: $vgpr0
1015; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1016; GFX1064-NEXT:    ; mask branch BB4_2
1017; GFX1064-NEXT:    s_cbranch_execz BB4_2
1018; GFX1064-NEXT:  BB4_1:
1019; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1020; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
1021; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1022; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1023; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v7
1024; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1025; GFX1064-NEXT:    buffer_gl0_inv
1026; GFX1064-NEXT:    buffer_gl1_inv
1027; GFX1064-NEXT:  BB4_2:
1028; GFX1064-NEXT:    v_nop
1029; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1030; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1031; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
1032; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1033; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1034; GFX1064-NEXT:    s_nop 1
1035; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1037; GFX1064-NEXT:    s_endpgm
1038;
1039; GFX1032-LABEL: add_i32_varying_gfx1064:
1040; GFX1032:       ; %bb.0: ; %entry
1041; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1042; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1043; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1044; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1045; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1046; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1047; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
1048; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1049; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1050; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1051; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1052; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1053; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1054; GFX1032-NEXT:    s_mov_b32 s2, -1
1055; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1056; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1057; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1058; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
1059; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
1060; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1061; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
1062; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
1063; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
1064; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
1065; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1066; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1067; GFX1032-NEXT:    ; implicit-def: $vgpr0
1068; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1069; GFX1032-NEXT:    ; mask branch BB4_2
1070; GFX1032-NEXT:    s_cbranch_execz BB4_2
1071; GFX1032-NEXT:  BB4_1:
1072; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1073; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
1074; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1075; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1076; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v7
1077; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1078; GFX1032-NEXT:    buffer_gl0_inv
1079; GFX1032-NEXT:    buffer_gl1_inv
1080; GFX1032-NEXT:  BB4_2:
1081; GFX1032-NEXT:    v_nop
1082; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1083; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1084; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1085; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1086; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1087; GFX1032-NEXT:    s_nop 1
1088; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1089; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1090; GFX1032-NEXT:    s_endpgm
1091entry:
  ; Per-lane (varying) operand for the atomic: the workitem id in x.
1092  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic add on the addrspace(3) global; %old is the value returned
  ; by the atomicrmw.
1093  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing %old keeps the atomic's return value live, which is why the checks
  ; above use the returning ds_add_rtn_u32 form.
1094  store i32 %old, i32 addrspace(1)* %out
1095  ret void
1096}
1097
; Test: i64 atomic add of the constant 5 to the addrspace(3) global
; @local_var64; the value returned by the atomic is stored to %out.
;
; All five prefixes expect the same overall shape:
;  - v_cmp_ne_u32 1, 0 produces a lane mask and v_mbcnt_lo (plus v_mbcnt_hi on
;    the 64-wide runs) is applied to it (presumably yielding, per lane, the
;    number of active lanes below it -- confirm against the ISA docs).
;  - Only the lane whose v_mbcnt result is 0 enters the s_and_saveexec region
;    and issues one ds_add_rtn_u64 whose operand is 5 * s_bcnt1(mask).
;  - After exec is restored, the result is broadcast with v_readfirstlane and
;    each lane adds 5 * its v_mbcnt value: separate 24-bit multiplies plus
;    v_add/v_addc in the GFX7LESS checks, a single v_mad_u64_u32 elsewhere.
1098define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1099;
1100;
1101; GFX7LESS-LABEL: add_i64_constant:
1102; GFX7LESS:       ; %bb.0: ; %entry
1103; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1104; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1105; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1106; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1107; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1108; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1109; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1110; GFX7LESS-NEXT:    ; mask branch BB5_2
1111; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1112; GFX7LESS-NEXT:  BB5_1:
1113; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1114; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1115; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1116; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1117; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1118; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1119; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1120; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1121; GFX7LESS-NEXT:    buffer_wbinvl1
1122; GFX7LESS-NEXT:  BB5_2:
1123; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1124; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1125; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1126; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1127; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1128; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1129; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1130; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1131; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1132; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1133; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1135; GFX7LESS-NEXT:    s_endpgm
1136;
1137; GFX8-LABEL: add_i64_constant:
1138; GFX8:       ; %bb.0: ; %entry
1139; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1140; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1141; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1142; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1143; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1144; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1145; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1146; GFX8-NEXT:    ; mask branch BB5_2
1147; GFX8-NEXT:    s_cbranch_execz BB5_2
1148; GFX8-NEXT:  BB5_1:
1149; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1150; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1151; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1152; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1153; GFX8-NEXT:    s_mov_b32 m0, -1
1154; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1155; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1156; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1157; GFX8-NEXT:    buffer_wbinvl1_vol
1158; GFX8-NEXT:  BB5_2:
1159; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1160; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1161; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1162; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1163; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1164; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1165; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1166; GFX8-NEXT:    s_mov_b32 s2, -1
1167; GFX8-NEXT:    s_nop 2
1168; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1170; GFX8-NEXT:    s_endpgm
1171;
1172; GFX9-LABEL: add_i64_constant:
1173; GFX9:       ; %bb.0: ; %entry
1174; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1175; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1176; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1177; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1178; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1179; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1180; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1181; GFX9-NEXT:    ; mask branch BB5_2
1182; GFX9-NEXT:    s_cbranch_execz BB5_2
1183; GFX9-NEXT:  BB5_1:
1184; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1185; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1186; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1187; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1188; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1189; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1190; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1191; GFX9-NEXT:    buffer_wbinvl1_vol
1192; GFX9-NEXT:  BB5_2:
1193; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1194; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1195; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1196; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1197; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1198; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1199; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1200; GFX9-NEXT:    s_mov_b32 s2, -1
1201; GFX9-NEXT:    s_nop 2
1202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1204; GFX9-NEXT:    s_endpgm
1205;
1206; GFX1064-LABEL: add_i64_constant:
1207; GFX1064:       ; %bb.0: ; %entry
1208; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1209; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1210; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1211; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1212; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1213; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1214; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1215; GFX1064-NEXT:    ; mask branch BB5_2
1216; GFX1064-NEXT:    s_cbranch_execz BB5_2
1217; GFX1064-NEXT:  BB5_1:
1218; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1219; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1220; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
1221; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1222; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1223; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1224; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1225; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1226; GFX1064-NEXT:    buffer_gl0_inv
1227; GFX1064-NEXT:    buffer_gl1_inv
1228; GFX1064-NEXT:  BB5_2:
1229; GFX1064-NEXT:    v_nop
1230; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1231; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1232; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1233; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1234; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1235; GFX1064-NEXT:    s_mov_b32 s2, -1
1236; GFX1064-NEXT:    s_nop 2
1237; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1239; GFX1064-NEXT:    s_endpgm
1240;
1241; GFX1032-LABEL: add_i64_constant:
1242; GFX1032:       ; %bb.0: ; %entry
1243; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1244; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
1245; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1246; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1247; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1248; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1249; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1250; GFX1032-NEXT:    ; mask branch BB5_2
1251; GFX1032-NEXT:    s_cbranch_execz BB5_2
1252; GFX1032-NEXT:  BB5_1:
1253; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1254; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1255; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1256; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1257; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1258; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1259; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1260; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1261; GFX1032-NEXT:    buffer_gl0_inv
1262; GFX1032-NEXT:    buffer_gl1_inv
1263; GFX1032-NEXT:  BB5_2:
1264; GFX1032-NEXT:    v_nop
1265; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1266; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1267; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1268; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1269; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1270; GFX1032-NEXT:    s_mov_b32 s2, -1
1271; GFX1032-NEXT:    s_nop 2
1272; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1274; GFX1032-NEXT:    s_endpgm
1275entry:
  ; Same constant operand (5) in every lane; acq_rel atomic add on the
  ; addrspace(3) i64 global. %old is the value returned by the atomicrmw.
1276  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing %old keeps the atomic's return value live, which is why the checks
  ; above use the returning ds_add_rtn_u64 form.
1277  store i64 %old, i64 addrspace(1)* %out
1278  ret void
1279}
1280
1281define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1282;
1283;
1284; GFX7LESS-LABEL: add_i64_uniform:
1285; GFX7LESS:       ; %bb.0: ; %entry
1286; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1287; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1288; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1289; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1290; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1291; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1292; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1293; GFX7LESS-NEXT:    ; mask branch BB6_2
1294; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1295; GFX7LESS-NEXT:  BB6_1:
1296; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1297; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1298; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1299; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1300; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1301; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1302; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1303; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1304; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1305; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1306; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1307; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1308; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1309; GFX7LESS-NEXT:    buffer_wbinvl1
1310; GFX7LESS-NEXT:  BB6_2:
1311; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1312; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1313; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1314; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1316; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1317; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1318; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1319; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1320; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1321; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1322; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1323; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1324; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1325; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1326; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1327; GFX7LESS-NEXT:    s_endpgm
1328;
1329; GFX8-LABEL: add_i64_uniform:
1330; GFX8:       ; %bb.0: ; %entry
1331; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1332; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1333; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1334; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1335; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1336; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1337; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1338; GFX8-NEXT:    ; mask branch BB6_2
1339; GFX8-NEXT:    s_cbranch_execz BB6_2
1340; GFX8-NEXT:  BB6_1:
1341; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1342; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1343; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1345; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1346; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1347; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1348; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1349; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1350; GFX8-NEXT:    s_mov_b32 m0, -1
1351; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1352; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1353; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1354; GFX8-NEXT:    buffer_wbinvl1_vol
1355; GFX8-NEXT:  BB6_2:
1356; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1357; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX8-NEXT:    s_mov_b32 s4, s0
1359; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1360; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1361; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1362; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1363; GFX8-NEXT:    s_mov_b32 s5, s1
1364; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1365; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1366; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1367; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1368; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1369; GFX8-NEXT:    s_mov_b32 s6, -1
1370; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1371; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1372; GFX8-NEXT:    s_endpgm
1373;
1374; GFX9-LABEL: add_i64_uniform:
1375; GFX9:       ; %bb.0: ; %entry
1376; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1377; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1378; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1379; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1380; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1381; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1382; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1383; GFX9-NEXT:    ; mask branch BB6_2
1384; GFX9-NEXT:    s_cbranch_execz BB6_2
1385; GFX9-NEXT:  BB6_1:
1386; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1387; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX9-NEXT:    v_mul_hi_u32 v2, s2, v1
1390; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1391; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1392; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1393; GFX9-NEXT:    v_add_u32_e32 v2, s7, v2
1394; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1395; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1396; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1397; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1398; GFX9-NEXT:    buffer_wbinvl1_vol
1399; GFX9-NEXT:  BB6_2:
1400; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1401; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1403; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1404; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1405; GFX9-NEXT:    s_mov_b32 s4, s0
1406; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1407; GFX9-NEXT:    s_mov_b32 s5, s1
1408; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1409; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1410; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1411; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1412; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1413; GFX9-NEXT:    s_mov_b32 s6, -1
1414; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1415; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1416; GFX9-NEXT:    s_endpgm
1417;
1418; GFX1064-LABEL: add_i64_uniform:
1419; GFX1064:       ; %bb.0: ; %entry
1420; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1421; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1422; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1423; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1424; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1425; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1426; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1427; GFX1064-NEXT:    ; mask branch BB6_2
1428; GFX1064-NEXT:    s_cbranch_execz BB6_2
1429; GFX1064-NEXT:  BB6_1:
1430; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1431; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1432; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX1064-NEXT:    v_mul_hi_u32 v2, s2, s6
1434; GFX1064-NEXT:    s_mul_i32 s7, s2, s6
1435; GFX1064-NEXT:    s_mul_i32 s6, s3, s6
1436; GFX1064-NEXT:    v_mov_b32_e32 v1, s7
1437; GFX1064-NEXT:    v_add_nc_u32_e32 v2, s6, v2
1438; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1439; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1440; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1441; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1442; GFX1064-NEXT:    buffer_gl0_inv
1443; GFX1064-NEXT:    buffer_gl1_inv
1444; GFX1064-NEXT:  BB6_2:
1445; GFX1064-NEXT:    v_nop
1446; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1447; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1448; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1449; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1450; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1451; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
1452; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
1453; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1454; GFX1064-NEXT:    s_mov_b32 s2, -1
1455; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1456; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s4, v0
1457; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc
1458; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1459; GFX1064-NEXT:    s_endpgm
1460;
1461; GFX1032-LABEL: add_i64_uniform:
1462; GFX1032:       ; %bb.0: ; %entry
1463; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1464; GFX1032-NEXT:    v_cmp_ne_u32_e64 s5, 1, 0
1465; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1466; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1467; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1468; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1469; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1470; GFX1032-NEXT:    ; mask branch BB6_2
1471; GFX1032-NEXT:    s_cbranch_execz BB6_2
1472; GFX1032-NEXT:  BB6_1:
1473; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1474; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1475; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX1032-NEXT:    v_mul_hi_u32 v2, s2, s5
1477; GFX1032-NEXT:    s_mul_i32 s6, s2, s5
1478; GFX1032-NEXT:    s_mul_i32 s5, s3, s5
1479; GFX1032-NEXT:    v_mov_b32_e32 v1, s6
1480; GFX1032-NEXT:    v_add_nc_u32_e32 v2, s5, v2
1481; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1482; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1483; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1484; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1485; GFX1032-NEXT:    buffer_gl0_inv
1486; GFX1032-NEXT:    buffer_gl1_inv
1487; GFX1032-NEXT:  BB6_2:
1488; GFX1032-NEXT:    v_nop
1489; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1490; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1492; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1493; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1494; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
1495; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
1496; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1497; GFX1032-NEXT:    s_mov_b32 s2, -1
1498; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1499; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s4, v0
1500; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
1501; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1502; GFX1032-NEXT:    s_endpgm
1503entry:
1504  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1505  store i64 %old, i64 addrspace(1)* %out
1506  ret void
1507}
1508
; add_i64_varying: atomicrmw add (acq_rel) on @local_var64 whose operand is the
; zero-extended workitem id; the returned old value is stored to %out.
; On every checked target the expected output is a single ds_add_rtn_u64 with
; operand v[0:1] (v1 set to 0), and none of the v_mbcnt / s_bcnt1 sequence
; that the uniform-operand tests in this file expect.
; NOTE(review): the GCN prefix used by the next three directives is not among
; the check prefixes on the run lines visible at the top of this file, so
; FileCheck does not appear to evaluate them -- confirm before relying on them.
1509; GCN-NOT: v_mbcnt_lo_u32_b32
1510; GCN-NOT: v_mbcnt_hi_u32_b32
1511; GCN-NOT: s_bcnt1_i32_b64
1512define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1513;
1514;
1515; GFX7LESS-LABEL: add_i64_varying:
1516; GFX7LESS:       ; %bb.0: ; %entry
1517; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1518; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1519; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1520; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1521; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1522; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1523; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1524; GFX7LESS-NEXT:    buffer_wbinvl1
1525; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1526; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1527; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1528; GFX7LESS-NEXT:    s_endpgm
1529;
1530; GFX8-LABEL: add_i64_varying:
1531; GFX8:       ; %bb.0: ; %entry
1532; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1533; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1534; GFX8-NEXT:    s_mov_b32 m0, -1
1535; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1536; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1537; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1538; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1539; GFX8-NEXT:    buffer_wbinvl1_vol
1540; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1541; GFX8-NEXT:    s_mov_b32 s2, -1
1542; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1543; GFX8-NEXT:    s_endpgm
1544;
1545; GFX9-LABEL: add_i64_varying:
1546; GFX9:       ; %bb.0: ; %entry
1547; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1548; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1549; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1550; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1551; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1552; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1553; GFX9-NEXT:    buffer_wbinvl1_vol
1554; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1555; GFX9-NEXT:    s_mov_b32 s2, -1
1556; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1557; GFX9-NEXT:    s_endpgm
1558;
1559; GFX1064-LABEL: add_i64_varying:
1560; GFX1064:       ; %bb.0: ; %entry
1561; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1562; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1563; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1564; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1565; GFX1064-NEXT:    s_mov_b32 s2, -1
1566; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1567; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1568; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1569; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1570; GFX1064-NEXT:    buffer_gl0_inv
1571; GFX1064-NEXT:    buffer_gl1_inv
1572; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1573; GFX1064-NEXT:    s_endpgm
1574;
1575; GFX1032-LABEL: add_i64_varying:
1576; GFX1032:       ; %bb.0: ; %entry
1577; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1578; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1579; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1580; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1581; GFX1032-NEXT:    s_mov_b32 s2, -1
1582; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1583; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1584; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1585; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1586; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1587; GFX1032-NEXT:    buffer_gl0_inv
1588; GFX1032-NEXT:    buffer_gl1_inv
1589; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1590; GFX1032-NEXT:    s_endpgm
1591entry:
; The atomic operand is derived from the workitem id intrinsic (widened to
; i64), i.e. it is neither a constant nor a kernel argument.
1592  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1593  %zext = zext i32 %lane to i64
1594  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
; Store the value returned by the atomic so the result stays live.
1595  store i64 %old, i64 addrspace(1)* %out
1596  ret void
1597}
1598
; sub_i32_constant: atomicrmw sub (acq_rel) of the constant 5 from
; @local_var32; the value returned by the atomic is stored to %out.
; Shape of the expected output, common to all checked targets:
;   * v_mbcnt_* computes a per-lane count in v0 from the mask written by
;     v_cmp_ne 1, 0 (presumably the active-lane mask -- confirm);
;   * only lanes whose count is 0 enter BB8_1 (s_and_saveexec on
;     v_cmp_eq 0, v0) and issue one ds_sub_rtn_u32 whose operand is
;     s_bcnt1(mask) multiplied by 5 via v_mul_u32_u24;
;   * after exec is restored, each lane stores
;     v_readfirstlane(returned value) - 5 * v0.
; The wave32 configuration works on a 32-bit mask, so it expects only
; v_mbcnt_lo and s_bcnt1_i32_b32.
1599define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1600;
1601;
1602; GFX7LESS-LABEL: sub_i32_constant:
1603; GFX7LESS:       ; %bb.0: ; %entry
1604; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1605; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1606; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1607; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1608; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1609; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1610; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1611; GFX7LESS-NEXT:    ; mask branch BB8_2
1612; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1613; GFX7LESS-NEXT:  BB8_1:
1614; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1615; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1616; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s4, 5
1617; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1618; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1619; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1620; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1621; GFX7LESS-NEXT:    buffer_wbinvl1
1622; GFX7LESS-NEXT:  BB8_2:
1623; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1624; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1625; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1626; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1627; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1628; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1629; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1630; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1631; GFX7LESS-NEXT:    s_endpgm
1632;
1633; GFX8-LABEL: sub_i32_constant:
1634; GFX8:       ; %bb.0: ; %entry
1635; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1636; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1637; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1638; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1639; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1640; GFX8-NEXT:    ; implicit-def: $vgpr1
1641; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1642; GFX8-NEXT:    ; mask branch BB8_2
1643; GFX8-NEXT:    s_cbranch_execz BB8_2
1644; GFX8-NEXT:  BB8_1:
1645; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1646; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1647; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1648; GFX8-NEXT:    s_mov_b32 m0, -1
1649; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1650; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1651; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1652; GFX8-NEXT:    buffer_wbinvl1_vol
1653; GFX8-NEXT:  BB8_2:
1654; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1655; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1656; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1657; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1658; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1659; GFX8-NEXT:    s_mov_b32 s2, -1
1660; GFX8-NEXT:    s_nop 0
1661; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1662; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1663; GFX8-NEXT:    s_endpgm
1664;
1665; GFX9-LABEL: sub_i32_constant:
1666; GFX9:       ; %bb.0: ; %entry
1667; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1668; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
1669; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1670; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1671; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1672; GFX9-NEXT:    ; implicit-def: $vgpr1
1673; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1674; GFX9-NEXT:    ; mask branch BB8_2
1675; GFX9-NEXT:    s_cbranch_execz BB8_2
1676; GFX9-NEXT:  BB8_1:
1677; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1678; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1679; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1680; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1681; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1682; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1683; GFX9-NEXT:    buffer_wbinvl1_vol
1684; GFX9-NEXT:  BB8_2:
1685; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1686; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1687; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1688; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1689; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1690; GFX9-NEXT:    s_mov_b32 s2, -1
1691; GFX9-NEXT:    s_nop 0
1692; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1694; GFX9-NEXT:    s_endpgm
1695;
1696; GFX1064-LABEL: sub_i32_constant:
1697; GFX1064:       ; %bb.0: ; %entry
1698; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1699; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1700; GFX1064-NEXT:    ; implicit-def: $vgpr1
1701; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1702; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1703; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1704; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1705; GFX1064-NEXT:    ; mask branch BB8_2
1706; GFX1064-NEXT:    s_cbranch_execz BB8_2
1707; GFX1064-NEXT:  BB8_1:
1708; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1709; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1710; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1711; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1712; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1713; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1714; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1715; GFX1064-NEXT:    buffer_gl0_inv
1716; GFX1064-NEXT:    buffer_gl1_inv
1717; GFX1064-NEXT:  BB8_2:
1718; GFX1064-NEXT:    v_nop
1719; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1720; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1721; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1722; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1723; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1724; GFX1064-NEXT:    s_mov_b32 s2, -1
1725; GFX1064-NEXT:    s_nop 0
1726; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1728; GFX1064-NEXT:    s_endpgm
1729;
1730; GFX1032-LABEL: sub_i32_constant:
1731; GFX1032:       ; %bb.0: ; %entry
1732; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1733; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
1734; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1735; GFX1032-NEXT:    ; implicit-def: $vgpr1
1736; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1737; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1738; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1739; GFX1032-NEXT:    ; mask branch BB8_2
1740; GFX1032-NEXT:    s_cbranch_execz BB8_2
1741; GFX1032-NEXT:  BB8_1:
1742; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1743; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1744; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1745; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1746; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1747; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1748; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1749; GFX1032-NEXT:    buffer_gl0_inv
1750; GFX1032-NEXT:    buffer_gl1_inv
1751; GFX1032-NEXT:  BB8_2:
1752; GFX1032-NEXT:    v_nop
1753; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1754; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1755; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1756; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1757; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1758; GFX1032-NEXT:    s_mov_b32 s2, -1
1759; GFX1032-NEXT:    s_nop 0
1760; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1761; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1762; GFX1032-NEXT:    s_endpgm
1763entry:
; The operand is the literal 5, identical for every lane.
1764  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
; Store the value returned by the atomic so the result stays live.
1765  store i32 %old, i32 addrspace(1)* %out
1766  ret void
1767}
1768
; sub_i32_uniform: atomicrmw sub (acq_rel) of the kernel argument %subitive
; from @local_var32; the value returned by the atomic is stored to %out.
; Shape of the expected output, common to all checked targets:
;   * the argument is loaded into an SGPR with s_load_dword;
;   * v_mbcnt_* computes a per-lane count in v0 from the mask written by
;     v_cmp_ne 1, 0 (presumably the active-lane mask -- confirm);
;   * only lanes whose count is 0 enter BB9_1 and issue one ds_sub_rtn_u32
;     whose operand is s_mul_i32(argument, s_bcnt1(mask));
;   * after exec is restored, each lane stores
;     v_readfirstlane(returned value) - v_mul_lo_u32(argument, v0).
; The wave32 configuration works on a 32-bit mask, so it expects only
; v_mbcnt_lo and s_bcnt1_i32_b32.
1769define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1770;
1771;
1772; GFX7LESS-LABEL: sub_i32_uniform:
1773; GFX7LESS:       ; %bb.0: ; %entry
1774; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1775; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0xb
1776; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1777; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1778; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1779; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1780; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1781; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1782; GFX7LESS-NEXT:    ; mask branch BB9_2
1783; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1784; GFX7LESS-NEXT:  BB9_1:
1785; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1786; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
1788; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1789; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1790; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1791; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1792; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1793; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1794; GFX7LESS-NEXT:    buffer_wbinvl1
1795; GFX7LESS-NEXT:  BB9_2:
1796; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1797; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1798; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1800; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1801; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1802; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1803; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1804; GFX7LESS-NEXT:    s_endpgm
1805;
1806; GFX8-LABEL: sub_i32_uniform:
1807; GFX8:       ; %bb.0: ; %entry
1808; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1809; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1810; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1811; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1812; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1813; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1814; GFX8-NEXT:    ; implicit-def: $vgpr1
1815; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1816; GFX8-NEXT:    ; mask branch BB9_2
1817; GFX8-NEXT:    s_cbranch_execz BB9_2
1818; GFX8-NEXT:  BB9_1:
1819; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
1820; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1822; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1823; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1824; GFX8-NEXT:    s_mov_b32 m0, -1
1825; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1826; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1827; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1828; GFX8-NEXT:    buffer_wbinvl1_vol
1829; GFX8-NEXT:  BB9_2:
1830; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1831; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1833; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1834; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1835; GFX8-NEXT:    s_mov_b32 s6, -1
1836; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1837; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1838; GFX8-NEXT:    s_endpgm
1839;
1840; GFX9-LABEL: sub_i32_uniform:
1841; GFX9:       ; %bb.0: ; %entry
1842; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1843; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
1844; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
1845; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1846; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1847; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1848; GFX9-NEXT:    ; implicit-def: $vgpr1
1849; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1850; GFX9-NEXT:    ; mask branch BB9_2
1851; GFX9-NEXT:    s_cbranch_execz BB9_2
1852; GFX9-NEXT:  BB9_1:
1853; GFX9-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
1854; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX9-NEXT:    s_mul_i32 s1, s0, s1
1856; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1857; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1858; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1859; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1860; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1861; GFX9-NEXT:    buffer_wbinvl1_vol
1862; GFX9-NEXT:  BB9_2:
1863; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
1866; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1867; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1868; GFX9-NEXT:    s_mov_b32 s6, -1
1869; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1870; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1871; GFX9-NEXT:    s_endpgm
1872;
1873; GFX1064-LABEL: sub_i32_uniform:
1874; GFX1064:       ; %bb.0: ; %entry
1875; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1876; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1877; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
1878; GFX1064-NEXT:    ; implicit-def: $vgpr1
1879; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1880; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1881; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1882; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1883; GFX1064-NEXT:    ; mask branch BB9_2
1884; GFX1064-NEXT:    s_cbranch_execz BB9_2
1885; GFX1064-NEXT:  BB9_1:
1886; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1887; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1888; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
1890; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
1891; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1892; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1893; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1894; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1895; GFX1064-NEXT:    buffer_gl0_inv
1896; GFX1064-NEXT:    buffer_gl1_inv
1897; GFX1064-NEXT:  BB9_2:
1898; GFX1064-NEXT:    v_nop
1899; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
1900; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1901; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
1902; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1903; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1904; GFX1064-NEXT:    s_mov_b32 s6, -1
1905; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1906; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1907; GFX1064-NEXT:    s_endpgm
1908;
1909; GFX1032-LABEL: sub_i32_uniform:
1910; GFX1032:       ; %bb.0: ; %entry
1911; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1912; GFX1032-NEXT:    s_load_dword s0, s[0:1], 0x2c
1913; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
1914; GFX1032-NEXT:    ; implicit-def: $vcc_hi
1915; GFX1032-NEXT:    ; implicit-def: $vgpr1
1916; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1917; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1918; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1919; GFX1032-NEXT:    ; mask branch BB9_2
1920; GFX1032-NEXT:    s_cbranch_execz BB9_2
1921; GFX1032-NEXT:  BB9_1:
1922; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1923; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1924; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
1926; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
1927; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1928; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1929; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1930; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1931; GFX1032-NEXT:    buffer_gl0_inv
1932; GFX1032-NEXT:    buffer_gl1_inv
1933; GFX1032-NEXT:  BB9_2:
1934; GFX1032-NEXT:    v_nop
1935; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1936; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1937; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
1938; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1939; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1940; GFX1032-NEXT:    s_mov_b32 s6, -1
1941; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1942; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1943; GFX1032-NEXT:    s_endpgm
1944entry:
; The operand is a kernel argument, so it is the same value for every lane
; but unknown at compile time.
1945  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
; Store the value returned by the atomic so the result stays live.
1946  store i32 %old, i32 addrspace(1)* %out
1947  ret void
1948}
1949
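; NOTE(review): DPPCOMB, GFX8MORE32 and GFX8MORE are not among the check
; prefixes passed to FileCheck by the run lines visible at the top of this
; file, so the five directives below that use them do not appear to be
; evaluated -- confirm, then either enable those prefixes or drop the lines.
1950; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
1951; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
1952; GFX7LESS-NOT: s_bcnt1_i32_b64
1953; DPPCOMB: v_add_u32_dpp
1954; DPPCOMB: v_add_u32_dpp
1955; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
1956; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
1957; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]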
1958define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1959;
1960;
1961; GFX7LESS-LABEL: sub_i32_varying:
1962; GFX7LESS:       ; %bb.0: ; %entry
1963; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1964; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1965; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1966; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1967; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1968; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1969; GFX7LESS-NEXT:    buffer_wbinvl1
1970; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1971; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1972; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1973; GFX7LESS-NEXT:    s_endpgm
1974;
1975; GFX8-LABEL: sub_i32_varying:
1976; GFX8:       ; %bb.0: ; %entry
1977; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1978; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1979; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1980; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1981; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1982; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
1983; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1984; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1985; GFX8-NEXT:    s_not_b64 exec, exec
1986; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1987; GFX8-NEXT:    s_not_b64 exec, exec
1988; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1989; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1990; GFX8-NEXT:    s_nop 1
1991; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1992; GFX8-NEXT:    s_nop 1
1993; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1994; GFX8-NEXT:    s_nop 1
1995; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1996; GFX8-NEXT:    s_nop 1
1997; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1998; GFX8-NEXT:    s_nop 1
1999; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2000; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2001; GFX8-NEXT:    s_nop 0
2002; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2003; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2004; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2005; GFX8-NEXT:    ; implicit-def: $vgpr0
2006; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2007; GFX8-NEXT:    ; mask branch BB10_2
2008; GFX8-NEXT:    s_cbranch_execz BB10_2
2009; GFX8-NEXT:  BB10_1:
2010; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2011; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2012; GFX8-NEXT:    s_mov_b32 m0, -1
2013; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2014; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2015; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2016; GFX8-NEXT:    buffer_wbinvl1_vol
2017; GFX8-NEXT:  BB10_2:
2018; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2019; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2020; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2021; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2022; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2023; GFX8-NEXT:    s_mov_b32 s2, -1
2024; GFX8-NEXT:    s_nop 0
2025; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2026; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2027; GFX8-NEXT:    s_endpgm
2028;
2029; GFX9-LABEL: sub_i32_varying:
2030; GFX9:       ; %bb.0: ; %entry
2031; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2032; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2033; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2034; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2035; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2036; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2037; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2038; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2039; GFX9-NEXT:    s_not_b64 exec, exec
2040; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2041; GFX9-NEXT:    s_not_b64 exec, exec
2042; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2043; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2044; GFX9-NEXT:    s_nop 1
2045; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2046; GFX9-NEXT:    s_nop 1
2047; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2048; GFX9-NEXT:    s_nop 1
2049; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2050; GFX9-NEXT:    s_nop 1
2051; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2052; GFX9-NEXT:    s_nop 1
2053; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2054; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2055; GFX9-NEXT:    s_nop 0
2056; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2057; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2058; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2059; GFX9-NEXT:    ; implicit-def: $vgpr0
2060; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2061; GFX9-NEXT:    ; mask branch BB10_2
2062; GFX9-NEXT:    s_cbranch_execz BB10_2
2063; GFX9-NEXT:  BB10_1:
2064; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2065; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2066; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2067; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2068; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2069; GFX9-NEXT:    buffer_wbinvl1_vol
2070; GFX9-NEXT:  BB10_2:
2071; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2072; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2073; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2074; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2075; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2076; GFX9-NEXT:    s_mov_b32 s2, -1
2077; GFX9-NEXT:    s_nop 0
2078; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2080; GFX9-NEXT:    s_endpgm
2081;
2082; GFX1064-LABEL: sub_i32_varying:
2083; GFX1064:       ; %bb.0: ; %entry
2084; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2085; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2086; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2087; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2088; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2089; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2090; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2091; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2092; GFX1064-NEXT:    s_not_b64 exec, exec
2093; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2094; GFX1064-NEXT:    s_not_b64 exec, exec
2095; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2096; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2097; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2098; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2099; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2100; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2101; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2102; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2103; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2104; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2105; GFX1064-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2106; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2107; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2108; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2109; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2110; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2111; GFX1064-NEXT:    s_mov_b32 s2, -1
2112; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2113; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2114; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2115; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2116; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2117; GFX1064-NEXT:    ; implicit-def: $vgpr0
2118; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2119; GFX1064-NEXT:    ; mask branch BB10_2
2120; GFX1064-NEXT:    s_cbranch_execz BB10_2
2121; GFX1064-NEXT:  BB10_1:
2122; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2123; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2124; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2125; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2126; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2127; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2128; GFX1064-NEXT:    buffer_gl0_inv
2129; GFX1064-NEXT:    buffer_gl1_inv
2130; GFX1064-NEXT:  BB10_2:
2131; GFX1064-NEXT:    v_nop
2132; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2133; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2134; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2135; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2136; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2137; GFX1064-NEXT:    s_nop 1
2138; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2139; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2140; GFX1064-NEXT:    s_endpgm
2141;
2142; GFX1032-LABEL: sub_i32_varying:
2143; GFX1032:       ; %bb.0: ; %entry
2144; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2145; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2146; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2147; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2148; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2149; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2150; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
2151; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2152; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2153; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2154; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2155; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2156; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2157; GFX1032-NEXT:    s_mov_b32 s2, -1
2158; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2159; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2160; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2161; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2162; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2163; GFX1032-NEXT:    v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2164; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2165; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2166; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2167; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2168; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2169; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2170; GFX1032-NEXT:    ; implicit-def: $vgpr0
2171; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2172; GFX1032-NEXT:    ; mask branch BB10_2
2173; GFX1032-NEXT:    s_cbranch_execz BB10_2
2174; GFX1032-NEXT:  BB10_1:
2175; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2176; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2177; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2178; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2179; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v7
2180; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2181; GFX1032-NEXT:    buffer_gl0_inv
2182; GFX1032-NEXT:    buffer_gl1_inv
2183; GFX1032-NEXT:  BB10_2:
2184; GFX1032-NEXT:    v_nop
2185; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2186; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2187; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2188; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2189; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2190; GFX1032-NEXT:    s_nop 1
2191; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2192; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2193; GFX1032-NEXT:    s_endpgm
2194entry:
2195  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2196  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2197  store i32 %old, i32 addrspace(1)* %out
2198  ret void
2199}
2200
; sub_i64_constant: every work-item atomically subtracts the constant 5 from
; the 64-bit variable @local_var64 in addrspace(3) (acq_rel) and stores the
; returned old value to %out.
;
; In the expected code for all five check prefixes below, only the lane whose
; mbcnt result is 0 executes the DS atomic; it subtracts 5 multiplied by the
; popcount of the lane mask produced by "v_cmp_ne_u32 ..., 1, 0" in a single
; ds_sub_rtn_u64. After exec is restored, each lane rebuilds its own return
; value as readfirstlane(old) - 5 * mbcnt.
2201define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2202;
2203;
2204; GFX7LESS-LABEL: sub_i64_constant:
2205; GFX7LESS:       ; %bb.0: ; %entry
2206; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2207; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2208; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2209; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2210; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2211; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2212; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2213; GFX7LESS-NEXT:    ; mask branch BB11_2
2214; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2215; GFX7LESS-NEXT:  BB11_1:
2216; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2217; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2218; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2219; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2220; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2221; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2222; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2223; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2224; GFX7LESS-NEXT:    buffer_wbinvl1
2225; GFX7LESS-NEXT:  BB11_2:
2226; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2227; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2228; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2229; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2230; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2231; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2232; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2233; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2234; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2235; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2236; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2237; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2238; GFX7LESS-NEXT:    s_endpgm
2239;
2240; GFX8-LABEL: sub_i64_constant:
2241; GFX8:       ; %bb.0: ; %entry
2242; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2243; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2244; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2245; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2246; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2247; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2248; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2249; GFX8-NEXT:    ; mask branch BB11_2
2250; GFX8-NEXT:    s_cbranch_execz BB11_2
2251; GFX8-NEXT:  BB11_1:
2252; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2253; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2254; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2255; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2256; GFX8-NEXT:    s_mov_b32 m0, -1
2257; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2258; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2259; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2260; GFX8-NEXT:    buffer_wbinvl1_vol
2261; GFX8-NEXT:  BB11_2:
2262; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2263; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2264; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2265; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2266; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2267; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2268; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2269; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2270; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2271; GFX8-NEXT:    s_mov_b32 s2, -1
2272; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2273; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2274; GFX8-NEXT:    s_endpgm
2275;
2276; GFX9-LABEL: sub_i64_constant:
2277; GFX9:       ; %bb.0: ; %entry
2278; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2279; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
2280; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2281; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2282; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2283; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2284; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2285; GFX9-NEXT:    ; mask branch BB11_2
2286; GFX9-NEXT:    s_cbranch_execz BB11_2
2287; GFX9-NEXT:  BB11_1:
2288; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2289; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2290; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2291; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2292; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2293; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2294; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2295; GFX9-NEXT:    buffer_wbinvl1_vol
2296; GFX9-NEXT:  BB11_2:
2297; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2298; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2299; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2300; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2301; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2302; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2303; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2304; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2305; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2306; GFX9-NEXT:    s_mov_b32 s2, -1
2307; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2308; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2309; GFX9-NEXT:    s_endpgm
2310;
2311; GFX1064-LABEL: sub_i64_constant:
2312; GFX1064:       ; %bb.0: ; %entry
2313; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2314; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2315; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2316; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
2317; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
2318; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2319; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2320; GFX1064-NEXT:    ; mask branch BB11_2
2321; GFX1064-NEXT:    s_cbranch_execz BB11_2
2322; GFX1064-NEXT:  BB11_1:
2323; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2324; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2325; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
2326; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
2327; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2328; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2329; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2330; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2331; GFX1064-NEXT:    buffer_gl0_inv
2332; GFX1064-NEXT:    buffer_gl1_inv
2333; GFX1064-NEXT:  BB11_2:
2334; GFX1064-NEXT:    v_nop
2335; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2336; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2337; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2338; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2339; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2340; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2341; GFX1064-NEXT:    s_mov_b32 s2, -1
2342; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2343; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2344; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2345; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2346; GFX1064-NEXT:    s_endpgm
2347;
2348; GFX1032-LABEL: sub_i64_constant:
2349; GFX1032:       ; %bb.0: ; %entry
2350; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2351; GFX1032-NEXT:    v_cmp_ne_u32_e64 s3, 1, 0
2352; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2353; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2354; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2355; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2356; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2357; GFX1032-NEXT:    ; mask branch BB11_2
2358; GFX1032-NEXT:    s_cbranch_execz BB11_2
2359; GFX1032-NEXT:  BB11_1:
2360; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2361; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2362; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2363; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2364; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2365; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2366; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2367; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2368; GFX1032-NEXT:    buffer_gl0_inv
2369; GFX1032-NEXT:    buffer_gl1_inv
2370; GFX1032-NEXT:  BB11_2:
2371; GFX1032-NEXT:    v_nop
2372; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2373; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2374; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2375; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2376; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2377; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2378; GFX1032-NEXT:    s_mov_b32 s2, -1
2379; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2380; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2381; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2382; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2383; GFX1032-NEXT:    s_endpgm
2384entry:
  ; The operand is the immediate 5, i.e. identical for every work-item.
2385  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2386  store i64 %old, i64 addrspace(1)* %out
2387  ret void
2388}
2389
; sub_i64_uniform: every work-item atomically subtracts the kernel argument
; %subitive from the 64-bit variable @local_var64 in addrspace(3) (acq_rel)
; and stores the returned old value to %out.
;
; In the expected code for all five check prefixes below, %subitive is read
; with a scalar load into s[2:3]. Only the lane whose mbcnt result is 0
; executes the DS atomic; its operand is the 64-bit product of %subitive and
; the popcount of the lane mask produced by "v_cmp_ne_u32 ..., 1, 0", issued as
; a single ds_sub_rtn_u64. After exec is restored, each lane rebuilds its own
; return value as readfirstlane(old) - %subitive * mbcnt.
2390define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2391;
2392;
2393; GFX7LESS-LABEL: sub_i64_uniform:
2394; GFX7LESS:       ; %bb.0: ; %entry
2395; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2396; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2397; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2398; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2399; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2400; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2401; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2402; GFX7LESS-NEXT:    ; mask branch BB12_2
2403; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2404; GFX7LESS-NEXT:  BB12_1:
2405; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2406; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2407; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2408; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2409; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2410; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2411; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2412; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2413; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2414; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2415; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2416; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2417; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2418; GFX7LESS-NEXT:    buffer_wbinvl1
2419; GFX7LESS-NEXT:  BB12_2:
2420; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2421; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2422; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2423; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2424; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2425; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2426; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2427; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2428; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2429; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2430; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2431; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2432; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2433; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2434; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2435; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2436; GFX7LESS-NEXT:    s_endpgm
2437;
2438; GFX8-LABEL: sub_i64_uniform:
2439; GFX8:       ; %bb.0: ; %entry
2440; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2441; GFX8-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2442; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2443; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2444; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2445; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2446; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2447; GFX8-NEXT:    ; mask branch BB12_2
2448; GFX8-NEXT:    s_cbranch_execz BB12_2
2449; GFX8-NEXT:  BB12_1:
2450; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2451; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2452; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2453; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2454; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2455; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2456; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2457; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2458; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2459; GFX8-NEXT:    s_mov_b32 m0, -1
2460; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2461; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2462; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2463; GFX8-NEXT:    buffer_wbinvl1_vol
2464; GFX8-NEXT:  BB12_2:
2465; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2466; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2467; GFX8-NEXT:    s_mov_b32 s4, s0
2468; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2469; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2470; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2471; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2472; GFX8-NEXT:    s_mov_b32 s5, s1
2473; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2474; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2475; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2476; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2477; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2478; GFX8-NEXT:    s_mov_b32 s6, -1
2479; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2480; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2481; GFX8-NEXT:    s_endpgm
2482;
2483; GFX9-LABEL: sub_i64_uniform:
2484; GFX9:       ; %bb.0: ; %entry
2485; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2486; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2487; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2488; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2489; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2490; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2491; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2492; GFX9-NEXT:    ; mask branch BB12_2
2493; GFX9-NEXT:    s_cbranch_execz BB12_2
2494; GFX9-NEXT:  BB12_1:
2495; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2496; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2497; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX9-NEXT:    v_mul_hi_u32 v2, s2, v1
2499; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2500; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2501; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2502; GFX9-NEXT:    v_add_u32_e32 v2, s7, v2
2503; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2504; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2505; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2506; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2507; GFX9-NEXT:    buffer_wbinvl1_vol
2508; GFX9-NEXT:  BB12_2:
2509; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2510; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2512; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2513; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2514; GFX9-NEXT:    s_mov_b32 s4, s0
2515; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2516; GFX9-NEXT:    s_mov_b32 s5, s1
2517; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2518; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2519; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2520; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2521; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2522; GFX9-NEXT:    s_mov_b32 s6, -1
2523; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2524; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2525; GFX9-NEXT:    s_endpgm
2526;
2527; GFX1064-LABEL: sub_i64_uniform:
2528; GFX1064:       ; %bb.0: ; %entry
2529; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
2530; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2531; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2532; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2533; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2534; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2535; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2536; GFX1064-NEXT:    ; mask branch BB12_2
2537; GFX1064-NEXT:    s_cbranch_execz BB12_2
2538; GFX1064-NEXT:  BB12_1:
2539; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2540; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2541; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX1064-NEXT:    v_mul_hi_u32 v2, s2, s6
2543; GFX1064-NEXT:    s_mul_i32 s7, s2, s6
2544; GFX1064-NEXT:    s_mul_i32 s6, s3, s6
2545; GFX1064-NEXT:    v_mov_b32_e32 v1, s7
2546; GFX1064-NEXT:    v_add_nc_u32_e32 v2, s6, v2
2547; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2548; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2549; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2550; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2551; GFX1064-NEXT:    buffer_gl0_inv
2552; GFX1064-NEXT:    buffer_gl1_inv
2553; GFX1064-NEXT:  BB12_2:
2554; GFX1064-NEXT:    v_nop
2555; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2556; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2557; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2558; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2559; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2560; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
2561; GFX1064-NEXT:    v_readfirstlane_b32 s5, v2
2562; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2563; GFX1064-NEXT:    s_mov_b32 s2, -1
2564; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2565; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s4, v0
2566; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
2567; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2568; GFX1064-NEXT:    s_endpgm
2569;
2570; GFX1032-LABEL: sub_i64_uniform:
2571; GFX1032:       ; %bb.0: ; %entry
2572; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2573; GFX1032-NEXT:    v_cmp_ne_u32_e64 s5, 1, 0
2574; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2575; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2576; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2577; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2578; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2579; GFX1032-NEXT:    ; mask branch BB12_2
2580; GFX1032-NEXT:    s_cbranch_execz BB12_2
2581; GFX1032-NEXT:  BB12_1:
2582; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2583; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2584; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX1032-NEXT:    v_mul_hi_u32 v2, s2, s5
2586; GFX1032-NEXT:    s_mul_i32 s6, s2, s5
2587; GFX1032-NEXT:    s_mul_i32 s5, s3, s5
2588; GFX1032-NEXT:    v_mov_b32_e32 v1, s6
2589; GFX1032-NEXT:    v_add_nc_u32_e32 v2, s5, v2
2590; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2591; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2592; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2593; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2594; GFX1032-NEXT:    buffer_gl0_inv
2595; GFX1032-NEXT:    buffer_gl1_inv
2596; GFX1032-NEXT:  BB12_2:
2597; GFX1032-NEXT:    v_nop
2598; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2599; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2600; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2601; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2602; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2603; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
2604; GFX1032-NEXT:    v_readfirstlane_b32 s5, v2
2605; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2606; GFX1032-NEXT:    s_mov_b32 s2, -1
2607; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2608; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s4, v0
2609; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
2610; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2611; GFX1032-NEXT:    s_endpgm
2612entry:
  ; The operand is a kernel argument rather than an immediate or a per-lane
  ; value.
2613  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2614  store i64 %old, i64 addrspace(1)* %out
2615  ret void
2616}
2617
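; NOTE: none of the RUN lines at the top of this file passes the GCN check
; prefix to FileCheck, so the three GCN-NOT lines below are never evaluated.
; They are kept as a record of intent; the per-target checks inside
; @sub_i64_varying are what actually verify the output.
2618; GCN-NOT: v_mbcnt_lo_u32_b32
2619; GCN-NOT: v_mbcnt_hi_u32_b32
2620; GCN-NOT: s_bcnt1_i32_b64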
; sub_i64_varying: every work-item atomically subtracts its own zero-extended
; work-item id (x) from the 64-bit variable @local_var64 in addrspace(3)
; (acq_rel) and stores the returned old value to %out.
;
; Unlike the constant and uniform variants, the expected code for every check
; prefix below contains no v_mbcnt / s_bcnt1 sequence and no exec-masked
; branch: each target materialises a zero high half in v1, issues
; ds_sub_rtn_u64 directly with v[0:1] as the operand, and stores the result.
2621define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2622;
2623;
2624; GFX7LESS-LABEL: sub_i64_varying:
2625; GFX7LESS:       ; %bb.0: ; %entry
2626; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2627; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2628; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2629; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2630; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2631; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2632; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2633; GFX7LESS-NEXT:    buffer_wbinvl1
2634; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2635; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2636; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2637; GFX7LESS-NEXT:    s_endpgm
2638;
2639; GFX8-LABEL: sub_i64_varying:
2640; GFX8:       ; %bb.0: ; %entry
2641; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2642; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2643; GFX8-NEXT:    s_mov_b32 m0, -1
2644; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2645; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2646; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2647; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2648; GFX8-NEXT:    buffer_wbinvl1_vol
2649; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2650; GFX8-NEXT:    s_mov_b32 s2, -1
2651; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2652; GFX8-NEXT:    s_endpgm
2653;
2654; GFX9-LABEL: sub_i64_varying:
2655; GFX9:       ; %bb.0: ; %entry
2656; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2657; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2658; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2659; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2660; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2661; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2662; GFX9-NEXT:    buffer_wbinvl1_vol
2663; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2664; GFX9-NEXT:    s_mov_b32 s2, -1
2665; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2666; GFX9-NEXT:    s_endpgm
2667;
2668; GFX1064-LABEL: sub_i64_varying:
2669; GFX1064:       ; %bb.0: ; %entry
2670; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2671; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2672; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2673; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2674; GFX1064-NEXT:    s_mov_b32 s2, -1
2675; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2676; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2677; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2678; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2679; GFX1064-NEXT:    buffer_gl0_inv
2680; GFX1064-NEXT:    buffer_gl1_inv
2681; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2682; GFX1064-NEXT:    s_endpgm
2683;
2684; GFX1032-LABEL: sub_i64_varying:
2685; GFX1032:       ; %bb.0: ; %entry
2686; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2687; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2688; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2689; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2690; GFX1032-NEXT:    s_mov_b32 s2, -1
2691; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2692; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2693; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2694; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2695; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2696; GFX1032-NEXT:    buffer_gl0_inv
2697; GFX1032-NEXT:    buffer_gl1_inv
2698; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2699; GFX1032-NEXT:    s_endpgm
2700entry:
  ; Widen the 32-bit work-item id to i64 so it can be used as the operand of
  ; the 64-bit atomic.
2701  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2702  %zext = zext i32 %lane to i64
2703  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2704  store i64 %old, i64 addrspace(1)* %out
2705  ret void
2706}
2707
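; NOTE: none of the RUN lines at the top of this file passes the GFX8MORE32 or
; GFX8MORE check prefixes to FileCheck, so the three pattern lines below are
; never evaluated. They are kept as a record of intent; the per-target checks
; inside @and_i32_varying are what actually verify the output.
2708; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
2709; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
2710; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]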
2711define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2712;
2713;
2714; GFX7LESS-LABEL: and_i32_varying:
2715; GFX7LESS:       ; %bb.0: ; %entry
2716; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2717; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2718; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2719; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2720; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2721; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2722; GFX7LESS-NEXT:    buffer_wbinvl1
2723; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2724; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2725; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2726; GFX7LESS-NEXT:    s_endpgm
2727;
2728; GFX8-LABEL: and_i32_varying:
2729; GFX8:       ; %bb.0: ; %entry
2730; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2731; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2732; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
2733; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
2734; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2735; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2736; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2737; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2738; GFX8-NEXT:    s_not_b64 exec, exec
2739; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2740; GFX8-NEXT:    s_not_b64 exec, exec
2741; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2742; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2743; GFX8-NEXT:    s_nop 1
2744; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2745; GFX8-NEXT:    s_nop 1
2746; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2747; GFX8-NEXT:    s_nop 1
2748; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2749; GFX8-NEXT:    s_nop 1
2750; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2751; GFX8-NEXT:    s_nop 1
2752; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2753; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
2754; GFX8-NEXT:    s_nop 0
2755; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2756; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2757; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2758; GFX8-NEXT:    ; implicit-def: $vgpr0
2759; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2760; GFX8-NEXT:    ; mask branch BB14_2
2761; GFX8-NEXT:    s_cbranch_execz BB14_2
2762; GFX8-NEXT:  BB14_1:
2763; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2764; GFX8-NEXT:    v_mov_b32_e32 v3, s2
2765; GFX8-NEXT:    s_mov_b32 m0, -1
2766; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2767; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2768; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2769; GFX8-NEXT:    buffer_wbinvl1_vol
2770; GFX8-NEXT:  BB14_2:
2771; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2772; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2773; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2774; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2775; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2776; GFX8-NEXT:    s_mov_b32 s2, -1
2777; GFX8-NEXT:    s_nop 0
2778; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2779; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2780; GFX8-NEXT:    s_endpgm
2781;
2782; GFX9-LABEL: and_i32_varying:
2783; GFX9:       ; %bb.0: ; %entry
2784; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2785; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2786; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
2787; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
2788; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2789; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2790; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2791; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2792; GFX9-NEXT:    s_not_b64 exec, exec
2793; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2794; GFX9-NEXT:    s_not_b64 exec, exec
2795; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2796; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2797; GFX9-NEXT:    s_nop 1
2798; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2799; GFX9-NEXT:    s_nop 1
2800; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2801; GFX9-NEXT:    s_nop 1
2802; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2803; GFX9-NEXT:    s_nop 1
2804; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2805; GFX9-NEXT:    s_nop 1
2806; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2807; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
2808; GFX9-NEXT:    s_nop 0
2809; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2810; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2811; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2812; GFX9-NEXT:    ; implicit-def: $vgpr0
2813; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2814; GFX9-NEXT:    ; mask branch BB14_2
2815; GFX9-NEXT:    s_cbranch_execz BB14_2
2816; GFX9-NEXT:  BB14_1:
2817; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2818; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2819; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2820; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2821; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2822; GFX9-NEXT:    buffer_wbinvl1_vol
2823; GFX9-NEXT:  BB14_2:
2824; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2825; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2826; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2827; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2828; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2829; GFX9-NEXT:    s_mov_b32 s2, -1
2830; GFX9-NEXT:    s_nop 0
2831; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2832; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2833; GFX9-NEXT:    s_endpgm
2834;
2835; GFX1064-LABEL: and_i32_varying:
2836; GFX1064:       ; %bb.0: ; %entry
2837; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2838; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2839; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
2840; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
2841; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
2842; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2843; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2844; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2845; GFX1064-NEXT:    s_not_b64 exec, exec
2846; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
2847; GFX1064-NEXT:    s_not_b64 exec, exec
2848; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2849; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2850; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2851; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2852; GFX1064-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2853; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
2854; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2855; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2856; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
2857; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
2858; GFX1064-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2859; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
2860; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2861; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
2862; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
2863; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
2864; GFX1064-NEXT:    s_mov_b32 s2, -1
2865; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
2866; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
2867; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
2868; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2869; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
2870; GFX1064-NEXT:    ; implicit-def: $vgpr0
2871; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2872; GFX1064-NEXT:    ; mask branch BB14_2
2873; GFX1064-NEXT:    s_cbranch_execz BB14_2
2874; GFX1064-NEXT:  BB14_1:
2875; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2876; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
2877; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2878; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2879; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v7
2880; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2881; GFX1064-NEXT:    buffer_gl0_inv
2882; GFX1064-NEXT:    buffer_gl1_inv
2883; GFX1064-NEXT:  BB14_2:
2884; GFX1064-NEXT:    v_nop
2885; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2886; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2887; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
2888; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2889; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2890; GFX1064-NEXT:    s_nop 1
2891; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2892; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2893; GFX1064-NEXT:    s_endpgm
2894;
2895; GFX1032-LABEL: and_i32_varying:
2896; GFX1032:       ; %bb.0: ; %entry
2897; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2898; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
2899; GFX1032-NEXT:    ; implicit-def: $vcc_hi
2900; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
2901; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
2902; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2903; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2904; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2905; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2906; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
2907; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2908; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2909; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2910; GFX1032-NEXT:    s_mov_b32 s2, -1
2911; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2912; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2913; GFX1032-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2914; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
2915; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
2916; GFX1032-NEXT:    v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2917; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
2918; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2919; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
2920; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
2921; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2922; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
2923; GFX1032-NEXT:    ; implicit-def: $vgpr0
2924; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2925; GFX1032-NEXT:    ; mask branch BB14_2
2926; GFX1032-NEXT:    s_cbranch_execz BB14_2
2927; GFX1032-NEXT:  BB14_1:
2928; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2929; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
2930; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2931; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2932; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v7
2933; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2934; GFX1032-NEXT:    buffer_gl0_inv
2935; GFX1032-NEXT:    buffer_gl1_inv
2936; GFX1032-NEXT:  BB14_2:
2937; GFX1032-NEXT:    v_nop
2938; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2939; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2940; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2941; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2942; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2943; GFX1032-NEXT:    s_nop 1
2944; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2945; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2946; GFX1032-NEXT:    s_endpgm
2947entry:
2948  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2949  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2950  store i32 %old, i32 addrspace(1)* %out
2951  ret void
2952}
2953
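; NOTE(review): the GFX8MORE32 and GFX8MORE prefixes on the next three lines are not among the -check-prefixes passed by the RUN lines at the top of this file, so FileCheck does not evaluate them.
2954; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
2955; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
2956; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]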
2957define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; Test kernel: an acq_rel `atomicrmw or` on the addrspace(3) global
; @local_var32 whose operand is the x workitem id, so the operand differs
; from lane to lane. The value returned by the atomic is stored to %out.
;
; Expected codegen, as pinned by the autogenerated checks below:
;  * GFX7LESS run - one ds_or_rtn_b32 that takes v0 directly; no DPP
;    instructions are expected.
;  * GFX8 and GFX9 runs - a chain of v_or_b32_dpp steps (row_shr 1/2/4/8, then
;    row_bcast 15 and 31), a v_readlane_b32 of lane 63, and a ds_or_rtn_b32
;    placed behind s_and_saveexec_b64 / s_cbranch_execz; afterwards the result
;    goes through v_readfirstlane_b32 and a final v_or_b32.
;  * GFX1064 and GFX1032 runs - no row_bcast steps are expected;
;    v_permlanex16_b32 together with v_readlane_b32 / v_writelane_b32 appear
;    instead.
2958;
2959;
2960; GFX7LESS-LABEL: or_i32_varying:
2961; GFX7LESS:       ; %bb.0: ; %entry
2962; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2963; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2964; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2965; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2966; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2967; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2968; GFX7LESS-NEXT:    buffer_wbinvl1
2969; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2970; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2971; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2972; GFX7LESS-NEXT:    s_endpgm
2973;
2974; GFX8-LABEL: or_i32_varying:
2975; GFX8:       ; %bb.0: ; %entry
2976; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2977; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2978; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2979; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2980; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2981; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
2982; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2983; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2984; GFX8-NEXT:    s_not_b64 exec, exec
2985; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2986; GFX8-NEXT:    s_not_b64 exec, exec
2987; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2988; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2989; GFX8-NEXT:    s_nop 1
2990; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2991; GFX8-NEXT:    s_nop 1
2992; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2993; GFX8-NEXT:    s_nop 1
2994; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2995; GFX8-NEXT:    s_nop 1
2996; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2997; GFX8-NEXT:    s_nop 1
2998; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2999; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3000; GFX8-NEXT:    s_nop 0
3001; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3002; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3003; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3004; GFX8-NEXT:    ; implicit-def: $vgpr0
3005; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3006; GFX8-NEXT:    ; mask branch BB15_2
3007; GFX8-NEXT:    s_cbranch_execz BB15_2
3008; GFX8-NEXT:  BB15_1:
3009; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3010; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3011; GFX8-NEXT:    s_mov_b32 m0, -1
3012; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3013; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3014; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3015; GFX8-NEXT:    buffer_wbinvl1_vol
3016; GFX8-NEXT:  BB15_2:
3017; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3018; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3019; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3020; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3021; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3022; GFX8-NEXT:    s_mov_b32 s2, -1
3023; GFX8-NEXT:    s_nop 0
3024; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3025; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3026; GFX8-NEXT:    s_endpgm
3027;
3028; GFX9-LABEL: or_i32_varying:
3029; GFX9:       ; %bb.0: ; %entry
3030; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3031; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3032; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3033; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3034; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3035; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3036; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3037; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3038; GFX9-NEXT:    s_not_b64 exec, exec
3039; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3040; GFX9-NEXT:    s_not_b64 exec, exec
3041; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3042; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3043; GFX9-NEXT:    s_nop 1
3044; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3045; GFX9-NEXT:    s_nop 1
3046; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3047; GFX9-NEXT:    s_nop 1
3048; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3049; GFX9-NEXT:    s_nop 1
3050; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3051; GFX9-NEXT:    s_nop 1
3052; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3053; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3054; GFX9-NEXT:    s_nop 0
3055; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3056; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3057; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3058; GFX9-NEXT:    ; implicit-def: $vgpr0
3059; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3060; GFX9-NEXT:    ; mask branch BB15_2
3061; GFX9-NEXT:    s_cbranch_execz BB15_2
3062; GFX9-NEXT:  BB15_1:
3063; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3064; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3065; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3066; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3067; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3068; GFX9-NEXT:    buffer_wbinvl1_vol
3069; GFX9-NEXT:  BB15_2:
3070; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3071; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3072; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3073; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3074; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3075; GFX9-NEXT:    s_mov_b32 s2, -1
3076; GFX9-NEXT:    s_nop 0
3077; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3078; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3079; GFX9-NEXT:    s_endpgm
3080;
3081; GFX1064-LABEL: or_i32_varying:
3082; GFX1064:       ; %bb.0: ; %entry
3083; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3084; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3085; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3086; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3087; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3088; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3089; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3090; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3091; GFX1064-NEXT:    s_not_b64 exec, exec
3092; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3093; GFX1064-NEXT:    s_not_b64 exec, exec
3094; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3095; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3096; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3097; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3098; GFX1064-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3099; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3100; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3101; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3102; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3103; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3104; GFX1064-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3105; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3106; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3107; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3108; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3109; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3110; GFX1064-NEXT:    s_mov_b32 s2, -1
3111; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3112; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3113; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3114; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3116; GFX1064-NEXT:    ; implicit-def: $vgpr0
3117; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3118; GFX1064-NEXT:    ; mask branch BB15_2
3119; GFX1064-NEXT:    s_cbranch_execz BB15_2
3120; GFX1064-NEXT:  BB15_1:
3121; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3122; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3125; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v7
3126; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3127; GFX1064-NEXT:    buffer_gl0_inv
3128; GFX1064-NEXT:    buffer_gl1_inv
3129; GFX1064-NEXT:  BB15_2:
3130; GFX1064-NEXT:    v_nop
3131; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3132; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3133; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3134; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3135; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3136; GFX1064-NEXT:    s_nop 1
3137; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3138; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3139; GFX1064-NEXT:    s_endpgm
3140;
3141; GFX1032-LABEL: or_i32_varying:
3142; GFX1032:       ; %bb.0: ; %entry
3143; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3144; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3145; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3146; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3147; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3148; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3149; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3150; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3151; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3152; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3153; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3154; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3155; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3156; GFX1032-NEXT:    s_mov_b32 s2, -1
3157; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3158; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3159; GFX1032-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3160; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3161; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3162; GFX1032-NEXT:    v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3163; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3164; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3165; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3166; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3167; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3168; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3169; GFX1032-NEXT:    ; implicit-def: $vgpr0
3170; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3171; GFX1032-NEXT:    ; mask branch BB15_2
3172; GFX1032-NEXT:    s_cbranch_execz BB15_2
3173; GFX1032-NEXT:  BB15_1:
3174; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3175; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3176; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3177; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3178; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v7
3179; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3180; GFX1032-NEXT:    buffer_gl0_inv
3181; GFX1032-NEXT:    buffer_gl1_inv
3182; GFX1032-NEXT:  BB15_2:
3183; GFX1032-NEXT:    v_nop
3184; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3185; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3186; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3187; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3188; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3189; GFX1032-NEXT:    s_nop 1
3190; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3191; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3192; GFX1032-NEXT:    s_endpgm
3193entry:
  ; Operand of the atomic: the x workitem id of the executing lane.
3194  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic OR into @local_var32; %old is the value the atomicrmw returns.
3195  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) pointer argument.
3196  store i32 %old, i32 addrspace(1)* %out
3197  ret void
3198}
3199
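; NOTE(review): the GFX8MORE32 and GFX8MORE prefixes on the next three lines are not among the -check-prefixes passed by the RUN lines at the top of this file, so FileCheck does not evaluate them.
3200; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3201; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3202; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]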
3203define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; Test kernel: an acq_rel `atomicrmw xor` on the addrspace(3) global
; @local_var32 whose operand is the x workitem id, so the operand differs
; from lane to lane. The value returned by the atomic is stored to %out.
;
; Expected codegen, as pinned by the autogenerated checks below:
;  * GFX7LESS run - one ds_xor_rtn_b32 that takes v0 directly; no DPP
;    instructions are expected.
;  * GFX8 and GFX9 runs - a chain of v_xor_b32_dpp steps (row_shr 1/2/4/8,
;    then row_bcast 15 and 31), a v_readlane_b32 of lane 63, and a
;    ds_xor_rtn_b32 placed behind s_and_saveexec_b64 / s_cbranch_execz;
;    afterwards the result goes through v_readfirstlane_b32 and a final
;    v_xor_b32.
;  * GFX1064 and GFX1032 runs - no row_bcast steps are expected;
;    v_permlanex16_b32 together with v_readlane_b32 / v_writelane_b32 appear
;    instead.
3204;
3205;
3206; GFX7LESS-LABEL: xor_i32_varying:
3207; GFX7LESS:       ; %bb.0: ; %entry
3208; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3209; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3210; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3211; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3212; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3213; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3214; GFX7LESS-NEXT:    buffer_wbinvl1
3215; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3216; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3217; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3218; GFX7LESS-NEXT:    s_endpgm
3219;
3220; GFX8-LABEL: xor_i32_varying:
3221; GFX8:       ; %bb.0: ; %entry
3222; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3223; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3224; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3225; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3226; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3227; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3228; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3229; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3230; GFX8-NEXT:    s_not_b64 exec, exec
3231; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3232; GFX8-NEXT:    s_not_b64 exec, exec
3233; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3234; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3235; GFX8-NEXT:    s_nop 1
3236; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3237; GFX8-NEXT:    s_nop 1
3238; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3239; GFX8-NEXT:    s_nop 1
3240; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3241; GFX8-NEXT:    s_nop 1
3242; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3243; GFX8-NEXT:    s_nop 1
3244; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3245; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3246; GFX8-NEXT:    s_nop 0
3247; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3248; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3249; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3250; GFX8-NEXT:    ; implicit-def: $vgpr0
3251; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3252; GFX8-NEXT:    ; mask branch BB16_2
3253; GFX8-NEXT:    s_cbranch_execz BB16_2
3254; GFX8-NEXT:  BB16_1:
3255; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3256; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3257; GFX8-NEXT:    s_mov_b32 m0, -1
3258; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3259; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3260; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3261; GFX8-NEXT:    buffer_wbinvl1_vol
3262; GFX8-NEXT:  BB16_2:
3263; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3264; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3265; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3266; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3267; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3268; GFX8-NEXT:    s_mov_b32 s2, -1
3269; GFX8-NEXT:    s_nop 0
3270; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3271; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3272; GFX8-NEXT:    s_endpgm
3273;
3274; GFX9-LABEL: xor_i32_varying:
3275; GFX9:       ; %bb.0: ; %entry
3276; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3277; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3278; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3279; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3280; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3281; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3282; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3283; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3284; GFX9-NEXT:    s_not_b64 exec, exec
3285; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3286; GFX9-NEXT:    s_not_b64 exec, exec
3287; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3288; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3289; GFX9-NEXT:    s_nop 1
3290; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3291; GFX9-NEXT:    s_nop 1
3292; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3293; GFX9-NEXT:    s_nop 1
3294; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3295; GFX9-NEXT:    s_nop 1
3296; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3297; GFX9-NEXT:    s_nop 1
3298; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3299; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3300; GFX9-NEXT:    s_nop 0
3301; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3302; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3303; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3304; GFX9-NEXT:    ; implicit-def: $vgpr0
3305; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3306; GFX9-NEXT:    ; mask branch BB16_2
3307; GFX9-NEXT:    s_cbranch_execz BB16_2
3308; GFX9-NEXT:  BB16_1:
3309; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3310; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3311; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3312; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3313; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3314; GFX9-NEXT:    buffer_wbinvl1_vol
3315; GFX9-NEXT:  BB16_2:
3316; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3317; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3318; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3319; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3320; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3321; GFX9-NEXT:    s_mov_b32 s2, -1
3322; GFX9-NEXT:    s_nop 0
3323; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3324; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3325; GFX9-NEXT:    s_endpgm
3326;
3327; GFX1064-LABEL: xor_i32_varying:
3328; GFX1064:       ; %bb.0: ; %entry
3329; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3330; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3331; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3332; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3333; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3334; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3335; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3336; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3337; GFX1064-NEXT:    s_not_b64 exec, exec
3338; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
3339; GFX1064-NEXT:    s_not_b64 exec, exec
3340; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3341; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3342; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3343; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3344; GFX1064-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3345; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3346; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3347; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3348; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3349; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3350; GFX1064-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3351; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3352; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3353; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3354; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3355; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3356; GFX1064-NEXT:    s_mov_b32 s2, -1
3357; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3358; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3359; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3360; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3361; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3362; GFX1064-NEXT:    ; implicit-def: $vgpr0
3363; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3364; GFX1064-NEXT:    ; mask branch BB16_2
3365; GFX1064-NEXT:    s_cbranch_execz BB16_2
3366; GFX1064-NEXT:  BB16_1:
3367; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3368; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3369; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3370; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3371; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3372; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3373; GFX1064-NEXT:    buffer_gl0_inv
3374; GFX1064-NEXT:    buffer_gl1_inv
3375; GFX1064-NEXT:  BB16_2:
3376; GFX1064-NEXT:    v_nop
3377; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3378; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3379; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3380; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3381; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3382; GFX1064-NEXT:    s_nop 1
3383; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3384; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3385; GFX1064-NEXT:    s_endpgm
3386;
3387; GFX1032-LABEL: xor_i32_varying:
3388; GFX1032:       ; %bb.0: ; %entry
3389; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3390; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3391; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3392; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3393; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3394; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3395; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3396; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3397; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3398; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
3399; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3400; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3401; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3402; GFX1032-NEXT:    s_mov_b32 s2, -1
3403; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3404; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3405; GFX1032-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3406; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3407; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3408; GFX1032-NEXT:    v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3409; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3410; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3411; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3412; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3413; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3414; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3415; GFX1032-NEXT:    ; implicit-def: $vgpr0
3416; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3417; GFX1032-NEXT:    ; mask branch BB16_2
3418; GFX1032-NEXT:    s_cbranch_execz BB16_2
3419; GFX1032-NEXT:  BB16_1:
3420; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3421; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3422; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3423; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3424; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v7
3425; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3426; GFX1032-NEXT:    buffer_gl0_inv
3427; GFX1032-NEXT:    buffer_gl1_inv
3428; GFX1032-NEXT:  BB16_2:
3429; GFX1032-NEXT:    v_nop
3430; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3431; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3432; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3433; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3434; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3435; GFX1032-NEXT:    s_nop 1
3436; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3437; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3438; GFX1032-NEXT:    s_endpgm
3439entry:
  ; Operand of the atomic: the x workitem id of the executing lane.
3440  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel atomic XOR into @local_var32; %old is the value the atomicrmw returns.
3441  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) pointer argument.
3442  store i32 %old, i32 addrspace(1)* %out
3443  ret void
3444}
3445
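; NOTE(review): the GFX8MORE32 and GFX8MORE prefixes on the next three lines are not among the -check-prefixes passed by the RUN lines at the top of this file, so FileCheck does not evaluate them.
3446; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3447; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3448; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]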
; max_i32_varying: signed-max atomicrmw on @local_var32 whose operand is the
; workitem id, so it differs per lane; the value returned by the atomic is
; stored to %out.
; What the expected output below shows:
;  - GFX7LESS checks: one ds_max_rtn_i32 fed directly from v0, with no
;    mbcnt/exec masking around it.
;  - GFX8 and GFX9 checks: inactive lanes are filled from v_bfrev_b32 of 1
;    (0x80000000, presumably chosen as the identity for signed max), the
;    values are combined with v_max_i32_dpp steps (row_shr 1/2/4/8, then
;    row_bcast 15 and 31), lane 63 is read back as the wave-wide operand, and
;    a single ds_max_rtn_i32 runs under s_and_saveexec for the lane whose
;    mbcnt result is 0. Afterwards v_readfirstlane plus v_max_i32 rebuild the
;    per-lane result.
;  - GFX1064 and GFX1032 checks: same shape, but the cross-row steps use
;    v_permlanex16_b32 and v_readlane/v_writelane instead of row_bcast.
3449define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3450;
3451;
3452; GFX7LESS-LABEL: max_i32_varying:
3453; GFX7LESS:       ; %bb.0: ; %entry
3454; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3455; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3456; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3457; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3458; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3459; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3460; GFX7LESS-NEXT:    buffer_wbinvl1
3461; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3462; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3463; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3464; GFX7LESS-NEXT:    s_endpgm
3465;
3466; GFX8-LABEL: max_i32_varying:
3467; GFX8:       ; %bb.0: ; %entry
3468; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3469; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3470; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3471; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3472; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3473; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3474; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3475; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3476; GFX8-NEXT:    s_not_b64 exec, exec
3477; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3478; GFX8-NEXT:    s_not_b64 exec, exec
3479; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3480; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3481; GFX8-NEXT:    s_nop 1
3482; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3483; GFX8-NEXT:    s_nop 1
3484; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3485; GFX8-NEXT:    s_nop 1
3486; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3487; GFX8-NEXT:    s_nop 1
3488; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3489; GFX8-NEXT:    s_nop 1
3490; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3491; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3492; GFX8-NEXT:    s_nop 0
3493; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3494; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3495; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3496; GFX8-NEXT:    ; implicit-def: $vgpr0
3497; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3498; GFX8-NEXT:    ; mask branch BB17_2
3499; GFX8-NEXT:    s_cbranch_execz BB17_2
3500; GFX8-NEXT:  BB17_1:
3501; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3502; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3503; GFX8-NEXT:    s_mov_b32 m0, -1
3504; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3505; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3506; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3507; GFX8-NEXT:    buffer_wbinvl1_vol
3508; GFX8-NEXT:  BB17_2:
3509; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3510; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3511; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3512; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3513; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3514; GFX8-NEXT:    s_mov_b32 s2, -1
3515; GFX8-NEXT:    s_nop 0
3516; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3517; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3518; GFX8-NEXT:    s_endpgm
3519;
3520; GFX9-LABEL: max_i32_varying:
3521; GFX9:       ; %bb.0: ; %entry
3522; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3523; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3524; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3525; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3526; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3527; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3528; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3529; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3530; GFX9-NEXT:    s_not_b64 exec, exec
3531; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3532; GFX9-NEXT:    s_not_b64 exec, exec
3533; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3534; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3535; GFX9-NEXT:    s_nop 1
3536; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3537; GFX9-NEXT:    s_nop 1
3538; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3539; GFX9-NEXT:    s_nop 1
3540; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3541; GFX9-NEXT:    s_nop 1
3542; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3543; GFX9-NEXT:    s_nop 1
3544; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3545; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3546; GFX9-NEXT:    s_nop 0
3547; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3548; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3549; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3550; GFX9-NEXT:    ; implicit-def: $vgpr0
3551; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3552; GFX9-NEXT:    ; mask branch BB17_2
3553; GFX9-NEXT:    s_cbranch_execz BB17_2
3554; GFX9-NEXT:  BB17_1:
3555; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3556; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3557; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3558; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3559; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3560; GFX9-NEXT:    buffer_wbinvl1_vol
3561; GFX9-NEXT:  BB17_2:
3562; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3563; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3564; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3565; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3566; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3567; GFX9-NEXT:    s_mov_b32 s2, -1
3568; GFX9-NEXT:    s_nop 0
3569; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3570; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3571; GFX9-NEXT:    s_endpgm
3572;
3573; GFX1064-LABEL: max_i32_varying:
3574; GFX1064:       ; %bb.0: ; %entry
3575; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3576; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3577; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3578; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
3579; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
3580; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3581; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3582; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3583; GFX1064-NEXT:    s_not_b64 exec, exec
3584; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3585; GFX1064-NEXT:    s_not_b64 exec, exec
3586; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3587; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3588; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3589; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3590; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3591; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3592; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3593; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3594; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
3595; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
3596; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3597; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
3598; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3599; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
3600; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3601; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
3602; GFX1064-NEXT:    s_mov_b32 s2, -1
3603; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
3604; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
3605; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3606; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3607; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
3608; GFX1064-NEXT:    ; implicit-def: $vgpr0
3609; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3610; GFX1064-NEXT:    ; mask branch BB17_2
3611; GFX1064-NEXT:    s_cbranch_execz BB17_2
3612; GFX1064-NEXT:  BB17_1:
3613; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3614; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
3615; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3616; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3617; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v7
3618; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3619; GFX1064-NEXT:    buffer_gl0_inv
3620; GFX1064-NEXT:    buffer_gl1_inv
3621; GFX1064-NEXT:  BB17_2:
3622; GFX1064-NEXT:    v_nop
3623; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3624; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3625; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3626; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3627; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3628; GFX1064-NEXT:    s_nop 1
3629; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3630; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3631; GFX1064-NEXT:    s_endpgm
3632;
3633; GFX1032-LABEL: max_i32_varying:
3634; GFX1032:       ; %bb.0: ; %entry
3635; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3636; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3637; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3638; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3639; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
3640; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3641; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3642; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3643; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3644; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3645; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3646; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
3647; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3648; GFX1032-NEXT:    s_mov_b32 s2, -1
3649; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3650; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3651; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3652; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3653; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3654; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3655; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
3656; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3657; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
3658; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
3659; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
3660; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
3661; GFX1032-NEXT:    ; implicit-def: $vgpr0
3662; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3663; GFX1032-NEXT:    ; mask branch BB17_2
3664; GFX1032-NEXT:    s_cbranch_execz BB17_2
3665; GFX1032-NEXT:  BB17_1:
3666; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3667; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
3668; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3669; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3670; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v7
3671; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3672; GFX1032-NEXT:    buffer_gl0_inv
3673; GFX1032-NEXT:    buffer_gl1_inv
3674; GFX1032-NEXT:  BB17_2:
3675; GFX1032-NEXT:    v_nop
3676; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3677; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3678; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3679; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3680; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3681; GFX1032-NEXT:    s_nop 1
3682; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3683; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3684; GFX1032-NEXT:    s_endpgm
3685entry:
  ; Per-workitem operand for the atomic.
3686  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel signed max on the addrspace(3) global; %old is the value it returns.
3687  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3688  store i32 %old, i32 addrspace(1)* %out
3689  ret void
3690}
3691
; max_i64_constant: signed-max atomicrmw of the constant 5 on @local_var64; the
; value returned by the atomic is stored to %out.
; Every prefix below (GFX7LESS included) expects the same shape: a v_mbcnt
; result compared against 0 selects one lane via s_and_saveexec, that lane
; issues a single ds_max_rtn_i64 with operand 5, and the returned pair is
; broadcast with v_readfirstlane. Each lane then uses the same vcc to select
; either 0x80000000:0 (high:low) or 0:5 through v_cndmask, and combines that
; with the broadcast value using v_cmp_gt_i64 followed by two more v_cndmask.
; The 0x80000000 high dword appears as v_bfrev_b32 of 1 or as a literal,
; depending on the target.
3692define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3693;
3694;
3695; GFX7LESS-LABEL: max_i64_constant:
3696; GFX7LESS:       ; %bb.0: ; %entry
3697; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3698; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3699; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3700; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
3701; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3702; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3703; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3704; GFX7LESS-NEXT:    ; mask branch BB18_2
3705; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3706; GFX7LESS-NEXT:  BB18_1:
3707; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3708; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3709; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3710; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3711; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3712; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3713; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3714; GFX7LESS-NEXT:    buffer_wbinvl1
3715; GFX7LESS-NEXT:  BB18_2:
3716; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3717; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3718; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3719; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3720; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3721; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3722; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3723; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3724; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3725; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3726; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3727; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3728; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3729; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3730; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3731; GFX7LESS-NEXT:    s_endpgm
3732;
3733; GFX8-LABEL: max_i64_constant:
3734; GFX8:       ; %bb.0: ; %entry
3735; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3736; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3737; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3738; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3739; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3740; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3741; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3742; GFX8-NEXT:    ; mask branch BB18_2
3743; GFX8-NEXT:    s_cbranch_execz BB18_2
3744; GFX8-NEXT:  BB18_1:
3745; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3746; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3747; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3748; GFX8-NEXT:    s_mov_b32 m0, -1
3749; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3750; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3751; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3752; GFX8-NEXT:    buffer_wbinvl1_vol
3753; GFX8-NEXT:  BB18_2:
3754; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3755; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3756; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3757; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3758; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3759; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3760; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3761; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3762; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3763; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3764; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3765; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3766; GFX8-NEXT:    s_mov_b32 s2, -1
3767; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3768; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3769; GFX8-NEXT:    s_endpgm
3770;
3771; GFX9-LABEL: max_i64_constant:
3772; GFX9:       ; %bb.0: ; %entry
3773; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3774; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3775; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3776; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3777; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3778; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3779; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3780; GFX9-NEXT:    ; mask branch BB18_2
3781; GFX9-NEXT:    s_cbranch_execz BB18_2
3782; GFX9-NEXT:  BB18_1:
3783; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3784; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3785; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3786; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3787; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3788; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3789; GFX9-NEXT:    buffer_wbinvl1_vol
3790; GFX9-NEXT:  BB18_2:
3791; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3792; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3793; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3794; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3795; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3796; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3797; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3798; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3799; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3800; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3801; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3802; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3803; GFX9-NEXT:    s_mov_b32 s2, -1
3804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3805; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3806; GFX9-NEXT:    s_endpgm
3807;
3808; GFX1064-LABEL: max_i64_constant:
3809; GFX1064:       ; %bb.0: ; %entry
3810; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3811; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3812; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3813; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
3814; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3815; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3816; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3817; GFX1064-NEXT:    ; mask branch BB18_2
3818; GFX1064-NEXT:    s_cbranch_execz BB18_2
3819; GFX1064-NEXT:  BB18_1:
3820; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3821; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3822; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3823; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3824; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3825; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3826; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3827; GFX1064-NEXT:    buffer_gl0_inv
3828; GFX1064-NEXT:    buffer_gl1_inv
3829; GFX1064-NEXT:  BB18_2:
3830; GFX1064-NEXT:    v_nop
3831; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3832; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
3833; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
3834; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3835; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3836; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3837; GFX1064-NEXT:    s_mov_b32 s2, -1
3838; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3839; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
3840; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
3841; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3842; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3843; GFX1064-NEXT:    s_endpgm
3844;
3845; GFX1032-LABEL: max_i64_constant:
3846; GFX1032:       ; %bb.0: ; %entry
3847; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3848; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
3849; GFX1032-NEXT:    ; implicit-def: $vcc_hi
3850; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3851; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3852; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3853; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3854; GFX1032-NEXT:    ; mask branch BB18_2
3855; GFX1032-NEXT:    s_cbranch_execz BB18_2
3856; GFX1032-NEXT:  BB18_1:
3857; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3858; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3859; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3860; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3861; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3862; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3863; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3864; GFX1032-NEXT:    buffer_gl0_inv
3865; GFX1032-NEXT:    buffer_gl1_inv
3866; GFX1032-NEXT:  BB18_2:
3867; GFX1032-NEXT:    v_nop
3868; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3869; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
3870; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
3871; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3872; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3873; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3874; GFX1032-NEXT:    s_mov_b32 s2, -1
3875; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1]
3876; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
3877; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
3878; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3879; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3880; GFX1032-NEXT:    s_endpgm
3881entry:
  ; The operand is the literal 5, identical for every lane; %old is the value
  ; the acq_rel atomic returns.
3882  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3883  store i64 %old, i64 addrspace(1)* %out
3884  ret void
3885}
3886
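; NOTE: the GFX8MORE32 and GFX8MORE prefixes on the next three lines are not
; enabled by any RUN line at the top of this file, so FileCheck does not
; evaluate these patterns.
3887; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
3888; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
3889; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]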
; min_i32_varying: signed-min atomicrmw on @local_var32 whose operand is the
; workitem id, so it differs per lane; the value returned by the atomic is
; stored to %out.
; What the expected output below shows:
;  - GFX7LESS checks: one ds_min_rtn_i32 fed directly from v0, with no
;    mbcnt/exec masking around it.
;  - GFX8 and GFX9 checks: inactive lanes are filled from v_bfrev_b32 of -2
;    (0x7fffffff, presumably chosen as the identity for signed min), the
;    values are combined with v_min_i32_dpp steps (row_shr 1/2/4/8, then
;    row_bcast 15 and 31), lane 63 is read back as the wave-wide operand, and
;    a single ds_min_rtn_i32 runs under s_and_saveexec for the lane whose
;    mbcnt result is 0. Afterwards v_readfirstlane plus v_min_i32 rebuild the
;    per-lane result.
;  - GFX1064 and GFX1032 checks: same shape, but the cross-row steps use
;    v_permlanex16_b32 and v_readlane/v_writelane instead of row_bcast.
3890define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3891;
3892;
3893; GFX7LESS-LABEL: min_i32_varying:
3894; GFX7LESS:       ; %bb.0: ; %entry
3895; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3896; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3897; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3898; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3899; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3900; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3901; GFX7LESS-NEXT:    buffer_wbinvl1
3902; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3903; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3904; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3905; GFX7LESS-NEXT:    s_endpgm
3906;
3907; GFX8-LABEL: min_i32_varying:
3908; GFX8:       ; %bb.0: ; %entry
3909; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3910; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3911; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3912; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3913; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3914; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3915; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3916; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3917; GFX8-NEXT:    s_not_b64 exec, exec
3918; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3919; GFX8-NEXT:    s_not_b64 exec, exec
3920; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
3921; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3922; GFX8-NEXT:    s_nop 1
3923; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3924; GFX8-NEXT:    s_nop 1
3925; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3926; GFX8-NEXT:    s_nop 1
3927; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3928; GFX8-NEXT:    s_nop 1
3929; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3930; GFX8-NEXT:    s_nop 1
3931; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3932; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
3933; GFX8-NEXT:    s_nop 0
3934; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3935; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
3936; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3937; GFX8-NEXT:    ; implicit-def: $vgpr0
3938; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3939; GFX8-NEXT:    ; mask branch BB19_2
3940; GFX8-NEXT:    s_cbranch_execz BB19_2
3941; GFX8-NEXT:  BB19_1:
3942; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3943; GFX8-NEXT:    v_mov_b32_e32 v3, s2
3944; GFX8-NEXT:    s_mov_b32 m0, -1
3945; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3946; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3947; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3948; GFX8-NEXT:    buffer_wbinvl1_vol
3949; GFX8-NEXT:  BB19_2:
3950; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3951; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3952; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3953; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3954; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3955; GFX8-NEXT:    s_mov_b32 s2, -1
3956; GFX8-NEXT:    s_nop 0
3957; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3958; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3959; GFX8-NEXT:    s_endpgm
3960;
3961; GFX9-LABEL: min_i32_varying:
3962; GFX9:       ; %bb.0: ; %entry
3963; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3964; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
3965; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
3966; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
3967; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3968; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3969; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3970; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3971; GFX9-NEXT:    s_not_b64 exec, exec
3972; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3973; GFX9-NEXT:    s_not_b64 exec, exec
3974; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
3975; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3976; GFX9-NEXT:    s_nop 1
3977; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3978; GFX9-NEXT:    s_nop 1
3979; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3980; GFX9-NEXT:    s_nop 1
3981; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3982; GFX9-NEXT:    s_nop 1
3983; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3984; GFX9-NEXT:    s_nop 1
3985; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3986; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
3987; GFX9-NEXT:    s_nop 0
3988; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3989; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
3990; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3991; GFX9-NEXT:    ; implicit-def: $vgpr0
3992; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3993; GFX9-NEXT:    ; mask branch BB19_2
3994; GFX9-NEXT:    s_cbranch_execz BB19_2
3995; GFX9-NEXT:  BB19_1:
3996; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3997; GFX9-NEXT:    v_mov_b32_e32 v3, s2
3998; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3999; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
4000; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4001; GFX9-NEXT:    buffer_wbinvl1_vol
4002; GFX9-NEXT:  BB19_2:
4003; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4004; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4005; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4006; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
4007; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4008; GFX9-NEXT:    s_mov_b32 s2, -1
4009; GFX9-NEXT:    s_nop 0
4010; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4011; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4012; GFX9-NEXT:    s_endpgm
4013;
4014; GFX1064-LABEL: min_i32_varying:
4015; GFX1064:       ; %bb.0: ; %entry
4016; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4017; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4018; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4019; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4020; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
4021; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4022; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
4023; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4024; GFX1064-NEXT:    s_not_b64 exec, exec
4025; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4026; GFX1064-NEXT:    s_not_b64 exec, exec
4027; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4028; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4029; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4030; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4031; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4032; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4033; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4034; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4035; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4036; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4037; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4038; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4039; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4040; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4041; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4042; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4043; GFX1064-NEXT:    s_mov_b32 s2, -1
4044; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4045; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4046; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4047; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4048; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4049; GFX1064-NEXT:    ; implicit-def: $vgpr0
4050; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4051; GFX1064-NEXT:    ; mask branch BB19_2
4052; GFX1064-NEXT:    s_cbranch_execz BB19_2
4053; GFX1064-NEXT:  BB19_1:
4054; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4055; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4056; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4057; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4058; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v7
4059; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4060; GFX1064-NEXT:    buffer_gl0_inv
4061; GFX1064-NEXT:    buffer_gl1_inv
4062; GFX1064-NEXT:  BB19_2:
4063; GFX1064-NEXT:    v_nop
4064; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4065; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4066; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4067; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
4068; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4069; GFX1064-NEXT:    s_nop 1
4070; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4071; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4072; GFX1064-NEXT:    s_endpgm
4073;
4074; GFX1032-LABEL: min_i32_varying:
4075; GFX1032:       ; %bb.0: ; %entry
4076; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4077; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4078; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4079; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4080; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4081; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4082; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
4083; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4084; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4085; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4086; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4087; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4088; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4089; GFX1032-NEXT:    s_mov_b32 s2, -1
4090; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4091; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4092; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4093; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4094; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4095; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4096; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4097; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4098; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4099; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4100; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4101; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4102; GFX1032-NEXT:    ; implicit-def: $vgpr0
4103; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4104; GFX1032-NEXT:    ; mask branch BB19_2
4105; GFX1032-NEXT:    s_cbranch_execz BB19_2
4106; GFX1032-NEXT:  BB19_1:
4107; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4108; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4109; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4110; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4111; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v7
4112; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4113; GFX1032-NEXT:    buffer_gl0_inv
4114; GFX1032-NEXT:    buffer_gl1_inv
4115; GFX1032-NEXT:  BB19_2:
4116; GFX1032-NEXT:    v_nop
4117; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4118; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4119; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4120; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
4121; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4122; GFX1032-NEXT:    s_nop 1
4123; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4124; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4125; GFX1032-NEXT:    s_endpgm
4126entry:
  ; Per-workitem operand for the atomic.
4127  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel signed min on the addrspace(3) global; %old is the value it returns.
4128  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4129  store i32 %old, i32 addrspace(1)* %out
4130  ret void
4131}
4132
; Signed 64-bit LDS atomic min where every lane supplies the same constant.
;
; The kernel performs `atomicrmw min` of the constant 5 on @local_var64 and
; stores the returned old value to %out. The expected code has the same shape
; for all five configurations:
;   - v_mbcnt_* followed by a compare against 0 picks out a single lane;
;   - only that lane executes one ds_min_rtn_i64 with the operand 5;
;   - v_readfirstlane_b32 copies the returned old value into SGPRs;
;   - each lane then takes the signed minimum (v_cmp_lt_i64 + v_cndmask) of
;     that old value and either 0x7fffffffffffffff (the selected lane) or
;     5 (every other lane).
; The gfx10 variants additionally expect s_waitcnt_vscnt and the
; buffer_gl0_inv / buffer_gl1_inv pair around the DS operation.
define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    ; mask branch BB20_2
; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
; GFX7LESS-NEXT:  BB20_1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  BB20_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: min_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    ; mask branch BB20_2
; GFX8-NEXT:    s_cbranch_execz BB20_2
; GFX8-NEXT:  BB20_1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB20_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: min_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    ; mask branch BB20_2
; GFX9-NEXT:    s_cbranch_execz BB20_2
; GFX9-NEXT:  BB20_1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB20_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    ; mask branch BB20_2
; GFX1064-NEXT:    s_cbranch_execz BB20_2
; GFX1064-NEXT:  BB20_1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB20_2:
; GFX1064-NEXT:    v_nop
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT:    ; implicit-def: $vcc_hi
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    ; mask branch BB20_2
; GFX1032-NEXT:    s_cbranch_execz BB20_2
; GFX1032-NEXT:  BB20_1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB20_2:
; GFX1032-NEXT:    v_nop
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  ; The operand is the constant 5 for every lane; atomicrmw yields the value
  ; that was in memory before the update, which is what gets stored to %out.
  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
4327
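; NOTE: The GFX8MORE and GFX8MORE32 prefixes are not passed to FileCheck by
; any of the command lines at the top of this file, so the three patterns
; below are never checked. The autogenerated assertions inside the function
; that follows cover the same sequence (readlane of the reduced value, copy
; to a VGPR, ds_max_rtn_u32).
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]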
; Unsigned 32-bit LDS atomic max where each lane supplies a different value.
;
; Every lane passes its own workitem id to `atomicrmw umax` on @local_var32
; and stores the returned old value to %out. Expected code per configuration:
;   - On GFX7LESS there is no wave-level combining; ds_max_rtn_u32 is issued
;     directly with the lane's own value in v0.
;   - On GFX8 and GFX9, lanes outside the current exec mask get 0 written to
;     the scan register (between the two s_not_b64), then a v_max_u32_dpp
;     row_shr / row_bcast sequence combines the lanes. Lane 63 is read for
;     the whole-wave value and a wave_shr:1 move builds the shifted per-lane
;     value in v1.
;   - On gfx10 the row_bcast / wave_shr steps are replaced by
;     v_permlanex16_b32 plus v_readlane_b32 / v_writelane_b32. The value fed
;     to the DS operation is read from lane 63 in the wave64 run and from
;     lane 31 in the wave32 run.
;   - In every combined variant a single lane (the one whose v_mbcnt count
;     is 0) issues ds_max_rtn_u32; the result is broadcast with
;     v_readfirstlane_b32 and merged with v1 by v_max_u32.
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umax_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    ; mask branch BB21_2
; GFX8-NEXT:    s_cbranch_execz BB21_2
; GFX8-NEXT:  BB21_1:
; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB21_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    ; mask branch BB21_2
; GFX9-NEXT:    s_cbranch_execz BB21_2
; GFX9-NEXT:  BB21_1:
; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB21_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
; GFX1064-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    ; mask branch BB21_2
; GFX1064-NEXT:    s_cbranch_execz BB21_2
; GFX1064-NEXT:  BB21_1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB21_2:
; GFX1064-NEXT:    v_nop
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_nop 1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    ; implicit-def: $vcc_hi
; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT:    v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT:    ; mask branch BB21_2
; GFX1032-NEXT:    s_cbranch_execz BB21_2
; GFX1032-NEXT:  BB21_1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v7
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB21_2:
; GFX1032-NEXT:    v_nop
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_nop 1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  ; %lane differs per work-item, which is what makes the operand "varying".
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
4573
; Unsigned 64-bit LDS atomic max where every lane supplies the same constant.
;
; The kernel performs `atomicrmw umax` of the constant 5 on @local_var64 and
; stores the returned old value to %out. The expected code has the same shape
; for all five configurations:
;   - v_mbcnt_* followed by a compare against 0 picks out a single lane;
;   - only that lane executes one ds_max_rtn_u64 with the operand 5;
;   - v_readfirstlane_b32 copies the returned old value into SGPRs;
;   - each lane then takes the unsigned maximum (v_cmp_gt_u64 + v_cndmask)
;     of that old value and either 0 (the selected lane) or 5 (every other
;     lane).
; The gfx10 variants additionally expect s_waitcnt_vscnt and the
; buffer_gl0_inv / buffer_gl1_inv pair around the DS operation.
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    ; mask branch BB22_2
; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
; GFX7LESS-NEXT:  BB22_1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  BB22_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    ; mask branch BB22_2
; GFX8-NEXT:    s_cbranch_execz BB22_2
; GFX8-NEXT:  BB22_1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  BB22_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    ; mask branch BB22_2
; GFX9-NEXT:    s_cbranch_execz BB22_2
; GFX9-NEXT:  BB22_1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  BB22_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    ; mask branch BB22_2
; GFX1064-NEXT:    s_cbranch_execz BB22_2
; GFX1064-NEXT:  BB22_1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  BB22_2:
; GFX1064-NEXT:    v_nop
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT:    ; implicit-def: $vcc_hi
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    ; mask branch BB22_2
; GFX1032-NEXT:    s_cbranch_execz BB22_2
; GFX1032-NEXT:  BB22_1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  BB22_2:
; GFX1032-NEXT:    v_nop
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s5, vcc_lo
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  ; The operand is the constant 5 for every lane; atomicrmw yields the value
  ; that was in memory before the update, which is what gets stored to %out.
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
4765
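; NOTE: The GFX8MORE and GFX8MORE32 prefixes are not passed to FileCheck by
; any of the command lines at the top of this file, so the three patterns
; below are never checked; only the autogenerated assertions inside the
; function that follows are verified.
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]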
4769define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4770;
4771;
4772; GFX7LESS-LABEL: umin_i32_varying:
4773; GFX7LESS:       ; %bb.0: ; %entry
4774; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4775; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4776; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4777; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4778; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4779; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4780; GFX7LESS-NEXT:    buffer_wbinvl1
4781; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4782; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4783; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4784; GFX7LESS-NEXT:    s_endpgm
4785;
4786; GFX8-LABEL: umin_i32_varying:
4787; GFX8:       ; %bb.0: ; %entry
4788; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4789; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4790; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
4791; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
4792; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4793; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4794; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4795; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4796; GFX8-NEXT:    s_not_b64 exec, exec
4797; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4798; GFX8-NEXT:    s_not_b64 exec, exec
4799; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
4800; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4801; GFX8-NEXT:    s_nop 1
4802; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4803; GFX8-NEXT:    s_nop 1
4804; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4805; GFX8-NEXT:    s_nop 1
4806; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4807; GFX8-NEXT:    s_nop 1
4808; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4809; GFX8-NEXT:    s_nop 1
4810; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4811; GFX8-NEXT:    v_readlane_b32 s2, v2, 63
4812; GFX8-NEXT:    s_nop 0
4813; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4814; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
4815; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4816; GFX8-NEXT:    ; implicit-def: $vgpr0
4817; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4818; GFX8-NEXT:    ; mask branch BB23_2
4819; GFX8-NEXT:    s_cbranch_execz BB23_2
4820; GFX8-NEXT:  BB23_1:
4821; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4822; GFX8-NEXT:    v_mov_b32_e32 v3, s2
4823; GFX8-NEXT:    s_mov_b32 m0, -1
4824; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4825; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4826; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4827; GFX8-NEXT:    buffer_wbinvl1_vol
4828; GFX8-NEXT:  BB23_2:
4829; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4830; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4831; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4832; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4833; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4834; GFX8-NEXT:    s_mov_b32 s2, -1
4835; GFX8-NEXT:    s_nop 0
4836; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4837; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4838; GFX8-NEXT:    s_endpgm
4839;
4840; GFX9-LABEL: umin_i32_varying:
4841; GFX9:       ; %bb.0: ; %entry
4842; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4843; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4844; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, s2, 0
4845; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, s3, v3
4846; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4847; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4848; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4849; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4850; GFX9-NEXT:    s_not_b64 exec, exec
4851; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4852; GFX9-NEXT:    s_not_b64 exec, exec
4853; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
4854; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4855; GFX9-NEXT:    s_nop 1
4856; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4857; GFX9-NEXT:    s_nop 1
4858; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4859; GFX9-NEXT:    s_nop 1
4860; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4861; GFX9-NEXT:    s_nop 1
4862; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4863; GFX9-NEXT:    s_nop 1
4864; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4865; GFX9-NEXT:    v_readlane_b32 s2, v2, 63
4866; GFX9-NEXT:    s_nop 0
4867; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4868; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
4869; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4870; GFX9-NEXT:    ; implicit-def: $vgpr0
4871; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4872; GFX9-NEXT:    ; mask branch BB23_2
4873; GFX9-NEXT:    s_cbranch_execz BB23_2
4874; GFX9-NEXT:  BB23_1:
4875; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4876; GFX9-NEXT:    v_mov_b32_e32 v3, s2
4877; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4878; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4879; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4880; GFX9-NEXT:    buffer_wbinvl1_vol
4881; GFX9-NEXT:  BB23_2:
4882; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4883; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4884; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4885; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4886; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4887; GFX9-NEXT:    s_mov_b32 s2, -1
4888; GFX9-NEXT:    s_nop 0
4889; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4890; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4891; GFX9-NEXT:    s_endpgm
4892;
4893; GFX1064-LABEL: umin_i32_varying:
4894; GFX1064:       ; %bb.0: ; %entry
4895; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4896; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
4897; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
4898; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4899; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v4, s3, v4
4900; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4901; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4902; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4903; GFX1064-NEXT:    s_not_b64 exec, exec
4904; GFX1064-NEXT:    v_mov_b32_e32 v2, -1
4905; GFX1064-NEXT:    s_not_b64 exec, exec
4906; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4907; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4908; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4909; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4910; GFX1064-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4911; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
4912; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4913; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4914; GFX1064-NEXT:    v_readlane_b32 s2, v2, 31
4915; GFX1064-NEXT:    v_mov_b32_e32 v3, s2
4916; GFX1064-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4917; GFX1064-NEXT:    v_readlane_b32 s2, v2, 15
4918; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4919; GFX1064-NEXT:    v_readlane_b32 s3, v2, 31
4920; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
4921; GFX1064-NEXT:    v_writelane_b32 v1, s2, 16
4922; GFX1064-NEXT:    s_mov_b32 s2, -1
4923; GFX1064-NEXT:    v_writelane_b32 v1, s3, 32
4924; GFX1064-NEXT:    v_readlane_b32 s3, v2, 63
4925; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
4926; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4927; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4928; GFX1064-NEXT:    ; implicit-def: $vgpr0
4929; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4930; GFX1064-NEXT:    ; mask branch BB23_2
4931; GFX1064-NEXT:    s_cbranch_execz BB23_2
4932; GFX1064-NEXT:  BB23_1:
4933; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4934; GFX1064-NEXT:    v_mov_b32_e32 v7, s3
4935; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4936; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4937; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v7
4938; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4939; GFX1064-NEXT:    buffer_gl0_inv
4940; GFX1064-NEXT:    buffer_gl1_inv
4941; GFX1064-NEXT:  BB23_2:
4942; GFX1064-NEXT:    v_nop
4943; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4944; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4945; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
4946; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4947; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4948; GFX1064-NEXT:    s_nop 1
4949; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4950; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4951; GFX1064-NEXT:    s_endpgm
4952;
4953; GFX1032-LABEL: umin_i32_varying:
4954; GFX1032:       ; %bb.0: ; %entry
4955; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4956; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
4957; GFX1032-NEXT:    ; implicit-def: $vcc_hi
4958; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
4959; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v4, s2, 0
4960; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4961; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4962; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4963; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4964; GFX1032-NEXT:    v_mov_b32_e32 v2, -1
4965; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4966; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
4967; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4968; GFX1032-NEXT:    s_mov_b32 s2, -1
4969; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4970; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4971; GFX1032-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4972; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
4973; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
4974; GFX1032-NEXT:    v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4975; GFX1032-NEXT:    v_readlane_b32 s3, v2, 31
4976; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4977; GFX1032-NEXT:    v_readlane_b32 s5, v2, 15
4978; GFX1032-NEXT:    v_writelane_b32 v1, s5, 16
4979; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
4980; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4981; GFX1032-NEXT:    ; implicit-def: $vgpr0
4982; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4983; GFX1032-NEXT:    ; mask branch BB23_2
4984; GFX1032-NEXT:    s_cbranch_execz BB23_2
4985; GFX1032-NEXT:  BB23_1:
4986; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4987; GFX1032-NEXT:    v_mov_b32_e32 v7, s3
4988; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4989; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4990; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v7
4991; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4992; GFX1032-NEXT:    buffer_gl0_inv
4993; GFX1032-NEXT:    buffer_gl1_inv
4994; GFX1032-NEXT:  BB23_2:
4995; GFX1032-NEXT:    v_nop
4996; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4997; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4998; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
4999; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
5000; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5001; GFX1032-NEXT:    s_nop 1
5002; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5003; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5004; GFX1032-NEXT:    s_endpgm
5005entry:
5006  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5007  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5008  store i32 %old, i32 addrspace(1)* %out
5009  ret void
5010}
5011
; umin_i64_constant: an `atomicrmw umin` on the local (addrspace(3)) global
; @local_var64 whose operand is the constant 5, with the returned old value
; stored to %out.
;
; Shape shared by every set of expectations below:
;   * v_mbcnt_* counts the active lanes below the current one and the compare
;     against 0 leaves only the first active lane enabled for block BB24_1.
;   * That lane alone issues a single ds_min_rtn_u64 with the value 5.
;   * After exec is restored, v_readfirstlane broadcasts the returned old
;     value into s[4:5].
;   * Each lane rebuilds its own result: v_cndmask picks -1 (all ones, the
;     unsigned-min identity) for the first lane and 5 for every other lane,
;     then v_cmp_lt_u64 plus v_cndmask take the unsigned minimum of that and
;     the broadcast value.
; Differences between the runs visible in the checks: the two oldest targets
; write m0 before the DS instruction, the gfx1010 runs add s_waitcnt_vscnt and
; use buffer_gl0_inv/buffer_gl1_inv for the cache invalidate, and the wave32
; run uses 32-bit exec/vcc operations and only v_mbcnt_lo.
5012define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
5013;
5014;
5015; GFX7LESS-LABEL: umin_i64_constant:
5016; GFX7LESS:       ; %bb.0: ; %entry
5017; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5018; GFX7LESS-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
5019; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
5020; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
5021; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5022; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5023; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5024; GFX7LESS-NEXT:    ; mask branch BB24_2
5025; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
5026; GFX7LESS-NEXT:  BB24_1:
5027; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5028; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5029; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5030; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5031; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5032; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5033; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5034; GFX7LESS-NEXT:    buffer_wbinvl1
5035; GFX7LESS-NEXT:  BB24_2:
5036; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5037; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5038; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5039; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5040; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5041; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5042; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
5043; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5044; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5045; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
5046; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5047; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5048; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5049; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5050; GFX7LESS-NEXT:    s_endpgm
5051;
5052; GFX8-LABEL: umin_i64_constant:
5053; GFX8:       ; %bb.0: ; %entry
5054; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5055; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
5056; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
5057; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
5058; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5059; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5060; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5061; GFX8-NEXT:    ; mask branch BB24_2
5062; GFX8-NEXT:    s_cbranch_execz BB24_2
5063; GFX8-NEXT:  BB24_1:
5064; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5065; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5066; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5067; GFX8-NEXT:    s_mov_b32 m0, -1
5068; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5069; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5070; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5071; GFX8-NEXT:    buffer_wbinvl1_vol
5072; GFX8-NEXT:  BB24_2:
5073; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5074; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5075; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5076; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5077; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5078; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5079; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5080; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5081; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5082; GFX8-NEXT:    s_mov_b32 s2, -1
5083; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5084; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5085; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5086; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5087; GFX8-NEXT:    s_endpgm
5088;
5089; GFX9-LABEL: umin_i64_constant:
5090; GFX9:       ; %bb.0: ; %entry
5091; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5092; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
5093; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
5094; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
5095; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5096; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5097; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5098; GFX9-NEXT:    ; mask branch BB24_2
5099; GFX9-NEXT:    s_cbranch_execz BB24_2
5100; GFX9-NEXT:  BB24_1:
5101; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5102; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5103; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5104; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5105; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5106; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5107; GFX9-NEXT:    buffer_wbinvl1_vol
5108; GFX9-NEXT:  BB24_2:
5109; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5110; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5111; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5112; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5113; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5114; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5115; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5116; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5117; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5118; GFX9-NEXT:    s_mov_b32 s2, -1
5119; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5120; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5122; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5123; GFX9-NEXT:    s_endpgm
5124;
5125; GFX1064-LABEL: umin_i64_constant:
5126; GFX1064:       ; %bb.0: ; %entry
5127; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
5128; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5129; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
5130; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
5131; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5132; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5133; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5134; GFX1064-NEXT:    ; mask branch BB24_2
5135; GFX1064-NEXT:    s_cbranch_execz BB24_2
5136; GFX1064-NEXT:  BB24_1:
5137; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5138; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5139; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5140; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5141; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5142; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5143; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5144; GFX1064-NEXT:    buffer_gl0_inv
5145; GFX1064-NEXT:    buffer_gl1_inv
5146; GFX1064-NEXT:  BB24_2:
5147; GFX1064-NEXT:    v_nop
5148; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5149; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
5150; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
5151; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
5152; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5153; GFX1064-NEXT:    s_mov_b32 s2, -1
5154; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5155; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
5156; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc
5157; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc
5158; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5159; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5160; GFX1064-NEXT:    s_endpgm
5161;
5162; GFX1032-LABEL: umin_i64_constant:
5163; GFX1032:       ; %bb.0: ; %entry
5164; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5165; GFX1032-NEXT:    v_cmp_ne_u32_e64 s2, 1, 0
5166; GFX1032-NEXT:    ; implicit-def: $vcc_hi
5167; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
5168; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5169; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5170; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5171; GFX1032-NEXT:    ; mask branch BB24_2
5172; GFX1032-NEXT:    s_cbranch_execz BB24_2
5173; GFX1032-NEXT:  BB24_1:
5174; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5175; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
5176; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5177; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5178; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5179; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
5180; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5181; GFX1032-NEXT:    buffer_gl0_inv
5182; GFX1032-NEXT:    buffer_gl1_inv
5183; GFX1032-NEXT:  BB24_2:
5184; GFX1032-NEXT:    v_nop
5185; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5186; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
5187; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
5188; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
5189; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5190; GFX1032-NEXT:    s_mov_b32 s2, -1
5191; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5192; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
5193; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s5, vcc_lo
5194; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
5195; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5196; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5197; GFX1032-NEXT:    s_endpgm
5198entry:
; IR under test: the umin operand is the same constant (5) in every lane, and
; the value the atomic returns is what gets written to %out.
5199  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
5200  store i64 %old, i64 addrspace(1)* %out
5201  ret void
5202}
5203