1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7
; Intrinsic called by the *_varying tests below; its result is used as the
; atomicrmw operand there.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; addrspace(3) globals. @local_var32 is the target of every atomicrmw in the
; i32 tests; @local_var64 is presumably the target of the i64 tests (their
; bodies are further down the file -- confirm there).
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; add_i32_constant: performs `atomicrmw add` of the constant 5 on @local_var32
; (acq_rel) and stores the returned old value to %out.
;
; Shape of the checked output, common to all five run lines:
;   * count the active lanes with s_bcnt1 on a copy of exec and multiply the
;     count by 5 (s_mul_i32);
;   * only the lane whose mbcnt result equals 0 issues a single
;     ds_add_rtn_u32 with that product;
;   * afterwards each lane rebuilds its own value as
;     readfirstlane(old) + 5 * mbcnt using v_mad_u32_u24.
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
32; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
61; GFX8-NEXT:    v_mov_b32_e32 v2, s2
62; GFX8-NEXT:    s_mov_b32 m0, -1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
71; GFX8-NEXT:    s_mov_b32 s3, 0xf000
72; GFX8-NEXT:    s_mov_b32 s2, -1
73; GFX8-NEXT:    s_nop 1
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
91; GFX9-NEXT:    v_mov_b32_e32 v2, s2
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_nop 1
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
119; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
120; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
121; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
122; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
124; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX1064-NEXT:    buffer_gl0_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
135; GFX1064-NEXT:    s_endpgm
136;
137; GFX1032-LABEL: add_i32_constant:
138; GFX1032:       ; %bb.0: ; %entry
139; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
140; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
141; GFX1032-NEXT:    ; implicit-def: $vgpr1
142; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
143; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
144; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
145; GFX1032-NEXT:    s_cbranch_execz BB0_2
146; GFX1032-NEXT:  ; %bb.1:
147; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
148; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
149; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
150; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
151; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
153; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
154; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX1032-NEXT:    buffer_gl0_inv
156; GFX1032-NEXT:  BB0_2:
157; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
158; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
159; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
160; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
161; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
162; GFX1032-NEXT:    s_mov_b32 s2, -1
163; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
165; GFX1032-NEXT:    s_endpgm
166entry:
  ; The operand is the compile-time constant 5; the returned old value is kept
  ; live by the store to %out.
167  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
168  store i32 %old, i32 addrspace(1)* %out
169  ret void
170}
171
; add_i32_uniform: performs `atomicrmw add` of the kernel argument %additive on
; @local_var32 (acq_rel) and stores the returned old value to %out.
;
; Shape of the checked output, common to all five run lines:
;   * the argument is loaded into an SGPR with s_load_dword and multiplied by
;     the active-lane count (s_bcnt1 followed by s_mul_i32);
;   * only the lane whose mbcnt result equals 0 issues a single
;     ds_add_rtn_u32 with that product;
;   * afterwards each lane rebuilds its own value as
;     readfirstlane(old) + additive * mbcnt (v_mul_lo_u32 followed by an add).
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
173;
174;
175; GFX7LESS-LABEL: add_i32_uniform:
176; GFX7LESS:       ; %bb.0: ; %entry
177; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
178; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
179; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
180; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
181; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
182; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
183; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
184; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
185; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
186; GFX7LESS-NEXT:  ; %bb.1:
187; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
188; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
190; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
191; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
192; GFX7LESS-NEXT:    s_mov_b32 m0, -1
193; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:  BB1_2:
197; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
198; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
200; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
201; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
202; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
203; GFX7LESS-NEXT:    s_mov_b32 s6, -1
204; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
205; GFX7LESS-NEXT:    s_endpgm
206;
207; GFX8-LABEL: add_i32_uniform:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
210; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
211; GFX8-NEXT:    s_mov_b64 s[2:3], exec
212; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
213; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
214; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
215; GFX8-NEXT:    ; implicit-def: $vgpr1
216; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
217; GFX8-NEXT:    s_cbranch_execz BB1_2
218; GFX8-NEXT:  ; %bb.1:
219; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX8-NEXT:    s_mul_i32 s1, s0, s1
222; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
223; GFX8-NEXT:    v_mov_b32_e32 v2, s1
224; GFX8-NEXT:    s_mov_b32 m0, -1
225; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:  BB1_2:
229; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
232; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
233; GFX8-NEXT:    s_mov_b32 s7, 0xf000
234; GFX8-NEXT:    s_mov_b32 s6, -1
235; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
236; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
237; GFX8-NEXT:    s_endpgm
238;
239; GFX9-LABEL: add_i32_uniform:
240; GFX9:       ; %bb.0: ; %entry
241; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
242; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
243; GFX9-NEXT:    s_mov_b64 s[6:7], exec
244; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
245; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
246; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX9-NEXT:    ; implicit-def: $vgpr1
248; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
249; GFX9-NEXT:    s_cbranch_execz BB1_2
250; GFX9-NEXT:  ; %bb.1:
251; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    s_mul_i32 s3, s2, s3
254; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
255; GFX9-NEXT:    v_mov_b32_e32 v2, s3
256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:  BB1_2:
260; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
263; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
264; GFX9-NEXT:    s_mov_b32 s7, 0xf000
265; GFX9-NEXT:    s_mov_b32 s6, -1
266; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
267; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
268; GFX9-NEXT:    s_endpgm
269;
270; GFX1064-LABEL: add_i32_uniform:
271; GFX1064:       ; %bb.0: ; %entry
272; GFX1064-NEXT:    s_clause 0x1
273; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
274; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
275; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
276; GFX1064-NEXT:    ; implicit-def: $vgpr1
277; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
278; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
279; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
280; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
281; GFX1064-NEXT:    s_cbranch_execz BB1_2
282; GFX1064-NEXT:  ; %bb.1:
283; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
284; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
285; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
287; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
288; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
289; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    buffer_gl0_inv
293; GFX1064-NEXT:  BB1_2:
294; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
295; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
296; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
298; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
299; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
300; GFX1064-NEXT:    s_mov_b32 s6, -1
301; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
302; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
303; GFX1064-NEXT:    s_endpgm
304;
305; GFX1032-LABEL: add_i32_uniform:
306; GFX1032:       ; %bb.0: ; %entry
307; GFX1032-NEXT:    s_clause 0x1
308; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
309; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
310; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
311; GFX1032-NEXT:    ; implicit-def: $vgpr1
312; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
313; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
314; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
315; GFX1032-NEXT:    s_cbranch_execz BB1_2
316; GFX1032-NEXT:  ; %bb.1:
317; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
318; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
319; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
321; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
322; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
323; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
324; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
325; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX1032-NEXT:    buffer_gl0_inv
327; GFX1032-NEXT:  BB1_2:
328; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
329; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
330; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
332; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
333; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
334; GFX1032-NEXT:    s_mov_b32 s6, -1
335; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
336; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
337; GFX1032-NEXT:    s_endpgm
338entry:
  ; The operand is a kernel argument rather than a constant; the returned old
  ; value is kept live by the store to %out.
339  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
340  store i32 %old, i32 addrspace(1)* %out
341  ret void
342}
343
; add_i32_varying: performs `atomicrmw add` on @local_var32 (acq_rel) with the
; result of @llvm.amdgcn.workitem.id.x() as the operand, and stores the returned
; old value to %out.
;
; Shape of the checked output:
;   * Run without -mcpu (GFX7LESS prefix): no lane combining is expected; the
;     output is a plain ds_add_rtn_u32 that uses v0 directly as the operand.
;   * tonga and gfx900 runs: a chain of v_add_u32_dpp steps (row_shr 1/2/4/8,
;     then row_bcast 15 and 31), v_readlane of lane 63 to get the value passed
;     to one ds_add_rtn_u32 issued only by the lane whose mbcnt result is 0, and
;     a wave_shr:1 v_mov_b32_dpp copy that is added to readfirstlane(old).
;   * gfx1010 runs: v_add_nc_u32_dpp row_shr steps plus v_permlanex16_b32;
;     v_readlane of lane 63 (wave64) or lane 31 (wave32) feeds the single
;     guarded ds_add_rtn_u32, and a row_shr:1 copy patched with v_writelane_b32
;     is added to readfirstlane(old).
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
345;
346;
347; GFX7LESS-LABEL: add_i32_varying:
348; GFX7LESS:       ; %bb.0: ; %entry
349; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
350; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
351; GFX7LESS-NEXT:    s_mov_b32 m0, -1
352; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
354; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
356; GFX7LESS-NEXT:    s_mov_b32 s2, -1
357; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
358; GFX7LESS-NEXT:    s_endpgm
359;
360; GFX8-LABEL: add_i32_varying:
361; GFX8:       ; %bb.0: ; %entry
362; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
363; GFX8-NEXT:    v_mov_b32_e32 v2, v0
364; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
365; GFX8-NEXT:    v_mov_b32_e32 v1, 0
366; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
367; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
368; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
369; GFX8-NEXT:    s_not_b64 exec, exec
370; GFX8-NEXT:    v_mov_b32_e32 v2, 0
371; GFX8-NEXT:    s_not_b64 exec, exec
372; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
373; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
374; GFX8-NEXT:    s_nop 1
375; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
376; GFX8-NEXT:    s_nop 1
377; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
378; GFX8-NEXT:    s_nop 1
379; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
380; GFX8-NEXT:    s_nop 1
381; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
382; GFX8-NEXT:    s_nop 1
383; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
384; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
385; GFX8-NEXT:    s_nop 0
386; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
387; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
388; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
389; GFX8-NEXT:    ; implicit-def: $vgpr0
390; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
391; GFX8-NEXT:    s_cbranch_execz BB2_2
392; GFX8-NEXT:  ; %bb.1:
393; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
394; GFX8-NEXT:    v_mov_b32_e32 v3, s4
395; GFX8-NEXT:    s_mov_b32 m0, -1
396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:  BB2_2:
400; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
403; GFX8-NEXT:    v_mov_b32_e32 v0, v1
404; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
405; GFX8-NEXT:    s_mov_b32 s3, 0xf000
406; GFX8-NEXT:    s_mov_b32 s2, -1
407; GFX8-NEXT:    s_nop 0
408; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
409; GFX8-NEXT:    s_endpgm
410;
411; GFX9-LABEL: add_i32_varying:
412; GFX9:       ; %bb.0: ; %entry
413; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
414; GFX9-NEXT:    v_mov_b32_e32 v2, v0
415; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
416; GFX9-NEXT:    v_mov_b32_e32 v1, 0
417; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
418; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
419; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
420; GFX9-NEXT:    s_not_b64 exec, exec
421; GFX9-NEXT:    v_mov_b32_e32 v2, 0
422; GFX9-NEXT:    s_not_b64 exec, exec
423; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
424; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
425; GFX9-NEXT:    s_nop 1
426; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
427; GFX9-NEXT:    s_nop 1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
435; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
436; GFX9-NEXT:    s_nop 0
437; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
438; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
439; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
440; GFX9-NEXT:    ; implicit-def: $vgpr0
441; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
442; GFX9-NEXT:    s_cbranch_execz BB2_2
443; GFX9-NEXT:  ; %bb.1:
444; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
445; GFX9-NEXT:    v_mov_b32_e32 v3, s4
446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:  BB2_2:
450; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
453; GFX9-NEXT:    v_mov_b32_e32 v0, v1
454; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
455; GFX9-NEXT:    s_mov_b32 s3, 0xf000
456; GFX9-NEXT:    s_mov_b32 s2, -1
457; GFX9-NEXT:    s_nop 0
458; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
459; GFX9-NEXT:    s_endpgm
460;
461; GFX1064-LABEL: add_i32_varying:
462; GFX1064:       ; %bb.0: ; %entry
463; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
464; GFX1064-NEXT:    s_not_b64 exec, exec
465; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
466; GFX1064-NEXT:    s_not_b64 exec, exec
467; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
468; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
469; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
470; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
472; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
473; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
474; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
475; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
476; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
477; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
478; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
479; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
480; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
481; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
482; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
483; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
484; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
485; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
486; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
487; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
488; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
489; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
490; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
491; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
492; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
493; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
494; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
495; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
496; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
497; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
498; GFX1064-NEXT:    s_mov_b32 s2, -1
499; GFX1064-NEXT:    ; implicit-def: $vgpr0
500; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
501; GFX1064-NEXT:    s_cbranch_execz BB2_2
502; GFX1064-NEXT:  ; %bb.1:
503; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
504; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
505; GFX1064-NEXT:    s_mov_b32 s3, s7
506; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
507; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
508; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
509; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX1064-NEXT:    buffer_gl0_inv
511; GFX1064-NEXT:  BB2_2:
512; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
513; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
514; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
515; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
516; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
517; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
518; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
520; GFX1064-NEXT:    s_endpgm
521;
522; GFX1032-LABEL: add_i32_varying:
523; GFX1032:       ; %bb.0: ; %entry
524; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
525; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
526; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
527; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
528; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
529; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
530; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
531; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
532; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
533; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
534; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
535; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
536; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
537; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
538; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
539; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
540; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
541; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
542; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
543; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
544; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
545; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
546; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
547; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
548; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
549; GFX1032-NEXT:    s_mov_b32 s2, -1
550; GFX1032-NEXT:    ; implicit-def: $vgpr0
551; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
552; GFX1032-NEXT:    s_cbranch_execz BB2_2
553; GFX1032-NEXT:  ; %bb.1:
554; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
555; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
556; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
557; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
558; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
559; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX1032-NEXT:    buffer_gl0_inv
561; GFX1032-NEXT:  BB2_2:
562; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
563; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
564; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
565; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
566; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
567; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
568; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
570; GFX1032-NEXT:    s_endpgm
571entry:
  ; The operand comes from the workitem-id intrinsic, so it is neither a
  ; constant nor a kernel argument; the old value is kept live by the store.
572  %lane = call i32 @llvm.amdgcn.workitem.id.x()
573  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
574  store i32 %old, i32 addrspace(1)* %out
575  ret void
576}
577
; add_i32_varying_nouse: same atomicrmw as add_i32_varying (operand taken from
; @llvm.amdgcn.workitem.id.x(), target @local_var32, acq_rel), but the returned
; value %old has no users and the kernel takes no arguments.
;
; Shape of the checked output:
;   * Run without -mcpu (GFX7LESS prefix): a plain non-returning ds_add_u32
;     using v0 directly, with no lane combining.
;   * tonga and gfx900 runs: v_add_u32_dpp chain (row_shr 1/2/4/8, row_bcast 15
;     and 31), v_readlane of lane 63, then one non-returning ds_add_u32 issued
;     only by the lane whose mbcnt result is 0.
;   * gfx1010 runs: v_add_nc_u32_dpp with row_xmask 1/2/4/8 plus
;     v_permlanex16_b32; the wave64 run sums lanes 0 and 32 with s_add_i32,
;     the wave32 run passes v1 on directly. One guarded ds_add_u32 follows.
;   * No run contains a readfirstlane or any per-lane result computation after
;     the DS instruction.
;
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
578define amdgpu_kernel void @add_i32_varying_nouse() {
579; GFX7LESS-LABEL: add_i32_varying_nouse:
580; GFX7LESS:       ; %bb.0: ; %entry
581; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
582; GFX7LESS-NEXT:    s_mov_b32 m0, -1
583; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX7LESS-NEXT:    ds_add_u32 v1, v0
585; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX7LESS-NEXT:    s_endpgm
587;
588; GFX8-LABEL: add_i32_varying_nouse:
589; GFX8:       ; %bb.0: ; %entry
590; GFX8-NEXT:    v_mov_b32_e32 v1, v0
591; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
592; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
593; GFX8-NEXT:    s_not_b64 exec, exec
594; GFX8-NEXT:    v_mov_b32_e32 v1, 0
595; GFX8-NEXT:    s_not_b64 exec, exec
596; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
597; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
598; GFX8-NEXT:    s_nop 1
599; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
600; GFX8-NEXT:    s_nop 1
601; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
602; GFX8-NEXT:    s_nop 1
603; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
604; GFX8-NEXT:    s_nop 1
605; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
606; GFX8-NEXT:    s_nop 1
607; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
608; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
609; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
610; GFX8-NEXT:    s_mov_b32 s0, s2
611; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
612; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
613; GFX8-NEXT:    s_cbranch_execz BB3_2
614; GFX8-NEXT:  ; %bb.1:
615; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
616; GFX8-NEXT:    v_mov_b32_e32 v2, s0
617; GFX8-NEXT:    s_mov_b32 m0, -1
618; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX8-NEXT:    ds_add_u32 v0, v2
620; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX8-NEXT:  BB3_2:
622; GFX8-NEXT:    s_endpgm
623;
624; GFX9-LABEL: add_i32_varying_nouse:
625; GFX9:       ; %bb.0: ; %entry
626; GFX9-NEXT:    v_mov_b32_e32 v1, v0
627; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
628; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
629; GFX9-NEXT:    s_not_b64 exec, exec
630; GFX9-NEXT:    v_mov_b32_e32 v1, 0
631; GFX9-NEXT:    s_not_b64 exec, exec
632; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
633; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
634; GFX9-NEXT:    s_nop 1
635; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX9-NEXT:    s_nop 1
637; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
638; GFX9-NEXT:    s_nop 1
639; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
640; GFX9-NEXT:    s_nop 1
641; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
642; GFX9-NEXT:    s_nop 1
643; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
644; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
645; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
646; GFX9-NEXT:    s_mov_b32 s0, s2
647; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
648; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
649; GFX9-NEXT:    s_cbranch_execz BB3_2
650; GFX9-NEXT:  ; %bb.1:
651; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
652; GFX9-NEXT:    v_mov_b32_e32 v2, s0
653; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX9-NEXT:    ds_add_u32 v0, v2
655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-NEXT:  BB3_2:
657; GFX9-NEXT:    s_endpgm
658;
659; GFX1064-LABEL: add_i32_varying_nouse:
660; GFX1064:       ; %bb.0: ; %entry
661; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
662; GFX1064-NEXT:    s_not_b64 exec, exec
663; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
664; GFX1064-NEXT:    s_not_b64 exec, exec
665; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
666; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
671; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
672; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
673; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
674; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
675; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
676; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
677; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
678; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
679; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
680; GFX1064-NEXT:    s_add_i32 s0, s2, s3
681; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
682; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
683; GFX1064-NEXT:    s_cbranch_execz BB3_2
684; GFX1064-NEXT:  ; %bb.1:
685; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
686; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
687; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
688; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
689; GFX1064-NEXT:    ds_add_u32 v0, v3
690; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX1064-NEXT:    buffer_gl0_inv
692; GFX1064-NEXT:  BB3_2:
693; GFX1064-NEXT:    s_endpgm
694;
695; GFX1032-LABEL: add_i32_varying_nouse:
696; GFX1032:       ; %bb.0: ; %entry
697; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
698; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
699; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
700; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
701; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
702; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
703; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
705; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
707; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
708; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
709; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
710; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
711; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
712; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
713; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
714; GFX1032-NEXT:    s_cbranch_execz BB3_2
715; GFX1032-NEXT:  ; %bb.1:
716; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var32@abs32@lo
717; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
718; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
719; GFX1032-NEXT:    ds_add_u32 v3, v0
720; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
721; GFX1032-NEXT:    buffer_gl0_inv
722; GFX1032-NEXT:  BB3_2:
723; GFX1032-NEXT:    s_endpgm
724entry:
  ; %old is never read; the function name suggests this is deliberate, so that
  ; the non-returning form of the DS add is exercised.
725  %lane = call i32 @llvm.amdgcn.workitem.id.x()
726  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
727  ret void
728}
729
; add_i64_constant: 64-bit atomicrmw add of the constant 5 to the
; addrspace(3) global @local_var64 (acq_rel ordering); the returned old
; value is stored to %out.
;
; Every configuration checked below has the same shape: only the lane whose
; v_mbcnt result is 0 executes %bb.1 (guarded by s_and_saveexec), where the
; s_bcnt1 count of the saved exec mask is multiplied by 5 and one
; ds_add_rtn_u64 is issued. After the s_or that restores exec, the result is
; read with v_readfirstlane and each lane adds 5 times its own v_mbcnt value
; (v_mul_*_u24 plus add/addc in the first configuration, v_mad_u64_u32 in the
; others). The wave32 run uses only v_mbcnt_lo and s_bcnt1_i32_b32.
;
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
730define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
731;
732;
733; GFX7LESS-LABEL: add_i64_constant:
734; GFX7LESS:       ; %bb.0: ; %entry
735; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
736; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
737; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
738; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
739; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
740; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
741; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
742; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
743; GFX7LESS-NEXT:  ; %bb.1:
744; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
745; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
746; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
747; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
748; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
749; GFX7LESS-NEXT:    s_mov_b32 m0, -1
750; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
752; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX7LESS-NEXT:  BB4_2:
754; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
755; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
756; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
757; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
758; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
759; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
760; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
761; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
762; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
763; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
764; GFX7LESS-NEXT:    s_mov_b32 s2, -1
765; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
766; GFX7LESS-NEXT:    s_endpgm
767;
768; GFX8-LABEL: add_i64_constant:
769; GFX8:       ; %bb.0: ; %entry
770; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
771; GFX8-NEXT:    s_mov_b64 s[4:5], exec
772; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
773; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
774; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
775; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
776; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
777; GFX8-NEXT:    s_cbranch_execz BB4_2
778; GFX8-NEXT:  ; %bb.1:
779; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
780; GFX8-NEXT:    s_mul_i32 s4, s4, 5
781; GFX8-NEXT:    v_mov_b32_e32 v1, s4
782; GFX8-NEXT:    v_mov_b32_e32 v2, 0
783; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
784; GFX8-NEXT:    s_mov_b32 m0, -1
785; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
787; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX8-NEXT:  BB4_2:
789; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
790; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
792; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
793; GFX8-NEXT:    v_mov_b32_e32 v1, s2
794; GFX8-NEXT:    v_mov_b32_e32 v2, s3
795; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
796; GFX8-NEXT:    s_mov_b32 s3, 0xf000
797; GFX8-NEXT:    s_mov_b32 s2, -1
798; GFX8-NEXT:    s_nop 2
799; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
800; GFX8-NEXT:    s_endpgm
801;
802; GFX9-LABEL: add_i64_constant:
803; GFX9:       ; %bb.0: ; %entry
804; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
805; GFX9-NEXT:    s_mov_b64 s[4:5], exec
806; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
807; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
808; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
809; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
810; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
811; GFX9-NEXT:    s_cbranch_execz BB4_2
812; GFX9-NEXT:  ; %bb.1:
813; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
814; GFX9-NEXT:    s_mul_i32 s4, s4, 5
815; GFX9-NEXT:    v_mov_b32_e32 v1, s4
816; GFX9-NEXT:    v_mov_b32_e32 v2, 0
817; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
818; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
820; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX9-NEXT:  BB4_2:
822; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
823; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
824; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
825; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
826; GFX9-NEXT:    v_mov_b32_e32 v1, s2
827; GFX9-NEXT:    v_mov_b32_e32 v2, s3
828; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
829; GFX9-NEXT:    s_mov_b32 s3, 0xf000
830; GFX9-NEXT:    s_mov_b32 s2, -1
831; GFX9-NEXT:    s_nop 2
832; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
833; GFX9-NEXT:    s_endpgm
834;
835; GFX1064-LABEL: add_i64_constant:
836; GFX1064:       ; %bb.0: ; %entry
837; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
838; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
839; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
840; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
841; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
842; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
843; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
844; GFX1064-NEXT:    s_cbranch_execz BB4_2
845; GFX1064-NEXT:  ; %bb.1:
846; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
847; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
848; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
849; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
850; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
851; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
852; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
853; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
854; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX1064-NEXT:    buffer_gl0_inv
856; GFX1064-NEXT:  BB4_2:
857; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
858; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
859; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
860; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
861; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
862; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
863; GFX1064-NEXT:    s_mov_b32 s2, -1
864; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
865; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
866; GFX1064-NEXT:    s_endpgm
867;
868; GFX1032-LABEL: add_i64_constant:
869; GFX1032:       ; %bb.0: ; %entry
870; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
871; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
872; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
873; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
874; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
875; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
876; GFX1032-NEXT:    s_cbranch_execz BB4_2
877; GFX1032-NEXT:  ; %bb.1:
878; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
879; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
880; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
881; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
882; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
883; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
884; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
885; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
886; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX1032-NEXT:    buffer_gl0_inv
888; GFX1032-NEXT:  BB4_2:
889; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
890; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
891; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
892; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
893; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
894; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
895; GFX1032-NEXT:    s_mov_b32 s2, -1
896; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
897; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
898; GFX1032-NEXT:    s_endpgm
899entry:
  ; The pre-add value returned by the atomic is stored to %out, so the
  ; expected code must rebuild a per-lane return value.
900  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
901  store i64 %old, i64 addrspace(1)* %out
902  ret void
903}
904
; add_i64_uniform: 64-bit atomicrmw add of the kernel argument %additive to
; the addrspace(3) global @local_var64 (acq_rel ordering); the returned old
; value is stored to %out.
;
; In every configuration checked below, only the lane whose v_mbcnt result is
; 0 executes %bb.1 (guarded by s_and_saveexec). There the 64-bit value held in
; s[2:3] is multiplied by the s_bcnt1 count of the saved exec mask (mul_lo /
; mul_hi pieces plus an add) and one ds_add_rtn_u64 is issued. After exec is
; restored, each lane multiplies s[2:3] by its own v_mbcnt value and adds the
; product to the v_readfirstlane result with an add / add-with-carry pair.
; s[2:3] is presumably %additive as loaded by the s_load_dwordx4; the kernel
; argument layout is not spelled out in this file.
;
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
905define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
906;
907;
908; GFX7LESS-LABEL: add_i64_uniform:
909; GFX7LESS:       ; %bb.0: ; %entry
910; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
911; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
912; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
913; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
914; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
915; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
916; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
917; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
918; GFX7LESS-NEXT:  ; %bb.1:
919; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
920; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
921; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
922; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
923; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
924; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
925; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
926; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
927; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
928; GFX7LESS-NEXT:    s_mov_b32 m0, -1
929; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
931; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX7LESS-NEXT:  BB5_2:
933; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
934; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
935; GFX7LESS-NEXT:    s_mov_b32 s6, -1
936; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX7LESS-NEXT:    s_mov_b32 s4, s0
938; GFX7LESS-NEXT:    s_mov_b32 s5, s1
939; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
940; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
941; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
942; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
943; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
944; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
945; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
946; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
947; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
948; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
949; GFX7LESS-NEXT:    s_endpgm
950;
951; GFX8-LABEL: add_i64_uniform:
952; GFX8:       ; %bb.0: ; %entry
953; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
954; GFX8-NEXT:    s_mov_b64 s[6:7], exec
955; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
956; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
957; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
958; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
959; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
960; GFX8-NEXT:    s_cbranch_execz BB5_2
961; GFX8-NEXT:  ; %bb.1:
962; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
963; GFX8-NEXT:    v_mov_b32_e32 v1, s6
964; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
965; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
966; GFX8-NEXT:    s_mul_i32 s7, s3, s6
967; GFX8-NEXT:    s_mul_i32 s6, s2, s6
968; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
969; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
970; GFX8-NEXT:    v_mov_b32_e32 v1, s6
971; GFX8-NEXT:    s_mov_b32 m0, -1
972; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
974; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
975; GFX8-NEXT:  BB5_2:
976; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
977; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX8-NEXT:    s_mov_b32 s4, s0
979; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
980; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
981; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
982; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
983; GFX8-NEXT:    s_mov_b32 s5, s1
984; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
985; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
986; GFX8-NEXT:    v_mov_b32_e32 v2, s1
987; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
988; GFX8-NEXT:    s_mov_b32 s7, 0xf000
989; GFX8-NEXT:    s_mov_b32 s6, -1
990; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
991; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
992; GFX8-NEXT:    s_endpgm
993;
994; GFX9-LABEL: add_i64_uniform:
995; GFX9:       ; %bb.0: ; %entry
996; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
997; GFX9-NEXT:    s_mov_b64 s[6:7], exec
998; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
999; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1000; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1001; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1002; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1003; GFX9-NEXT:    s_cbranch_execz BB5_2
1004; GFX9-NEXT:  ; %bb.1:
1005; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1006; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1008; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1009; GFX9-NEXT:    s_add_i32 s8, s8, s7
1010; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1011; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1012; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1013; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1014; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1016; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX9-NEXT:  BB5_2:
1018; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1021; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1022; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1023; GFX9-NEXT:    s_mov_b32 s4, s0
1024; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1025; GFX9-NEXT:    s_mov_b32 s5, s1
1026; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1027; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1028; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1029; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1030; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1031; GFX9-NEXT:    s_mov_b32 s6, -1
1032; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1033; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1034; GFX9-NEXT:    s_endpgm
1035;
1036; GFX1064-LABEL: add_i64_uniform:
1037; GFX1064:       ; %bb.0: ; %entry
1038; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1039; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1040; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1041; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1042; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1043; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1044; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1045; GFX1064-NEXT:    s_cbranch_execz BB5_2
1046; GFX1064-NEXT:  ; %bb.1:
1047; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1048; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1049; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1050; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1051; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1052; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1053; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1054; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1055; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1056; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1057; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1058; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1059; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX1064-NEXT:    buffer_gl0_inv
1061; GFX1064-NEXT:  BB5_2:
1062; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1063; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1064; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1066; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1067; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1068; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1069; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1070; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1071; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1072; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s2, v0
1073; GFX1064-NEXT:    s_mov_b32 s2, -1
1074; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1075; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1076; GFX1064-NEXT:    s_endpgm
1077;
1078; GFX1032-LABEL: add_i64_uniform:
1079; GFX1032:       ; %bb.0: ; %entry
1080; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1081; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1082; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1083; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1084; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1085; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1086; GFX1032-NEXT:    s_cbranch_execz BB5_2
1087; GFX1032-NEXT:  ; %bb.1:
1088; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1089; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1090; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1092; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1093; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1094; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1095; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1096; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1097; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1099; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1100; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX1032-NEXT:    buffer_gl0_inv
1102; GFX1032-NEXT:  BB5_2:
1103; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1104; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1105; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1107; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1108; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1109; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1110; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1111; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1112; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1113; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
1114; GFX1032-NEXT:    s_mov_b32 s2, -1
1115; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1116; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1117; GFX1032-NEXT:    s_endpgm
1118entry:
  ; The operand is a kernel argument rather than a literal, and the pre-add
  ; value returned by the atomic is stored to %out.
1119  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1120  store i64 %old, i64 addrspace(1)* %out
1121  ret void
1122}
1123
; add_i64_varying: 64-bit atomicrmw add on the addrspace(3) global
; @local_var64 (acq_rel ordering) whose operand is the zero-extended
; llvm.amdgcn.workitem.id.x value (the "varying" operand the name refers
; to); the returned old value is stored to %out.
;
; The expected output for every configuration contains no v_mbcnt /
; s_and_saveexec / v_readfirstlane sequence: ds_add_rtn_u64 is issued
; directly on v[0:1], with v1 set to 0 for the high half. These checks
; therefore record that no single-lane reduction is emitted for this case.
; Both gfx1010 configurations share a single set of checks here.
;
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
1124define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1125;
1126;
1127; GFX7LESS-LABEL: add_i64_varying:
1128; GFX7LESS:       ; %bb.0: ; %entry
1129; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1130; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1131; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1132; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1133; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1135; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1137; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1138; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1139; GFX7LESS-NEXT:    s_endpgm
1140;
1141; GFX8-LABEL: add_i64_varying:
1142; GFX8:       ; %bb.0: ; %entry
1143; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1144; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1145; GFX8-NEXT:    s_mov_b32 m0, -1
1146; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1147; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1148; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1149; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1151; GFX8-NEXT:    s_mov_b32 s2, -1
1152; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1153; GFX8-NEXT:    s_endpgm
1154;
1155; GFX9-LABEL: add_i64_varying:
1156; GFX9:       ; %bb.0: ; %entry
1157; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1158; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1159; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1160; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1164; GFX9-NEXT:    s_mov_b32 s2, -1
1165; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1166; GFX9-NEXT:    s_endpgm
1167;
1168; GFX10-LABEL: add_i64_varying:
1169; GFX10:       ; %bb.0: ; %entry
1170; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1171; GFX10-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1172; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1173; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1174; GFX10-NEXT:    s_mov_b32 s2, -1
1175; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1176; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1177; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1178; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX10-NEXT:    buffer_gl0_inv
1180; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1181; GFX10-NEXT:    s_endpgm
1182entry:
  ; The i32 workitem id is widened to i64 before being used as the operand.
1183  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1184  %zext = zext i32 %lane to i64
1185  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1186  store i64 %old, i64 addrspace(1)* %out
1187  ret void
1188}
1189
; sub_i32_constant: 32-bit atomicrmw sub of the constant 5 from the
; addrspace(3) global @local_var32 (acq_rel ordering); the returned old
; value is stored to %out.
;
; Every configuration checked below has the same shape: only the lane whose
; v_mbcnt result is 0 executes %bb.1 (guarded by s_and_saveexec), where the
; s_bcnt1 count of the saved exec mask is multiplied by 5 and one
; ds_sub_rtn_u32 is issued. After exec is restored, the result is read with
; v_readfirstlane and each lane subtracts 5 times its own v_mbcnt value
; (v_mul_u32_u24 followed by a v_sub) before the buffer store. The wave32
; run uses only v_mbcnt_lo and s_bcnt1_i32_b32.
;
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
1190define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1191;
1192;
1193; GFX7LESS-LABEL: sub_i32_constant:
1194; GFX7LESS:       ; %bb.0: ; %entry
1195; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1196; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1197; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1198; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1199; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1200; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1201; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1202; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
1203; GFX7LESS-NEXT:  ; %bb.1:
1204; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1205; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1206; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1207; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1208; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1209; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1211; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX7LESS-NEXT:  BB7_2:
1213; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1214; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1216; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1217; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1218; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1219; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1220; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1221; GFX7LESS-NEXT:    s_endpgm
1222;
1223; GFX8-LABEL: sub_i32_constant:
1224; GFX8:       ; %bb.0: ; %entry
1225; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1226; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1227; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1228; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1229; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1230; GFX8-NEXT:    ; implicit-def: $vgpr1
1231; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1232; GFX8-NEXT:    s_cbranch_execz BB7_2
1233; GFX8-NEXT:  ; %bb.1:
1234; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1235; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1236; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1237; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1238; GFX8-NEXT:    s_mov_b32 m0, -1
1239; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1240; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1241; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX8-NEXT:  BB7_2:
1243; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1244; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1246; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1247; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1248; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1249; GFX8-NEXT:    s_mov_b32 s2, -1
1250; GFX8-NEXT:    s_nop 0
1251; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1252; GFX8-NEXT:    s_endpgm
1253;
1254; GFX9-LABEL: sub_i32_constant:
1255; GFX9:       ; %bb.0: ; %entry
1256; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1257; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1258; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1259; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1260; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1261; GFX9-NEXT:    ; implicit-def: $vgpr1
1262; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1263; GFX9-NEXT:    s_cbranch_execz BB7_2
1264; GFX9-NEXT:  ; %bb.1:
1265; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1266; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1267; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1268; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1269; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1271; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX9-NEXT:  BB7_2:
1273; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1274; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1276; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1277; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1278; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1279; GFX9-NEXT:    s_mov_b32 s2, -1
1280; GFX9-NEXT:    s_nop 0
1281; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1282; GFX9-NEXT:    s_endpgm
1283;
1284; GFX1064-LABEL: sub_i32_constant:
1285; GFX1064:       ; %bb.0: ; %entry
1286; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1287; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1288; GFX1064-NEXT:    ; implicit-def: $vgpr1
1289; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1290; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1291; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1292; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1293; GFX1064-NEXT:    s_cbranch_execz BB7_2
1294; GFX1064-NEXT:  ; %bb.1:
1295; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1296; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1297; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1298; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1299; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1300; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1301; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1302; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX1064-NEXT:    buffer_gl0_inv
1304; GFX1064-NEXT:  BB7_2:
1305; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1306; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1307; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1308; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1309; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1310; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1311; GFX1064-NEXT:    s_mov_b32 s2, -1
1312; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1313; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1314; GFX1064-NEXT:    s_endpgm
1315;
1316; GFX1032-LABEL: sub_i32_constant:
1317; GFX1032:       ; %bb.0: ; %entry
1318; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1319; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1320; GFX1032-NEXT:    ; implicit-def: $vgpr1
1321; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1322; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1323; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1324; GFX1032-NEXT:    s_cbranch_execz BB7_2
1325; GFX1032-NEXT:  ; %bb.1:
1326; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1327; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1328; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1329; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1330; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1331; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1332; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1333; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1334; GFX1032-NEXT:    buffer_gl0_inv
1335; GFX1032-NEXT:  BB7_2:
1336; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1337; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1338; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1339; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1340; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1341; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1342; GFX1032-NEXT:    s_mov_b32 s2, -1
1343; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1345; GFX1032-NEXT:    s_endpgm
1346entry:
  ; The pre-sub value returned by the atomic is stored to %out, so the
  ; expected code must rebuild a per-lane return value.
1347  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1348  store i32 %old, i32 addrspace(1)* %out
1349  ret void
1350}
1351
1352define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1353;
1354;
1355; GFX7LESS-LABEL: sub_i32_uniform:
1356; GFX7LESS:       ; %bb.0: ; %entry
1357; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1358; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1359; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
1360; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1361; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1362; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1363; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1364; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1365; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1366; GFX7LESS-NEXT:  ; %bb.1:
1367; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1368; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1369; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1370; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1371; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1372; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1373; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1374; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1375; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX7LESS-NEXT:  BB8_2:
1377; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1378; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1380; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1381; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1382; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1383; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1384; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1385; GFX7LESS-NEXT:    s_endpgm
1386;
1387; GFX8-LABEL: sub_i32_uniform:
1388; GFX8:       ; %bb.0: ; %entry
1389; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1390; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1391; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1392; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1393; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1394; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1395; GFX8-NEXT:    ; implicit-def: $vgpr1
1396; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1397; GFX8-NEXT:    s_cbranch_execz BB8_2
1398; GFX8-NEXT:  ; %bb.1:
1399; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1400; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1401; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1402; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1403; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1404; GFX8-NEXT:    s_mov_b32 m0, -1
1405; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1406; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1407; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1408; GFX8-NEXT:  BB8_2:
1409; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1410; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1412; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1413; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1414; GFX8-NEXT:    s_mov_b32 s6, -1
1415; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1416; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1417; GFX8-NEXT:    s_endpgm
1418;
1419; GFX9-LABEL: sub_i32_uniform:
1420; GFX9:       ; %bb.0: ; %entry
1421; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1422; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1423; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1424; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1425; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1426; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1427; GFX9-NEXT:    ; implicit-def: $vgpr1
1428; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1429; GFX9-NEXT:    s_cbranch_execz BB8_2
1430; GFX9-NEXT:  ; %bb.1:
1431; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1434; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1435; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1438; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX9-NEXT:  BB8_2:
1440; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1441; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1442; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1443; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1444; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1445; GFX9-NEXT:    s_mov_b32 s6, -1
1446; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1447; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1448; GFX9-NEXT:    s_endpgm
1449;
1450; GFX1064-LABEL: sub_i32_uniform:
1451; GFX1064:       ; %bb.0: ; %entry
1452; GFX1064-NEXT:    s_clause 0x1
1453; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1454; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
1455; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1456; GFX1064-NEXT:    ; implicit-def: $vgpr1
1457; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1458; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1459; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1460; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1461; GFX1064-NEXT:    s_cbranch_execz BB8_2
1462; GFX1064-NEXT:  ; %bb.1:
1463; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1464; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1465; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1466; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
1467; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
1468; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1469; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1470; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1471; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX1064-NEXT:    buffer_gl0_inv
1473; GFX1064-NEXT:  BB8_2:
1474; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1475; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1476; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1478; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1479; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1480; GFX1064-NEXT:    s_mov_b32 s6, -1
1481; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1482; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1483; GFX1064-NEXT:    s_endpgm
1484;
1485; GFX1032-LABEL: sub_i32_uniform:
1486; GFX1032:       ; %bb.0: ; %entry
1487; GFX1032-NEXT:    s_clause 0x1
1488; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1489; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1490; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1491; GFX1032-NEXT:    ; implicit-def: $vgpr1
1492; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1493; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1494; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1495; GFX1032-NEXT:    s_cbranch_execz BB8_2
1496; GFX1032-NEXT:  ; %bb.1:
1497; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1498; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1499; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1500; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1501; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1502; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1503; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1504; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1505; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1506; GFX1032-NEXT:    buffer_gl0_inv
1507; GFX1032-NEXT:  BB8_2:
1508; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1509; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1510; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1511; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1512; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1513; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1514; GFX1032-NEXT:    s_mov_b32 s6, -1
1515; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1516; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1517; GFX1032-NEXT:    s_endpgm
1518entry:
1519  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1520  store i32 %old, i32 addrspace(1)* %out
1521  ret void
1522}
1523
1524define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1525; Every lane subtracts its own workitem id (the "varying" case) from @local_var32 and stores the returned old value to %out.
1526; GFX7LESS checks keep a plain per-lane ds_sub_rtn_u32; GFX8 and newer expect a DPP add chain and one guarded ds_sub_rtn_u32.
1527; GFX7LESS-LABEL: sub_i32_varying:
1528; GFX7LESS:       ; %bb.0: ; %entry
1529; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1530; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1531; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1532; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1533; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1534; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1535; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1536; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1537; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1538; GFX7LESS-NEXT:    s_endpgm
1539;
1540; GFX8-LABEL: sub_i32_varying:
1541; GFX8:       ; %bb.0: ; %entry
1542; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1543; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1544; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1545; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1546; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1547; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1548; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1549; GFX8-NEXT:    s_not_b64 exec, exec
1550; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1551; GFX8-NEXT:    s_not_b64 exec, exec
1552; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1553; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1554; GFX8-NEXT:    s_nop 1
1555; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1556; GFX8-NEXT:    s_nop 1
1557; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1558; GFX8-NEXT:    s_nop 1
1559; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1560; GFX8-NEXT:    s_nop 1
1561; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1562; GFX8-NEXT:    s_nop 1
1563; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1564; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1565; GFX8-NEXT:    s_nop 0
1566; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1567; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1568; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1569; GFX8-NEXT:    ; implicit-def: $vgpr0
1570; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1571; GFX8-NEXT:    s_cbranch_execz BB9_2
1572; GFX8-NEXT:  ; %bb.1:
1573; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1574; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1575; GFX8-NEXT:    s_mov_b32 m0, -1
1576; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1577; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1578; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1579; GFX8-NEXT:  BB9_2:
1580; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1581; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1582; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1583; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1584; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1585; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1586; GFX8-NEXT:    s_mov_b32 s2, -1
1587; GFX8-NEXT:    s_nop 0
1588; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1589; GFX8-NEXT:    s_endpgm
1590;
1591; GFX9-LABEL: sub_i32_varying:
1592; GFX9:       ; %bb.0: ; %entry
1593; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1594; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1595; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1596; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1597; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1598; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1599; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1600; GFX9-NEXT:    s_not_b64 exec, exec
1601; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1602; GFX9-NEXT:    s_not_b64 exec, exec
1603; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1604; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1605; GFX9-NEXT:    s_nop 1
1606; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1607; GFX9-NEXT:    s_nop 1
1608; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1609; GFX9-NEXT:    s_nop 1
1610; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1611; GFX9-NEXT:    s_nop 1
1612; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1613; GFX9-NEXT:    s_nop 1
1614; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1615; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1616; GFX9-NEXT:    s_nop 0
1617; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1618; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1619; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1620; GFX9-NEXT:    ; implicit-def: $vgpr0
1621; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1622; GFX9-NEXT:    s_cbranch_execz BB9_2
1623; GFX9-NEXT:  ; %bb.1:
1624; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1625; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1626; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1627; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1628; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1629; GFX9-NEXT:  BB9_2:
1630; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1631; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1633; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1634; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1635; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1636; GFX9-NEXT:    s_mov_b32 s2, -1
1637; GFX9-NEXT:    s_nop 0
1638; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1639; GFX9-NEXT:    s_endpgm
1640;
1641; GFX1064-LABEL: sub_i32_varying:
1642; GFX1064:       ; %bb.0: ; %entry
1643; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1644; GFX1064-NEXT:    s_not_b64 exec, exec
1645; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1646; GFX1064-NEXT:    s_not_b64 exec, exec
1647; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1648; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1649; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1650; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1651; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1652; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1653; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1654; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1655; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1656; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1657; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1658; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1659; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
1660; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1661; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1662; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1663; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1664; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
1665; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
1666; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1667; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1668; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1669; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
1670; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
1671; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
1672; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1673; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1674; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1675; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
1676; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1677; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1678; GFX1064-NEXT:    s_mov_b32 s2, -1
1679; GFX1064-NEXT:    ; implicit-def: $vgpr0
1680; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1681; GFX1064-NEXT:    s_cbranch_execz BB9_2
1682; GFX1064-NEXT:  ; %bb.1:
1683; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1684; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
1685; GFX1064-NEXT:    s_mov_b32 s3, s7
1686; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1687; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1688; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
1689; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1690; GFX1064-NEXT:    buffer_gl0_inv
1691; GFX1064-NEXT:  BB9_2:
1692; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1693; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1694; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1695; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1696; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1697; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1698; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1700; GFX1064-NEXT:    s_endpgm
1701;
1702; GFX1032-LABEL: sub_i32_varying:
1703; GFX1032:       ; %bb.0: ; %entry
1704; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1705; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1706; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1707; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1708; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1709; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1710; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1711; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1712; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1713; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1714; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1715; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1716; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1717; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1718; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1719; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1720; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1721; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1722; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1723; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1724; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1725; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1726; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1727; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1728; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1729; GFX1032-NEXT:    s_mov_b32 s2, -1
1730; GFX1032-NEXT:    ; implicit-def: $vgpr0
1731; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1732; GFX1032-NEXT:    s_cbranch_execz BB9_2
1733; GFX1032-NEXT:  ; %bb.1:
1734; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1735; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1736; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1737; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1738; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
1739; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX1032-NEXT:    buffer_gl0_inv
1741; GFX1032-NEXT:  BB9_2:
1742; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1743; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1744; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1745; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1746; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1747; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1748; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1749; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1750; GFX1032-NEXT:    s_endpgm
1751entry:
1752  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1753  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1754  store i32 %old, i32 addrspace(1)* %out
1755  ret void
1756}
1757
1758define amdgpu_kernel void @sub_i32_varying_nouse() {
; The atomicrmw result (%old) has no users here, so every check prefix expects the non-returning ds_sub_u32 form.
; GFX8 and newer additionally expect a DPP add sequence, with the ds_sub_u32 guarded by the mbcnt == 0 test.
1759; GFX7LESS-LABEL: sub_i32_varying_nouse:
1760; GFX7LESS:       ; %bb.0: ; %entry
1761; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1762; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1763; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1764; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
1765; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX7LESS-NEXT:    s_endpgm
1767;
1768; GFX8-LABEL: sub_i32_varying_nouse:
1769; GFX8:       ; %bb.0: ; %entry
1770; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1771; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1772; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1773; GFX8-NEXT:    s_not_b64 exec, exec
1774; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1775; GFX8-NEXT:    s_not_b64 exec, exec
1776; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
1777; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1778; GFX8-NEXT:    s_nop 1
1779; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1780; GFX8-NEXT:    s_nop 1
1781; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1782; GFX8-NEXT:    s_nop 1
1783; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1784; GFX8-NEXT:    s_nop 1
1785; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1786; GFX8-NEXT:    s_nop 1
1787; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1788; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
1789; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
1790; GFX8-NEXT:    s_mov_b32 s0, s2
1791; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1792; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1793; GFX8-NEXT:    s_cbranch_execz BB10_2
1794; GFX8-NEXT:  ; %bb.1:
1795; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1796; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1797; GFX8-NEXT:    s_mov_b32 m0, -1
1798; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX8-NEXT:    ds_sub_u32 v0, v2
1800; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1801; GFX8-NEXT:  BB10_2:
1802; GFX8-NEXT:    s_endpgm
1803;
1804; GFX9-LABEL: sub_i32_varying_nouse:
1805; GFX9:       ; %bb.0: ; %entry
1806; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1807; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1808; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1809; GFX9-NEXT:    s_not_b64 exec, exec
1810; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1811; GFX9-NEXT:    s_not_b64 exec, exec
1812; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
1813; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1814; GFX9-NEXT:    s_nop 1
1815; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1816; GFX9-NEXT:    s_nop 1
1817; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1818; GFX9-NEXT:    s_nop 1
1819; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1820; GFX9-NEXT:    s_nop 1
1821; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1822; GFX9-NEXT:    s_nop 1
1823; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1824; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
1825; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
1826; GFX9-NEXT:    s_mov_b32 s0, s2
1827; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1828; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1829; GFX9-NEXT:    s_cbranch_execz BB10_2
1830; GFX9-NEXT:  ; %bb.1:
1831; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1832; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1833; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1834; GFX9-NEXT:    ds_sub_u32 v0, v2
1835; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1836; GFX9-NEXT:  BB10_2:
1837; GFX9-NEXT:    s_endpgm
1838;
1839; GFX1064-LABEL: sub_i32_varying_nouse:
1840; GFX1064:       ; %bb.0: ; %entry
1841; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1842; GFX1064-NEXT:    s_not_b64 exec, exec
1843; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1844; GFX1064-NEXT:    s_not_b64 exec, exec
1845; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1846; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1847; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1848; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1849; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1850; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1851; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1852; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1853; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1854; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1855; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1856; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
1857; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
1858; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1859; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1860; GFX1064-NEXT:    s_add_i32 s0, s2, s3
1861; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1862; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1863; GFX1064-NEXT:    s_cbranch_execz BB10_2
1864; GFX1064-NEXT:  ; %bb.1:
1865; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1866; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
1867; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1868; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1869; GFX1064-NEXT:    ds_sub_u32 v0, v3
1870; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX1064-NEXT:    buffer_gl0_inv
1872; GFX1064-NEXT:  BB10_2:
1873; GFX1064-NEXT:    s_endpgm
1874;
1875; GFX1032-LABEL: sub_i32_varying_nouse:
1876; GFX1032:       ; %bb.0: ; %entry
1877; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1878; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1879; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1880; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1881; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1882; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1883; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1884; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1885; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1886; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1887; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1888; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1889; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1890; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1891; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
1892; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1893; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1894; GFX1032-NEXT:    s_cbranch_execz BB10_2
1895; GFX1032-NEXT:  ; %bb.1:
1896; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var32@abs32@lo
1897; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1898; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1899; GFX1032-NEXT:    ds_sub_u32 v3, v0
1900; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1901; GFX1032-NEXT:    buffer_gl0_inv
1902; GFX1032-NEXT:  BB10_2:
1903; GFX1032-NEXT:    s_endpgm
1904entry:
1905  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1906  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1907  ret void
1908}
1909
1910define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
1911; 64-bit case with the constant operand 5: the old value returned by the atomic sub on @local_var64 is stored to %out.
1912; Checks expect 5 * s_bcnt1(saved exec) fed to one guarded ds_sub_rtn_u64; each lane then subtracts 5 * mbcnt from the readfirstlane result.
1913; GFX7LESS-LABEL: sub_i64_constant:
1914; GFX7LESS:       ; %bb.0: ; %entry
1915; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1916; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1917; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1918; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1919; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1920; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1921; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1922; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
1923; GFX7LESS-NEXT:  ; %bb.1:
1924; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1925; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
1926; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1927; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1928; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
1929; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1930; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1931; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
1932; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1933; GFX7LESS-NEXT:  BB11_2:
1934; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1935; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1936; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1937; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1938; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1939; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1940; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1941; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1942; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1943; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1944; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1945; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1946; GFX7LESS-NEXT:    s_endpgm
1947;
1948; GFX8-LABEL: sub_i64_constant:
1949; GFX8:       ; %bb.0: ; %entry
1950; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1951; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1952; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1953; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1954; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1955; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1956; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1957; GFX8-NEXT:    s_cbranch_execz BB11_2
1958; GFX8-NEXT:  ; %bb.1:
1959; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1960; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1961; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1962; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1963; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1964; GFX8-NEXT:    s_mov_b32 m0, -1
1965; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1966; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
1967; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX8-NEXT:  BB11_2:
1969; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1970; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1972; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1973; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1974; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1975; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1976; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1977; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1978; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1979; GFX8-NEXT:    s_mov_b32 s2, -1
1980; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1981; GFX8-NEXT:    s_endpgm
1982;
1983; GFX9-LABEL: sub_i64_constant:
1984; GFX9:       ; %bb.0: ; %entry
1985; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1986; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1987; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1988; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1989; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1990; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1991; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1992; GFX9-NEXT:    s_cbranch_execz BB11_2
1993; GFX9-NEXT:  ; %bb.1:
1994; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1995; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1996; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1997; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1998; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1999; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2000; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2001; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2002; GFX9-NEXT:  BB11_2:
2003; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2004; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2005; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2006; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2007; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2008; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2009; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2010; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2011; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2012; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2013; GFX9-NEXT:    s_mov_b32 s2, -1
2014; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2015; GFX9-NEXT:    s_endpgm
2016;
2017; GFX1064-LABEL: sub_i64_constant:
2018; GFX1064:       ; %bb.0: ; %entry
2019; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2020; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2021; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2022; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2023; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2024; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2025; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2026; GFX1064-NEXT:    s_cbranch_execz BB11_2
2027; GFX1064-NEXT:  ; %bb.1:
2028; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2029; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2030; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2031; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2032; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
2033; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2034; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2035; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2036; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX1064-NEXT:    buffer_gl0_inv
2038; GFX1064-NEXT:  BB11_2:
2039; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2040; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2041; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2042; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2043; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2044; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2045; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
2046; GFX1064-NEXT:    s_mov_b32 s2, -1
2047; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2048; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2049; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2050; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2051; GFX1064-NEXT:    s_endpgm
2052;
2053; GFX1032-LABEL: sub_i64_constant:
2054; GFX1032:       ; %bb.0: ; %entry
2055; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2056; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2057; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2058; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
2059; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2060; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2061; GFX1032-NEXT:    s_cbranch_execz BB11_2
2062; GFX1032-NEXT:  ; %bb.1:
2063; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2064; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2065; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2066; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2067; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
2068; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2069; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2070; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2071; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2072; GFX1032-NEXT:    buffer_gl0_inv
2073; GFX1032-NEXT:  BB11_2:
2074; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2075; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2076; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2077; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2078; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2079; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2080; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
2081; GFX1032-NEXT:    s_mov_b32 s2, -1
2082; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2083; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2084; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2086; GFX1032-NEXT:    s_endpgm
2087entry:
2088  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2089  store i64 %old, i64 addrspace(1)* %out
2090  ret void
2091}
2092
; 64-bit atomicrmw sub on the LDS variable @local_var64 where the operand
; %subitive is a kernel argument (loaded into SGPRs in the output below), so
; all lanes supply the same value. The previous memory value is stored to %out.
;
; In the expected output every configuration follows the same scheme:
;   * the lane whose mbcnt result is 0 is elected to perform the atomic,
;   * the operand is multiplied by the number of set bits in exec (s_bcnt1),
;   * a single ds_sub_rtn_u64 is issued with that product,
;   * each lane then rebuilds its own return value as
;     readfirstlane(old) - operand * mbcnt.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2093define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2094;
2095;
2096; GFX7LESS-LABEL: sub_i64_uniform:
2097; GFX7LESS:       ; %bb.0: ; %entry
2098; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2099; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2100; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2101; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2102; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2103; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2104; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2105; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2106; GFX7LESS-NEXT:  ; %bb.1:
2107; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2108; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2109; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2110; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2111; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2112; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2113; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2114; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2115; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2116; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2117; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2119; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX7LESS-NEXT:  BB12_2:
2121; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2122; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2123; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2124; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2125; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2126; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2127; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2128; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2129; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2130; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2131; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2132; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2133; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2134; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2135; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2136; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2137; GFX7LESS-NEXT:    s_endpgm
2138;
2139; GFX8-LABEL: sub_i64_uniform:
2140; GFX8:       ; %bb.0: ; %entry
2141; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2142; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2143; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2144; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2145; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2146; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2147; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2148; GFX8-NEXT:    s_cbranch_execz BB12_2
2149; GFX8-NEXT:  ; %bb.1:
2150; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2151; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2152; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2153; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2154; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2155; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2156; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2157; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2158; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2159; GFX8-NEXT:    s_mov_b32 m0, -1
2160; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2161; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2162; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2163; GFX8-NEXT:  BB12_2:
2164; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2165; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2166; GFX8-NEXT:    s_mov_b32 s4, s0
2167; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2168; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2169; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2170; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2171; GFX8-NEXT:    s_mov_b32 s5, s1
2172; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2173; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2174; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2175; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2176; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2177; GFX8-NEXT:    s_mov_b32 s6, -1
2178; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2179; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2180; GFX8-NEXT:    s_endpgm
2181;
2182; GFX9-LABEL: sub_i64_uniform:
2183; GFX9:       ; %bb.0: ; %entry
2184; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2185; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2186; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2187; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2188; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2189; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2190; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2191; GFX9-NEXT:    s_cbranch_execz BB12_2
2192; GFX9-NEXT:  ; %bb.1:
2193; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2194; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2195; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2196; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2197; GFX9-NEXT:    s_add_i32 s8, s8, s7
2198; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2199; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2200; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2201; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2204; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2205; GFX9-NEXT:  BB12_2:
2206; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2209; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2210; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2211; GFX9-NEXT:    s_mov_b32 s4, s0
2212; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2213; GFX9-NEXT:    s_mov_b32 s5, s1
2214; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2215; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2216; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2217; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2218; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2219; GFX9-NEXT:    s_mov_b32 s6, -1
2220; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2221; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2222; GFX9-NEXT:    s_endpgm
2223;
2224; GFX1064-LABEL: sub_i64_uniform:
2225; GFX1064:       ; %bb.0: ; %entry
2226; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2227; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2228; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2229; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2230; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2231; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2232; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2233; GFX1064-NEXT:    s_cbranch_execz BB12_2
2234; GFX1064-NEXT:  ; %bb.1:
2235; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2236; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2237; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2238; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2239; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2240; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2241; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2242; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2243; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2244; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2245; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2246; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2247; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2248; GFX1064-NEXT:    buffer_gl0_inv
2249; GFX1064-NEXT:  BB12_2:
2250; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2251; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2252; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2253; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2254; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2255; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2256; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2257; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2258; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2259; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2260; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2261; GFX1064-NEXT:    s_mov_b32 s2, -1
2262; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2263; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2264; GFX1064-NEXT:    s_endpgm
2265;
2266; GFX1032-LABEL: sub_i64_uniform:
2267; GFX1032:       ; %bb.0: ; %entry
2268; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2269; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2270; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2271; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
2272; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2273; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2274; GFX1032-NEXT:    s_cbranch_execz BB12_2
2275; GFX1032-NEXT:  ; %bb.1:
2276; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2277; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2278; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2279; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2280; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2281; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2282; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2283; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2284; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2285; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2286; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2287; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2288; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2289; GFX1032-NEXT:    buffer_gl0_inv
2290; GFX1032-NEXT:  BB12_2:
2291; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2292; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2293; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2294; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2295; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2296; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2297; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2298; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2299; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2300; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2301; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2302; GFX1032-NEXT:    s_mov_b32 s2, -1
2303; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2304; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2305; GFX1032-NEXT:    s_endpgm
2306entry:
  ; %subitive comes straight from the kernel arguments; %old is the value that
  ; was in @local_var64 before the subtraction.
2307  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
2308  store i64 %old, i64 addrspace(1)* %out
2309  ret void
2310}
2311
; 64-bit atomicrmw sub on the LDS variable @local_var64 where the operand is
; the zero-extended result of llvm.amdgcn.workitem.id.x, i.e. a per-lane value
; (it lives in v0 in the output below). The previous memory value is stored
; to %out.
;
; The expected output for every configuration is a plain ds_sub_rtn_u64 fed
; with the per-lane value: there is no mbcnt-based lane election and no
; cross-lane reduction here, in contrast to @and_i32_varying further down.
; Both gfx1010 wave sizes share one set of assertions for this function.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2312define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2313;
2314;
2315; GFX7LESS-LABEL: sub_i64_varying:
2316; GFX7LESS:       ; %bb.0: ; %entry
2317; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2318; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2319; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2320; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2321; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2323; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2324; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2325; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2326; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2327; GFX7LESS-NEXT:    s_endpgm
2328;
2329; GFX8-LABEL: sub_i64_varying:
2330; GFX8:       ; %bb.0: ; %entry
2331; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2332; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2333; GFX8-NEXT:    s_mov_b32 m0, -1
2334; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2336; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2337; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2338; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2339; GFX8-NEXT:    s_mov_b32 s2, -1
2340; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2341; GFX8-NEXT:    s_endpgm
2342;
2343; GFX9-LABEL: sub_i64_varying:
2344; GFX9:       ; %bb.0: ; %entry
2345; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2346; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2347; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2349; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2350; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2351; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2352; GFX9-NEXT:    s_mov_b32 s2, -1
2353; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2354; GFX9-NEXT:    s_endpgm
2355;
2356; GFX10-LABEL: sub_i64_varying:
2357; GFX10:       ; %bb.0: ; %entry
2358; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2359; GFX10-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2360; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2361; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2362; GFX10-NEXT:    s_mov_b32 s2, -1
2363; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2364; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2365; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2366; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2367; GFX10-NEXT:    buffer_gl0_inv
2368; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2369; GFX10-NEXT:    s_endpgm
2370entry:
  ; The workitem id is widened to i64 so it can serve as the 64-bit operand.
2371  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2372  %zext = zext i32 %lane to i64
2373  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
2374  store i64 %old, i64 addrspace(1)* %out
2375  ret void
2376}
2377
; 32-bit atomicrmw and on the LDS variable @local_var32 where the operand is
; the result of llvm.amdgcn.workitem.id.x, i.e. a per-lane value. The previous
; memory value is stored to %out.
;
; What the expected output shows per configuration:
;   * run without -mcpu -- a direct ds_and_rtn_b32 on the per-lane value, with
;     no lane election and no cross-lane reduction.
;   * tonga and gfx900 -- lanes outside the incoming exec mask are seeded with
;     -1, a chain of v_and_b32_dpp steps (row_shr 1/2/4/8, then row_bcast 15
;     and 31) combines the lanes, the value read from lane 63 becomes the
;     operand of a single ds_and_rtn_b32 issued by the lane whose mbcnt result
;     is 0, and finally every lane ANDs the readfirstlane of the old value
;     with its own wave_shr:1-shifted partial result.
;   * gfx1010 (wave64 and wave32) -- same scheme, but the cross-row steps use
;     v_permlanex16_b32 together with v_readlane / v_writelane instead of
;     row_bcast and wave_shr; the atomic operand is read from lane 63 (wave64)
;     or lane 31 (wave32).
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2378define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2379;
2380;
2381; GFX7LESS-LABEL: and_i32_varying:
2382; GFX7LESS:       ; %bb.0: ; %entry
2383; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2384; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2385; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2386; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2387; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2388; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2389; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2390; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2391; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2392; GFX7LESS-NEXT:    s_endpgm
2393;
2394; GFX8-LABEL: and_i32_varying:
2395; GFX8:       ; %bb.0: ; %entry
2396; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2397; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2398; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2399; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2400; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2401; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2402; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2403; GFX8-NEXT:    s_not_b64 exec, exec
2404; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2405; GFX8-NEXT:    s_not_b64 exec, exec
2406; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2407; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2408; GFX8-NEXT:    s_nop 1
2409; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2410; GFX8-NEXT:    s_nop 1
2411; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2412; GFX8-NEXT:    s_nop 1
2413; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2414; GFX8-NEXT:    s_nop 1
2415; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2416; GFX8-NEXT:    s_nop 1
2417; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2418; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2419; GFX8-NEXT:    s_nop 0
2420; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2421; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2422; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2423; GFX8-NEXT:    ; implicit-def: $vgpr0
2424; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2425; GFX8-NEXT:    s_cbranch_execz BB14_2
2426; GFX8-NEXT:  ; %bb.1:
2427; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2428; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2429; GFX8-NEXT:    s_mov_b32 m0, -1
2430; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2431; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2432; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2433; GFX8-NEXT:  BB14_2:
2434; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2435; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2436; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2437; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2438; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2439; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2440; GFX8-NEXT:    s_mov_b32 s2, -1
2441; GFX8-NEXT:    s_nop 0
2442; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2443; GFX8-NEXT:    s_endpgm
2444;
2445; GFX9-LABEL: and_i32_varying:
2446; GFX9:       ; %bb.0: ; %entry
2447; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2448; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2449; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2450; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2451; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2452; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2453; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2454; GFX9-NEXT:    s_not_b64 exec, exec
2455; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2456; GFX9-NEXT:    s_not_b64 exec, exec
2457; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2458; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2459; GFX9-NEXT:    s_nop 1
2460; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2461; GFX9-NEXT:    s_nop 1
2462; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2463; GFX9-NEXT:    s_nop 1
2464; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2465; GFX9-NEXT:    s_nop 1
2466; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2467; GFX9-NEXT:    s_nop 1
2468; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2469; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2470; GFX9-NEXT:    s_nop 0
2471; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2472; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2473; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2474; GFX9-NEXT:    ; implicit-def: $vgpr0
2475; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2476; GFX9-NEXT:    s_cbranch_execz BB14_2
2477; GFX9-NEXT:  ; %bb.1:
2478; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2479; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2483; GFX9-NEXT:  BB14_2:
2484; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2486; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2487; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2488; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2489; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2490; GFX9-NEXT:    s_mov_b32 s2, -1
2491; GFX9-NEXT:    s_nop 0
2492; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2493; GFX9-NEXT:    s_endpgm
2494;
2495; GFX1064-LABEL: and_i32_varying:
2496; GFX1064:       ; %bb.0: ; %entry
2497; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2498; GFX1064-NEXT:    s_not_b64 exec, exec
2499; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2500; GFX1064-NEXT:    s_not_b64 exec, exec
2501; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2502; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2503; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2504; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2505; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2506; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2507; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2508; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2509; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2510; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2511; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2512; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2513; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2514; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2515; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2516; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2517; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2518; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2519; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2520; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2521; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2522; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2523; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2524; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2525; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2526; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2527; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2528; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2529; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2530; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2531; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2532; GFX1064-NEXT:    s_mov_b32 s2, -1
2533; GFX1064-NEXT:    ; implicit-def: $vgpr0
2534; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2535; GFX1064-NEXT:    s_cbranch_execz BB14_2
2536; GFX1064-NEXT:  ; %bb.1:
2537; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2538; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2539; GFX1064-NEXT:    s_mov_b32 s3, s7
2540; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2541; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2542; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2543; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2544; GFX1064-NEXT:    buffer_gl0_inv
2545; GFX1064-NEXT:  BB14_2:
2546; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2547; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2548; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2549; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2550; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2551; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2552; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2554; GFX1064-NEXT:    s_endpgm
2555;
2556; GFX1032-LABEL: and_i32_varying:
2557; GFX1032:       ; %bb.0: ; %entry
2558; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2559; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2560; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2561; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2562; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2563; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2564; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2565; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2566; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2567; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2568; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2569; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2570; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2571; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2572; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2573; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2574; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2575; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2576; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2577; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2578; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2579; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2580; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2581; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2582; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2583; GFX1032-NEXT:    s_mov_b32 s2, -1
2584; GFX1032-NEXT:    ; implicit-def: $vgpr0
2585; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2586; GFX1032-NEXT:    s_cbranch_execz BB14_2
2587; GFX1032-NEXT:  ; %bb.1:
2588; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2589; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2590; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2591; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2592; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2593; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2594; GFX1032-NEXT:    buffer_gl0_inv
2595; GFX1032-NEXT:  BB14_2:
2596; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2597; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2598; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2599; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2600; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2601; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2602; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2603; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2604; GFX1032-NEXT:    s_endpgm
2605entry:
  ; The workitem id is used directly as the 32-bit operand of the atomic.
2606  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2607  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2608  store i32 %old, i32 addrspace(1)* %out
2609  ret void
2610}
2611
2612define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2613;
2614;
2615; GFX7LESS-LABEL: or_i32_varying:
2616; GFX7LESS:       ; %bb.0: ; %entry
2617; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2618; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2619; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2620; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2621; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2622; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2623; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2624; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2625; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2626; GFX7LESS-NEXT:    s_endpgm
2627;
2628; GFX8-LABEL: or_i32_varying:
2629; GFX8:       ; %bb.0: ; %entry
2630; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2631; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2632; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2633; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2634; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2635; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2636; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2637; GFX8-NEXT:    s_not_b64 exec, exec
2638; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2639; GFX8-NEXT:    s_not_b64 exec, exec
2640; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2641; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2642; GFX8-NEXT:    s_nop 1
2643; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2644; GFX8-NEXT:    s_nop 1
2645; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2646; GFX8-NEXT:    s_nop 1
2647; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2648; GFX8-NEXT:    s_nop 1
2649; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2650; GFX8-NEXT:    s_nop 1
2651; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2652; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2653; GFX8-NEXT:    s_nop 0
2654; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2655; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2656; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2657; GFX8-NEXT:    ; implicit-def: $vgpr0
2658; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2659; GFX8-NEXT:    s_cbranch_execz BB15_2
2660; GFX8-NEXT:  ; %bb.1:
2661; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2662; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2663; GFX8-NEXT:    s_mov_b32 m0, -1
2664; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2665; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2666; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2667; GFX8-NEXT:  BB15_2:
2668; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2669; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2670; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2671; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2672; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2673; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2674; GFX8-NEXT:    s_mov_b32 s2, -1
2675; GFX8-NEXT:    s_nop 0
2676; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2677; GFX8-NEXT:    s_endpgm
2678;
2679; GFX9-LABEL: or_i32_varying:
2680; GFX9:       ; %bb.0: ; %entry
2681; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2682; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2683; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2684; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2685; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2686; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2687; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2688; GFX9-NEXT:    s_not_b64 exec, exec
2689; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2690; GFX9-NEXT:    s_not_b64 exec, exec
2691; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2692; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2693; GFX9-NEXT:    s_nop 1
2694; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2695; GFX9-NEXT:    s_nop 1
2696; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2697; GFX9-NEXT:    s_nop 1
2698; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2699; GFX9-NEXT:    s_nop 1
2700; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2701; GFX9-NEXT:    s_nop 1
2702; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2703; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2704; GFX9-NEXT:    s_nop 0
2705; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2706; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2707; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2708; GFX9-NEXT:    ; implicit-def: $vgpr0
2709; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2710; GFX9-NEXT:    s_cbranch_execz BB15_2
2711; GFX9-NEXT:  ; %bb.1:
2712; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2713; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2714; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2715; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2717; GFX9-NEXT:  BB15_2:
2718; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2719; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2720; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2721; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2722; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2723; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2724; GFX9-NEXT:    s_mov_b32 s2, -1
2725; GFX9-NEXT:    s_nop 0
2726; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2727; GFX9-NEXT:    s_endpgm
2728;
2729; GFX1064-LABEL: or_i32_varying:
2730; GFX1064:       ; %bb.0: ; %entry
2731; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2732; GFX1064-NEXT:    s_not_b64 exec, exec
2733; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2734; GFX1064-NEXT:    s_not_b64 exec, exec
2735; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2736; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2737; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2738; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2739; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2740; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2741; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2742; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2743; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2744; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2745; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2746; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2747; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2748; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2749; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2750; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2751; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2752; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2753; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2754; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2755; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2756; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2757; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2758; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2759; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2760; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2761; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2762; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2763; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2764; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2765; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2766; GFX1064-NEXT:    s_mov_b32 s2, -1
2767; GFX1064-NEXT:    ; implicit-def: $vgpr0
2768; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2769; GFX1064-NEXT:    s_cbranch_execz BB15_2
2770; GFX1064-NEXT:  ; %bb.1:
2771; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2772; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2773; GFX1064-NEXT:    s_mov_b32 s3, s7
2774; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2775; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2776; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
2777; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2778; GFX1064-NEXT:    buffer_gl0_inv
2779; GFX1064-NEXT:  BB15_2:
2780; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2781; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2782; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2783; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2784; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2785; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2786; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2787; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2788; GFX1064-NEXT:    s_endpgm
2789;
2790; GFX1032-LABEL: or_i32_varying:
2791; GFX1032:       ; %bb.0: ; %entry
2792; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2793; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2794; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2795; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2796; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2797; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2798; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2799; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2800; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2801; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2802; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2803; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2804; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2805; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2806; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2807; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2808; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2809; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2810; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2811; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2812; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2813; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2814; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2815; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2816; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2817; GFX1032-NEXT:    s_mov_b32 s2, -1
2818; GFX1032-NEXT:    ; implicit-def: $vgpr0
2819; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2820; GFX1032-NEXT:    s_cbranch_execz BB15_2
2821; GFX1032-NEXT:  ; %bb.1:
2822; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2823; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2824; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2825; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2826; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
2827; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2828; GFX1032-NEXT:    buffer_gl0_inv
2829; GFX1032-NEXT:  BB15_2:
2830; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2831; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2832; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2833; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2834; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
2835; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2836; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2838; GFX1032-NEXT:    s_endpgm
2839entry:
2840  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2841  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2842  store i32 %old, i32 addrspace(1)* %out
2843  ret void
2844}
2845
; xor_i32_varying - every work-item XORs its own workitem.id.x into
; @local_var32 (addrspace(3)) with an acq_rel `atomicrmw xor` and stores the
; returned old value to %out.
;
; What the autogenerated checks below pin down
;  * GFX7LESS checks expect no wave-level rewrite, just one ds_xor_rtn_b32 fed
;    with the per-lane value in v0.
;  * GFX8 and GFX9 checks expect lanes that are inactive to be seeded with 0
;    (the v_mov between the two s_not_b64 exec), a v_xor_b32_dpp chain
;    (row_shr 1/2/4/8, then row_bcast 15 and 31), the wave total read from
;    lane 63, and a copy of the scan shifted by one lane (wave_shr:1) kept in v1.
;    Only the lane whose v_mbcnt result is 0 issues ds_xor_rtn_b32; afterwards
;    the old value is broadcast with v_readfirstlane_b32 and XORed with v1.
;  * GFX10 checks (wave64 and wave32 prefixes) expect the same structure, but
;    built from row_shr steps, v_permlanex16_b32 and v_readlane/v_writelane
;    fix-ups; the DS operand is read from lane 63 (wave64) or lane 31 (wave32).
2846define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
2847;
2848;
2849; GFX7LESS-LABEL: xor_i32_varying:
2850; GFX7LESS:       ; %bb.0: ; %entry
2851; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2852; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2853; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2854; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2855; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
2856; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2857; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2858; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2859; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2860; GFX7LESS-NEXT:    s_endpgm
2861;
2862; GFX8-LABEL: xor_i32_varying:
2863; GFX8:       ; %bb.0: ; %entry
2864; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2865; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2866; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2867; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2868; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2869; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2870; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2871; GFX8-NEXT:    s_not_b64 exec, exec
2872; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2873; GFX8-NEXT:    s_not_b64 exec, exec
2874; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2875; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2876; GFX8-NEXT:    s_nop 1
2877; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2878; GFX8-NEXT:    s_nop 1
2879; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2880; GFX8-NEXT:    s_nop 1
2881; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2882; GFX8-NEXT:    s_nop 1
2883; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2884; GFX8-NEXT:    s_nop 1
2885; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2886; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2887; GFX8-NEXT:    s_nop 0
2888; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2889; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2890; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2891; GFX8-NEXT:    ; implicit-def: $vgpr0
2892; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2893; GFX8-NEXT:    s_cbranch_execz BB16_2
2894; GFX8-NEXT:  ; %bb.1:
2895; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2896; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2897; GFX8-NEXT:    s_mov_b32 m0, -1
2898; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2899; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2900; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2901; GFX8-NEXT:  BB16_2:
2902; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2903; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2904; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2905; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2906; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
2907; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2908; GFX8-NEXT:    s_mov_b32 s2, -1
2909; GFX8-NEXT:    s_nop 0
2910; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2911; GFX8-NEXT:    s_endpgm
2912;
2913; GFX9-LABEL: xor_i32_varying:
2914; GFX9:       ; %bb.0: ; %entry
2915; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2916; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2917; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2918; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2919; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2920; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2921; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2922; GFX9-NEXT:    s_not_b64 exec, exec
2923; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2924; GFX9-NEXT:    s_not_b64 exec, exec
2925; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2926; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2927; GFX9-NEXT:    s_nop 1
2928; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2929; GFX9-NEXT:    s_nop 1
2930; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2931; GFX9-NEXT:    s_nop 1
2932; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2933; GFX9-NEXT:    s_nop 1
2934; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2935; GFX9-NEXT:    s_nop 1
2936; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2937; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2938; GFX9-NEXT:    s_nop 0
2939; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2940; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2941; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2942; GFX9-NEXT:    ; implicit-def: $vgpr0
2943; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2944; GFX9-NEXT:    s_cbranch_execz BB16_2
2945; GFX9-NEXT:  ; %bb.1:
2946; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2947; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2948; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2949; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2950; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2951; GFX9-NEXT:  BB16_2:
2952; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2953; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2954; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2955; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2956; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2957; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2958; GFX9-NEXT:    s_mov_b32 s2, -1
2959; GFX9-NEXT:    s_nop 0
2960; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2961; GFX9-NEXT:    s_endpgm
2962;
2963; GFX1064-LABEL: xor_i32_varying:
2964; GFX1064:       ; %bb.0: ; %entry
2965; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2966; GFX1064-NEXT:    s_not_b64 exec, exec
2967; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2968; GFX1064-NEXT:    s_not_b64 exec, exec
2969; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2970; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2971; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2972; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2973; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2974; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2975; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2976; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2977; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2978; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2979; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2980; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2981; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2982; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2983; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2984; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2985; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2986; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2987; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2988; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2989; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2990; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2991; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2992; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2993; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2994; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2995; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2996; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2997; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2998; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2999; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3000; GFX1064-NEXT:    s_mov_b32 s2, -1
3001; GFX1064-NEXT:    ; implicit-def: $vgpr0
3002; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3003; GFX1064-NEXT:    s_cbranch_execz BB16_2
3004; GFX1064-NEXT:  ; %bb.1:
3005; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3006; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3007; GFX1064-NEXT:    s_mov_b32 s3, s7
3008; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3009; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3010; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3011; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3012; GFX1064-NEXT:    buffer_gl0_inv
3013; GFX1064-NEXT:  BB16_2:
3014; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3015; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3016; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3017; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3018; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3019; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3020; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3021; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3022; GFX1064-NEXT:    s_endpgm
3023;
3024; GFX1032-LABEL: xor_i32_varying:
3025; GFX1032:       ; %bb.0: ; %entry
3026; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3027; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3028; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3029; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3030; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3031; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3032; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3033; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3034; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3035; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3036; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3037; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3038; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3039; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3040; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3041; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3042; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3043; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3044; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3045; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3046; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3047; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3048; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3049; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3050; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3051; GFX1032-NEXT:    s_mov_b32 s2, -1
3052; GFX1032-NEXT:    ; implicit-def: $vgpr0
3053; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3054; GFX1032-NEXT:    s_cbranch_execz BB16_2
3055; GFX1032-NEXT:  ; %bb.1:
3056; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3057; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3058; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3059; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3060; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3061; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3062; GFX1032-NEXT:    buffer_gl0_inv
3063; GFX1032-NEXT:  BB16_2:
3064; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3065; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3066; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3067; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3068; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3069; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3070; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3071; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3072; GFX1032-NEXT:    s_endpgm
3073entry:
3074  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3075  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3076  store i32 %old, i32 addrspace(1)* %out
3077  ret void
3078}
3079
; max_i32_varying - every work-item performs a signed `atomicrmw max` of its
; own workitem.id.x on @local_var32 (addrspace(3)) with acq_rel ordering and
; stores the returned old value to %out.
;
; What the autogenerated checks below pin down
;  * GFX7LESS checks expect no wave-level rewrite, just one ds_max_rtn_i32 fed
;    with the per-lane value in v0.
;  * GFX8 and GFX9 checks expect lanes that are inactive to be seeded with
;    v_bfrev_b32 of 1 (0x80000000, the smallest signed i32), a v_max_i32_dpp
;    chain (row_shr 1/2/4/8, then row_bcast 15 and 31, no bound_ctrl), the wave
;    total read from lane 63, and a copy shifted by one lane (wave_shr:1) in v1.
;    Only the lane whose v_mbcnt result is 0 issues ds_max_rtn_i32; afterwards
;    the old value is broadcast with v_readfirstlane_b32 and combined with v1
;    through v_max_i32.
;  * GFX10 checks (wave64 and wave32 prefixes) expect the same structure, but
;    built from row_shr steps, v_permlanex16_b32 and v_readlane/v_writelane
;    fix-ups; the DS operand is read from lane 63 (wave64) or lane 31 (wave32).
3080define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3081;
3082;
3083; GFX7LESS-LABEL: max_i32_varying:
3084; GFX7LESS:       ; %bb.0: ; %entry
3085; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3086; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3087; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3088; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3089; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3090; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3091; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3092; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3093; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3094; GFX7LESS-NEXT:    s_endpgm
3095;
3096; GFX8-LABEL: max_i32_varying:
3097; GFX8:       ; %bb.0: ; %entry
3098; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3099; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3100; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3101; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3102; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3103; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3104; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3105; GFX8-NEXT:    s_not_b64 exec, exec
3106; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3107; GFX8-NEXT:    s_not_b64 exec, exec
3108; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3109; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3110; GFX8-NEXT:    s_nop 1
3111; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3112; GFX8-NEXT:    s_nop 1
3113; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3114; GFX8-NEXT:    s_nop 1
3115; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3116; GFX8-NEXT:    s_nop 1
3117; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3118; GFX8-NEXT:    s_nop 1
3119; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3120; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3121; GFX8-NEXT:    s_nop 0
3122; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3123; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3124; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3125; GFX8-NEXT:    ; implicit-def: $vgpr0
3126; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3127; GFX8-NEXT:    s_cbranch_execz BB17_2
3128; GFX8-NEXT:  ; %bb.1:
3129; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3130; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3131; GFX8-NEXT:    s_mov_b32 m0, -1
3132; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3133; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3134; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3135; GFX8-NEXT:  BB17_2:
3136; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3137; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3138; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3139; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3140; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3141; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3142; GFX8-NEXT:    s_mov_b32 s2, -1
3143; GFX8-NEXT:    s_nop 0
3144; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3145; GFX8-NEXT:    s_endpgm
3146;
3147; GFX9-LABEL: max_i32_varying:
3148; GFX9:       ; %bb.0: ; %entry
3149; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3150; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3151; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3152; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3153; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3154; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3155; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3156; GFX9-NEXT:    s_not_b64 exec, exec
3157; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3158; GFX9-NEXT:    s_not_b64 exec, exec
3159; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3160; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3161; GFX9-NEXT:    s_nop 1
3162; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3163; GFX9-NEXT:    s_nop 1
3164; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3165; GFX9-NEXT:    s_nop 1
3166; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3167; GFX9-NEXT:    s_nop 1
3168; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3169; GFX9-NEXT:    s_nop 1
3170; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3171; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3172; GFX9-NEXT:    s_nop 0
3173; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3174; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3175; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3176; GFX9-NEXT:    ; implicit-def: $vgpr0
3177; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3178; GFX9-NEXT:    s_cbranch_execz BB17_2
3179; GFX9-NEXT:  ; %bb.1:
3180; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3181; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3182; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3183; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3184; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3185; GFX9-NEXT:  BB17_2:
3186; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3187; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3188; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3189; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3190; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3191; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3192; GFX9-NEXT:    s_mov_b32 s2, -1
3193; GFX9-NEXT:    s_nop 0
3194; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3195; GFX9-NEXT:    s_endpgm
3196;
3197; GFX1064-LABEL: max_i32_varying:
3198; GFX1064:       ; %bb.0: ; %entry
3199; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3200; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3201; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3202; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3203; GFX1064-NEXT:    s_not_b64 exec, exec
3204; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3205; GFX1064-NEXT:    s_not_b64 exec, exec
3206; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3207; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3208; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3209; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3210; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3211; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3212; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3213; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3214; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3215; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3216; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3217; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3218; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3219; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3220; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3221; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3222; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3223; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3224; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3225; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3226; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3227; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3228; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3229; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3230; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3231; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3232; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3233; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3234; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3235; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3236; GFX1064-NEXT:    s_mov_b32 s2, -1
3237; GFX1064-NEXT:    ; implicit-def: $vgpr0
3238; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3239; GFX1064-NEXT:    s_cbranch_execz BB17_2
3240; GFX1064-NEXT:  ; %bb.1:
3241; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3242; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3243; GFX1064-NEXT:    s_mov_b32 s3, s7
3244; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3245; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3246; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3247; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3248; GFX1064-NEXT:    buffer_gl0_inv
3249; GFX1064-NEXT:  BB17_2:
3250; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3251; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3252; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3253; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3254; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3255; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3256; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3257; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3258; GFX1064-NEXT:    s_endpgm
3259;
3260; GFX1032-LABEL: max_i32_varying:
3261; GFX1032:       ; %bb.0: ; %entry
3262; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3263; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3264; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3265; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3266; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3267; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3268; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3269; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3270; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3271; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3272; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3273; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3274; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3275; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3276; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3277; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3278; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3279; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3280; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3281; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3282; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3283; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3284; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3285; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3286; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3287; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3288; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3289; GFX1032-NEXT:    s_mov_b32 s2, -1
3290; GFX1032-NEXT:    ; implicit-def: $vgpr0
3291; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3292; GFX1032-NEXT:    s_cbranch_execz BB17_2
3293; GFX1032-NEXT:  ; %bb.1:
3294; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3295; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3296; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3297; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3298; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3299; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3300; GFX1032-NEXT:    buffer_gl0_inv
3301; GFX1032-NEXT:  BB17_2:
3302; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3303; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3304; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3305; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3306; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3307; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3308; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3310; GFX1032-NEXT:    s_endpgm
3311entry:
3312  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3313  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3314  store i32 %old, i32 addrspace(1)* %out
3315  ret void
3316}
3317
; max_i64_constant - signed 64-bit `atomicrmw max` of the uniform constant 5 on
; @local_var64 (addrspace(3)) with acq_rel ordering; the returned old value is
; stored to %out.
;
; What the autogenerated checks below pin down
;  * Every check prefix, GFX7LESS included, expects a single lane (the one
;    whose v_mbcnt result is 0) to issue one ds_max_rtn_i64 with operand 5.
;  * After the branch rejoins, both halves of the old value are broadcast with
;    v_readfirstlane_b32. v_cndmask then builds a per-lane i64 that is
;    0x8000000000000000 in the lane that issued the atomic and 5 in all other
;    lanes; v_cmp_gt_i64 plus two more v_cndmask pick the larger of that value
;    and the broadcast old value, and the result goes to buffer_store_dwordx2.
;  * The wave32 prefix uses only v_mbcnt_lo and 32-bit exec/vcc operations.
3318define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3319;
3320;
3321; GFX7LESS-LABEL: max_i64_constant:
3322; GFX7LESS:       ; %bb.0: ; %entry
3323; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3324; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3325; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3326; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3327; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3328; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3329; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3330; GFX7LESS-NEXT:  ; %bb.1:
3331; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3332; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3333; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3334; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3335; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3336; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3337; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3338; GFX7LESS-NEXT:  BB18_2:
3339; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3340; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3341; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3342; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3343; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3344; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3345; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3346; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3347; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3348; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3349; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3350; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3351; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3352; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3353; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3354; GFX7LESS-NEXT:    s_endpgm
3355;
3356; GFX8-LABEL: max_i64_constant:
3357; GFX8:       ; %bb.0: ; %entry
3358; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3359; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3360; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3361; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3362; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3363; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3364; GFX8-NEXT:    s_cbranch_execz BB18_2
3365; GFX8-NEXT:  ; %bb.1:
3366; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3367; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3368; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3369; GFX8-NEXT:    s_mov_b32 m0, -1
3370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3371; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3372; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3373; GFX8-NEXT:  BB18_2:
3374; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3375; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3376; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3377; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3378; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3379; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3380; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3381; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3382; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3383; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3384; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3385; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3386; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3387; GFX8-NEXT:    s_mov_b32 s2, -1
3388; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3389; GFX8-NEXT:    s_endpgm
3390;
3391; GFX9-LABEL: max_i64_constant:
3392; GFX9:       ; %bb.0: ; %entry
3393; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3394; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3395; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3396; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3397; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3398; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3399; GFX9-NEXT:    s_cbranch_execz BB18_2
3400; GFX9-NEXT:  ; %bb.1:
3401; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3402; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3403; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3404; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3405; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3406; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3407; GFX9-NEXT:  BB18_2:
3408; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3410; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3411; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3412; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3413; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3414; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3415; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3416; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3417; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3418; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3419; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3420; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3421; GFX9-NEXT:    s_mov_b32 s2, -1
3422; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3423; GFX9-NEXT:    s_endpgm
3424;
3425; GFX1064-LABEL: max_i64_constant:
3426; GFX1064:       ; %bb.0: ; %entry
3427; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3428; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3429; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3430; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3431; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3432; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3433; GFX1064-NEXT:    s_cbranch_execz BB18_2
3434; GFX1064-NEXT:  ; %bb.1:
3435; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3436; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3437; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3438; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3439; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3440; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3441; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3442; GFX1064-NEXT:    buffer_gl0_inv
3443; GFX1064-NEXT:  BB18_2:
3444; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3445; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3446; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3447; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3448; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3449; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3450; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3451; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3452; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3453; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3454; GFX1064-NEXT:    s_mov_b32 s2, -1
3455; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3456; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3457; GFX1064-NEXT:    s_endpgm
3458;
3459; GFX1032-LABEL: max_i64_constant:
3460; GFX1032:       ; %bb.0: ; %entry
3461; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3462; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3463; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3464; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3465; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3466; GFX1032-NEXT:    s_cbranch_execz BB18_2
3467; GFX1032-NEXT:  ; %bb.1:
3468; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3469; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3470; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3471; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3472; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3473; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3474; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3475; GFX1032-NEXT:    buffer_gl0_inv
3476; GFX1032-NEXT:  BB18_2:
3477; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3478; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3479; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3480; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3481; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3482; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3483; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3484; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3485; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3486; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3487; GFX1032-NEXT:    s_mov_b32 s2, -1
3488; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3489; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3490; GFX1032-NEXT:    s_endpgm
3491entry:
3492  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3493  store i64 %old, i64 addrspace(1)* %out
3494  ret void
3495}
3496
; min_i32_varying: signed 32-bit atomic min on the addrspace(3) global
; @local_var32 where each lane supplies its own operand (its workitem id),
; with acq_rel ordering. The value returned by the atomic is stored to %out.
;
; What the autogenerated assertions below record:
;  * The GFX7LESS output contains no cross-lane reduction; ds_min_rtn_i32 is
;    issued directly with v0 as the data operand.
;  * The GFX8 and GFX9 outputs write the result of v_bfrev_b32 -2 (0x7fffffff)
;    into the lanes selected by the inverted exec mask, run a v_min_i32_dpp
;    sequence, let the lanes whose mbcnt result is 0 issue one ds_min_rtn_i32
;    with the value read from lane 63, and finally combine the
;    v_readfirstlane'd result with the wave_shr:1 shifted value via v_min_i32.
;  * The two GFX10 outputs (wave64 and wave32 RUN lines) follow the same shape
;    but use v_permlanex16_b32, v_readlane_b32 and v_writelane_b32 for the
;    steps that cross 16-lane rows.
; The assertions come from utils/update_llc_test_checks.py (see the NOTE at the
; top of the file); regenerate them instead of editing them by hand.
3497define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3498;
3499;
3500; GFX7LESS-LABEL: min_i32_varying:
3501; GFX7LESS:       ; %bb.0: ; %entry
3502; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3503; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3504; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3505; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3506; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3507; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3508; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3509; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3510; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3511; GFX7LESS-NEXT:    s_endpgm
3512;
3513; GFX8-LABEL: min_i32_varying:
3514; GFX8:       ; %bb.0: ; %entry
3515; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3516; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3517; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3518; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3519; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3520; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3521; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3522; GFX8-NEXT:    s_not_b64 exec, exec
3523; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3524; GFX8-NEXT:    s_not_b64 exec, exec
3525; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3526; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3527; GFX8-NEXT:    s_nop 1
3528; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3529; GFX8-NEXT:    s_nop 1
3530; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3531; GFX8-NEXT:    s_nop 1
3532; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3533; GFX8-NEXT:    s_nop 1
3534; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3535; GFX8-NEXT:    s_nop 1
3536; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3537; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3538; GFX8-NEXT:    s_nop 0
3539; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3540; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3541; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3542; GFX8-NEXT:    ; implicit-def: $vgpr0
3543; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3544; GFX8-NEXT:    s_cbranch_execz BB19_2
3545; GFX8-NEXT:  ; %bb.1:
3546; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3547; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3548; GFX8-NEXT:    s_mov_b32 m0, -1
3549; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3550; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3551; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3552; GFX8-NEXT:  BB19_2:
3553; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3554; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3555; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3556; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3557; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3558; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3559; GFX8-NEXT:    s_mov_b32 s2, -1
3560; GFX8-NEXT:    s_nop 0
3561; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3562; GFX8-NEXT:    s_endpgm
3563;
3564; GFX9-LABEL: min_i32_varying:
3565; GFX9:       ; %bb.0: ; %entry
3566; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3567; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3568; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3569; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3570; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3571; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3572; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3573; GFX9-NEXT:    s_not_b64 exec, exec
3574; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3575; GFX9-NEXT:    s_not_b64 exec, exec
3576; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3577; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3578; GFX9-NEXT:    s_nop 1
3579; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3580; GFX9-NEXT:    s_nop 1
3581; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3582; GFX9-NEXT:    s_nop 1
3583; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3584; GFX9-NEXT:    s_nop 1
3585; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3586; GFX9-NEXT:    s_nop 1
3587; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3588; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3589; GFX9-NEXT:    s_nop 0
3590; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3591; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3592; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3593; GFX9-NEXT:    ; implicit-def: $vgpr0
3594; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3595; GFX9-NEXT:    s_cbranch_execz BB19_2
3596; GFX9-NEXT:  ; %bb.1:
3597; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3598; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3599; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3600; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3601; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3602; GFX9-NEXT:  BB19_2:
3603; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3604; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3605; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3606; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3607; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3608; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3609; GFX9-NEXT:    s_mov_b32 s2, -1
3610; GFX9-NEXT:    s_nop 0
3611; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3612; GFX9-NEXT:    s_endpgm
3613;
3614; GFX1064-LABEL: min_i32_varying:
3615; GFX1064:       ; %bb.0: ; %entry
3616; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3617; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3618; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3619; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3620; GFX1064-NEXT:    s_not_b64 exec, exec
3621; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3622; GFX1064-NEXT:    s_not_b64 exec, exec
3623; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3624; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3625; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3626; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3627; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3628; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3629; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3630; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3631; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3632; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3633; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3634; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3635; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3636; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3637; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3638; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3639; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3640; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3641; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3642; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3643; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3644; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3645; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3646; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3647; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3648; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3649; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3650; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3651; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3652; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3653; GFX1064-NEXT:    s_mov_b32 s2, -1
3654; GFX1064-NEXT:    ; implicit-def: $vgpr0
3655; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3656; GFX1064-NEXT:    s_cbranch_execz BB19_2
3657; GFX1064-NEXT:  ; %bb.1:
3658; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3659; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3660; GFX1064-NEXT:    s_mov_b32 s3, s7
3661; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3662; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3663; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3664; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3665; GFX1064-NEXT:    buffer_gl0_inv
3666; GFX1064-NEXT:  BB19_2:
3667; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3668; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3669; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3670; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3671; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3672; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3673; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3674; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3675; GFX1064-NEXT:    s_endpgm
3676;
3677; GFX1032-LABEL: min_i32_varying:
3678; GFX1032:       ; %bb.0: ; %entry
3679; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3680; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3681; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3682; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3683; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3684; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3685; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3686; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3687; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3688; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3689; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3690; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3691; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3692; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3693; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3694; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3695; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3696; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3697; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3698; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3699; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3700; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3701; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3702; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3703; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3704; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3705; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3706; GFX1032-NEXT:    s_mov_b32 s2, -1
3707; GFX1032-NEXT:    ; implicit-def: $vgpr0
3708; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3709; GFX1032-NEXT:    s_cbranch_execz BB19_2
3710; GFX1032-NEXT:  ; %bb.1:
3711; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3712; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3713; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3714; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3715; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
3716; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3717; GFX1032-NEXT:    buffer_gl0_inv
3718; GFX1032-NEXT:  BB19_2:
3719; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3720; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3721; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3722; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3723; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3724; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3725; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3726; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3727; GFX1032-NEXT:    s_endpgm
3728entry:
  ; The workitem id is used as the atomic operand, so it differs per lane.
3729  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3730  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing the returned value keeps the atomic's result live in the output.
3731  store i32 %old, i32 addrspace(1)* %out
3732  ret void
3733}
3734
; min_i64_constant: signed 64-bit atomic min of the constant 5 into the
; addrspace(3) global @local_var64 with acq_rel ordering. The value returned by
; the atomic is stored to %out.
;
; What the autogenerated assertions below record, for every check prefix:
;  * No DPP sequence appears; the operand is the same constant in all lanes.
;  * The lanes whose mbcnt result is 0 issue a single ds_min_rtn_i64 with the
;    64-bit value 5.
;  * After the branch rejoins, each lane builds a 64-bit value with v_cndmask
;    on that same condition (either 0x7fffffff:0xffffffff or 5) and selects
;    between it and the v_readfirstlane'd result using v_cmp_lt_i64.
; The assertions come from utils/update_llc_test_checks.py (see the NOTE at the
; top of the file); regenerate them instead of editing them by hand.
3735define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3736;
3737;
3738; GFX7LESS-LABEL: min_i64_constant:
3739; GFX7LESS:       ; %bb.0: ; %entry
3740; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3741; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3742; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3743; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3744; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3745; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3746; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
3747; GFX7LESS-NEXT:  ; %bb.1:
3748; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3749; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3750; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3751; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3752; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3753; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3754; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3755; GFX7LESS-NEXT:  BB20_2:
3756; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3757; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3758; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3759; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3760; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3761; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3762; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3763; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3764; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3765; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3766; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3767; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3768; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3769; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3770; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3771; GFX7LESS-NEXT:    s_endpgm
3772;
3773; GFX8-LABEL: min_i64_constant:
3774; GFX8:       ; %bb.0: ; %entry
3775; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3776; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3777; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3778; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3779; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3780; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3781; GFX8-NEXT:    s_cbranch_execz BB20_2
3782; GFX8-NEXT:  ; %bb.1:
3783; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3784; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3785; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3786; GFX8-NEXT:    s_mov_b32 m0, -1
3787; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3788; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3790; GFX8-NEXT:  BB20_2:
3791; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3792; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3793; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3794; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
3795; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3796; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3797; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3798; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3799; GFX8-NEXT:    v_mov_b32_e32 v2, s5
3800; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3801; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3802; GFX8-NEXT:    s_mov_b32 s2, -1
3803; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3804; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3805; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3806; GFX8-NEXT:    s_endpgm
3807;
3808; GFX9-LABEL: min_i64_constant:
3809; GFX9:       ; %bb.0: ; %entry
3810; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3811; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3812; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3813; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3814; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3815; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3816; GFX9-NEXT:    s_cbranch_execz BB20_2
3817; GFX9-NEXT:  ; %bb.1:
3818; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3819; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3820; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3821; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3822; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3823; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3824; GFX9-NEXT:  BB20_2:
3825; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3827; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
3828; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
3829; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
3830; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3831; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3832; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3833; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3834; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3835; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3836; GFX9-NEXT:    s_mov_b32 s2, -1
3837; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3838; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3839; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3840; GFX9-NEXT:    s_endpgm
3841;
3842; GFX1064-LABEL: min_i64_constant:
3843; GFX1064:       ; %bb.0: ; %entry
3844; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3845; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3846; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3847; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3848; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3849; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3850; GFX1064-NEXT:    s_cbranch_execz BB20_2
3851; GFX1064-NEXT:  ; %bb.1:
3852; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3853; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3854; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3855; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3856; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3857; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3858; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3859; GFX1064-NEXT:    buffer_gl0_inv
3860; GFX1064-NEXT:  BB20_2:
3861; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3862; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3863; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3864; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3865; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
3866; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3867; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3868; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3869; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3870; GFX1064-NEXT:    s_mov_b32 s2, -1
3871; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3872; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3873; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3874; GFX1064-NEXT:    s_endpgm
3875;
3876; GFX1032-LABEL: min_i64_constant:
3877; GFX1032:       ; %bb.0: ; %entry
3878; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3879; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3880; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3881; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3882; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3883; GFX1032-NEXT:    s_cbranch_execz BB20_2
3884; GFX1032-NEXT:  ; %bb.1:
3885; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3886; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3887; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3888; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3889; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3890; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3891; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3892; GFX1032-NEXT:    buffer_gl0_inv
3893; GFX1032-NEXT:  BB20_2:
3894; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3895; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3896; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3897; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3898; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
3899; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
3900; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
3901; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3902; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3903; GFX1032-NEXT:    s_mov_b32 s2, -1
3904; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3905; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3906; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3907; GFX1032-NEXT:    s_endpgm
3908entry:
  ; The operand is the literal 5, identical in every lane.
3909  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing the returned value keeps the atomic's result live in the output.
3910  store i64 %old, i64 addrspace(1)* %out
3911  ret void
3912}
3913
; umax_i32_varying: unsigned 32-bit atomic max on the addrspace(3) global
; @local_var32 where each lane supplies its own operand (its workitem id),
; with acq_rel ordering. The value returned by the atomic is stored to %out.
;
; What the autogenerated assertions below record:
;  * The GFX7LESS output contains no cross-lane reduction; ds_max_rtn_u32 is
;    issued directly with v0 as the data operand.
;  * The GFX8 and GFX9 outputs write 0 into the lanes selected by the inverted
;    exec mask, run a v_max_u32_dpp sequence, let the lanes whose mbcnt result
;    is 0 issue one ds_max_rtn_u32 with the value read from lane 63, and
;    finally combine the v_readfirstlane'd result with the wave_shr:1 shifted
;    value via v_max_u32.
;  * The two GFX10 outputs (wave64 and wave32 RUN lines) follow the same shape
;    but use v_permlanex16_b32, v_readlane_b32 and v_writelane_b32 for the
;    steps that cross 16-lane rows.
; The assertions come from utils/update_llc_test_checks.py (see the NOTE at the
; top of the file); regenerate them instead of editing them by hand.
3914define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
3915;
3916;
3917; GFX7LESS-LABEL: umax_i32_varying:
3918; GFX7LESS:       ; %bb.0: ; %entry
3919; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3920; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3921; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3922; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3923; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
3924; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3925; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3926; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3927; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3928; GFX7LESS-NEXT:    s_endpgm
3929;
3930; GFX8-LABEL: umax_i32_varying:
3931; GFX8:       ; %bb.0: ; %entry
3932; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3933; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3934; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3935; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3936; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3937; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3938; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3939; GFX8-NEXT:    s_not_b64 exec, exec
3940; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3941; GFX8-NEXT:    s_not_b64 exec, exec
3942; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3943; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3944; GFX8-NEXT:    s_nop 1
3945; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3946; GFX8-NEXT:    s_nop 1
3947; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3948; GFX8-NEXT:    s_nop 1
3949; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3950; GFX8-NEXT:    s_nop 1
3951; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3952; GFX8-NEXT:    s_nop 1
3953; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3954; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3955; GFX8-NEXT:    s_nop 0
3956; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3957; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3958; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3959; GFX8-NEXT:    ; implicit-def: $vgpr0
3960; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3961; GFX8-NEXT:    s_cbranch_execz BB21_2
3962; GFX8-NEXT:  ; %bb.1:
3963; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3964; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3965; GFX8-NEXT:    s_mov_b32 m0, -1
3966; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3967; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
3968; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3969; GFX8-NEXT:  BB21_2:
3970; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3971; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3972; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3973; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3974; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
3975; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3976; GFX8-NEXT:    s_mov_b32 s2, -1
3977; GFX8-NEXT:    s_nop 0
3978; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3979; GFX8-NEXT:    s_endpgm
3980;
3981; GFX9-LABEL: umax_i32_varying:
3982; GFX9:       ; %bb.0: ; %entry
3983; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3984; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3985; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3986; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3987; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3988; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3989; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3990; GFX9-NEXT:    s_not_b64 exec, exec
3991; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3992; GFX9-NEXT:    s_not_b64 exec, exec
3993; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3994; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3995; GFX9-NEXT:    s_nop 1
3996; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3997; GFX9-NEXT:    s_nop 1
3998; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3999; GFX9-NEXT:    s_nop 1
4000; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4001; GFX9-NEXT:    s_nop 1
4002; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4003; GFX9-NEXT:    s_nop 1
4004; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4005; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4006; GFX9-NEXT:    s_nop 0
4007; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4008; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4009; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4010; GFX9-NEXT:    ; implicit-def: $vgpr0
4011; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4012; GFX9-NEXT:    s_cbranch_execz BB21_2
4013; GFX9-NEXT:  ; %bb.1:
4014; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4015; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4016; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4017; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4018; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4019; GFX9-NEXT:  BB21_2:
4020; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4021; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4022; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4023; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4024; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4025; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4026; GFX9-NEXT:    s_mov_b32 s2, -1
4027; GFX9-NEXT:    s_nop 0
4028; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4029; GFX9-NEXT:    s_endpgm
4030;
4031; GFX1064-LABEL: umax_i32_varying:
4032; GFX1064:       ; %bb.0: ; %entry
4033; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4034; GFX1064-NEXT:    s_not_b64 exec, exec
4035; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4036; GFX1064-NEXT:    s_not_b64 exec, exec
4037; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4038; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4039; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4040; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4041; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4042; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4043; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4044; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4045; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4046; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4047; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4048; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4049; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4050; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4051; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4052; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4053; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4054; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4055; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4056; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4057; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4058; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4059; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4060; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4061; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4062; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4063; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4064; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4065; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4066; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4067; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4068; GFX1064-NEXT:    s_mov_b32 s2, -1
4069; GFX1064-NEXT:    ; implicit-def: $vgpr0
4070; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4071; GFX1064-NEXT:    s_cbranch_execz BB21_2
4072; GFX1064-NEXT:  ; %bb.1:
4073; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4074; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4075; GFX1064-NEXT:    s_mov_b32 s3, s7
4076; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4077; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4078; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4079; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4080; GFX1064-NEXT:    buffer_gl0_inv
4081; GFX1064-NEXT:  BB21_2:
4082; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4083; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4084; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4085; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4086; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4087; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4088; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4089; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4090; GFX1064-NEXT:    s_endpgm
4091;
4092; GFX1032-LABEL: umax_i32_varying:
4093; GFX1032:       ; %bb.0: ; %entry
4094; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4095; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4096; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4097; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4098; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4099; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4100; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4101; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4102; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4103; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4104; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4105; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4106; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4107; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4108; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4109; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4110; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4111; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4112; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4113; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4114; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4115; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4116; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4117; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4118; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4119; GFX1032-NEXT:    s_mov_b32 s2, -1
4120; GFX1032-NEXT:    ; implicit-def: $vgpr0
4121; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4122; GFX1032-NEXT:    s_cbranch_execz BB21_2
4123; GFX1032-NEXT:  ; %bb.1:
4124; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4125; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4126; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4127; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4128; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4129; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4130; GFX1032-NEXT:    buffer_gl0_inv
4131; GFX1032-NEXT:  BB21_2:
4132; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4133; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4134; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4135; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4136; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4137; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4138; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4139; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4140; GFX1032-NEXT:    s_endpgm
4141entry:
  ; The workitem id is used as the atomic operand, so it differs per lane.
4142  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4143  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing the returned value keeps the atomic's result live in the output.
4144  store i32 %old, i32 addrspace(1)* %out
4145  ret void
4146}
4147
; umax_i64_constant: unsigned-max atomicrmw (acq_rel) on the 64-bit
; addrspace(3) global @local_var64 with the constant operand 5. The value
; returned by the atomic is stored through %out.
;
; All run lines expect the same overall shape in the checks below:
;  - v_mbcnt_lo (plus v_mbcnt_hi except on the GFX1032 run) over exec, compared
;    against 0, guards the block containing the single ds_max_rtn_u64 via
;    s_and_saveexec / s_cbranch_execz.
;  - After exec is restored, the atomic result is read with v_readfirstlane,
;    and each lane forms its stored value from a v_cndmask on vcc followed by
;    a v_cmp_gt_u64 compare-and-select.
;  - The GFX1064 and GFX1032 runs additionally expect s_waitcnt_vscnt before
;    the DS instruction and buffer_gl0_inv after it.
4148define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4149;
4150;
4151; GFX7LESS-LABEL: umax_i64_constant:
4152; GFX7LESS:       ; %bb.0: ; %entry
4153; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4154; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4155; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4156; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4157; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4158; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4159; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4160; GFX7LESS-NEXT:  ; %bb.1:
4161; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4162; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4163; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4164; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4165; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4166; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4167; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4168; GFX7LESS-NEXT:  BB22_2:
4169; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4170; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4171; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4172; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4173; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4174; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4175; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4176; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4177; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4178; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4179; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4180; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4181; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4182; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4183; GFX7LESS-NEXT:    s_endpgm
4184;
4185; GFX8-LABEL: umax_i64_constant:
4186; GFX8:       ; %bb.0: ; %entry
4187; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4188; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4189; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4190; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4191; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4192; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4193; GFX8-NEXT:    s_cbranch_execz BB22_2
4194; GFX8-NEXT:  ; %bb.1:
4195; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4196; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4197; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4198; GFX8-NEXT:    s_mov_b32 m0, -1
4199; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4200; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4201; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4202; GFX8-NEXT:  BB22_2:
4203; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4204; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4205; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4206; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4207; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4208; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4209; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4210; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4211; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4212; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4213; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4214; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4215; GFX8-NEXT:    s_mov_b32 s2, -1
4216; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4217; GFX8-NEXT:    s_endpgm
4218;
4219; GFX9-LABEL: umax_i64_constant:
4220; GFX9:       ; %bb.0: ; %entry
4221; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4222; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4223; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4224; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4225; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4226; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4227; GFX9-NEXT:    s_cbranch_execz BB22_2
4228; GFX9-NEXT:  ; %bb.1:
4229; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4230; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4231; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4232; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4233; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4235; GFX9-NEXT:  BB22_2:
4236; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4237; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4238; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4239; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4240; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4241; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4242; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4243; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4244; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4245; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4246; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4247; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4248; GFX9-NEXT:    s_mov_b32 s2, -1
4249; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4250; GFX9-NEXT:    s_endpgm
4251;
4252; GFX1064-LABEL: umax_i64_constant:
4253; GFX1064:       ; %bb.0: ; %entry
4254; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4255; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4256; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4257; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4258; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4259; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4260; GFX1064-NEXT:    s_cbranch_execz BB22_2
4261; GFX1064-NEXT:  ; %bb.1:
4262; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4263; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4264; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4265; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4266; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4267; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4268; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4269; GFX1064-NEXT:    buffer_gl0_inv
4270; GFX1064-NEXT:  BB22_2:
4271; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4272; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4273; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4274; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4275; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4276; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4277; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4278; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4279; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4280; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4281; GFX1064-NEXT:    s_mov_b32 s2, -1
4282; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4283; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4284; GFX1064-NEXT:    s_endpgm
4285;
4286; GFX1032-LABEL: umax_i64_constant:
4287; GFX1032:       ; %bb.0: ; %entry
4288; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4289; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4290; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4291; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4292; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4293; GFX1032-NEXT:    s_cbranch_execz BB22_2
4294; GFX1032-NEXT:  ; %bb.1:
4295; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4296; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4297; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4298; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4299; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4300; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4301; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4302; GFX1032-NEXT:    buffer_gl0_inv
4303; GFX1032-NEXT:  BB22_2:
4304; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4305; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4306; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4307; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4308; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4309; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4310; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4311; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4312; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4313; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4314; GFX1032-NEXT:    s_mov_b32 s2, -1
4315; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4316; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4317; GFX1032-NEXT:    s_endpgm
4318entry:
  ; The operand is the same constant (5) for every lane; %old is the value the
  ; atomic returns.
4319  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Write the returned value through the kernel's output pointer.
4320  store i64 %old, i64 addrspace(1)* %out
4321  ret void
4322}
4323
; umin_i32_varying: unsigned-min atomicrmw (acq_rel) on the 32-bit
; addrspace(3) global @local_var32 whose operand is the per-lane result of
; llvm.amdgcn.workitem.id.x. The value returned by the atomic is stored
; through %out.
;
; The expected code differs between run lines in the checks below:
;  - The GFX7LESS checks contain no cross-lane reduction and no exec masking;
;    ds_min_rtn_u32 is issued directly with v0 as its data operand.
;  - The GFX8 and GFX9 checks write -1 into v2 under the inverted exec mask,
;    run a v_min_u32_dpp sequence (row_shr 1/2/4/8, row_bcast 15 and 31), read
;    lane 63 with v_readlane as the operand of a single ds_min_rtn_u32 guarded
;    by the mbcnt == 0 compare, and finish with v_readfirstlane plus v_min_u32
;    against the wave_shr:1 shifted value.
;  - The GFX1064 and GFX1032 checks use the row_shr DPP steps together with
;    v_permlanex16_b32 and v_readlane / v_writelane to build the per-lane
;    value, and take the DS operand from lane 63 (GFX1064) or lane 31
;    (GFX1032).
4324define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4325;
4326;
4327; GFX7LESS-LABEL: umin_i32_varying:
4328; GFX7LESS:       ; %bb.0: ; %entry
4329; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4330; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4331; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4332; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4334; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4335; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4336; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4337; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4338; GFX7LESS-NEXT:    s_endpgm
4339;
4340; GFX8-LABEL: umin_i32_varying:
4341; GFX8:       ; %bb.0: ; %entry
4342; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4343; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4344; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4345; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4346; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4347; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4348; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4349; GFX8-NEXT:    s_not_b64 exec, exec
4350; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4351; GFX8-NEXT:    s_not_b64 exec, exec
4352; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4353; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4354; GFX8-NEXT:    s_nop 1
4355; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4356; GFX8-NEXT:    s_nop 1
4357; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4358; GFX8-NEXT:    s_nop 1
4359; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4360; GFX8-NEXT:    s_nop 1
4361; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4362; GFX8-NEXT:    s_nop 1
4363; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4364; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4365; GFX8-NEXT:    s_nop 0
4366; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4367; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4368; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4369; GFX8-NEXT:    ; implicit-def: $vgpr0
4370; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4371; GFX8-NEXT:    s_cbranch_execz BB23_2
4372; GFX8-NEXT:  ; %bb.1:
4373; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4374; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4375; GFX8-NEXT:    s_mov_b32 m0, -1
4376; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4377; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4378; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4379; GFX8-NEXT:  BB23_2:
4380; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4382; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4383; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4384; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4385; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4386; GFX8-NEXT:    s_mov_b32 s2, -1
4387; GFX8-NEXT:    s_nop 0
4388; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4389; GFX8-NEXT:    s_endpgm
4390;
4391; GFX9-LABEL: umin_i32_varying:
4392; GFX9:       ; %bb.0: ; %entry
4393; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4394; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4395; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4396; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4397; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4398; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4399; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4400; GFX9-NEXT:    s_not_b64 exec, exec
4401; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4402; GFX9-NEXT:    s_not_b64 exec, exec
4403; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4404; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4405; GFX9-NEXT:    s_nop 1
4406; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4407; GFX9-NEXT:    s_nop 1
4408; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4409; GFX9-NEXT:    s_nop 1
4410; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4411; GFX9-NEXT:    s_nop 1
4412; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4413; GFX9-NEXT:    s_nop 1
4414; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4415; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4416; GFX9-NEXT:    s_nop 0
4417; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4418; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4419; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4420; GFX9-NEXT:    ; implicit-def: $vgpr0
4421; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4422; GFX9-NEXT:    s_cbranch_execz BB23_2
4423; GFX9-NEXT:  ; %bb.1:
4424; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4425; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4427; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4428; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4429; GFX9-NEXT:  BB23_2:
4430; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4432; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4433; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4434; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4435; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4436; GFX9-NEXT:    s_mov_b32 s2, -1
4437; GFX9-NEXT:    s_nop 0
4438; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4439; GFX9-NEXT:    s_endpgm
4440;
4441; GFX1064-LABEL: umin_i32_varying:
4442; GFX1064:       ; %bb.0: ; %entry
4443; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4444; GFX1064-NEXT:    s_not_b64 exec, exec
4445; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4446; GFX1064-NEXT:    s_not_b64 exec, exec
4447; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4448; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4449; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4450; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4451; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4452; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4453; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4454; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4455; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4456; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4457; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4458; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4459; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4460; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4461; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4462; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4463; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4464; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4465; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4466; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4467; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4468; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4469; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4470; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4471; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4472; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4473; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4474; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4475; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4476; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4477; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4478; GFX1064-NEXT:    s_mov_b32 s2, -1
4479; GFX1064-NEXT:    ; implicit-def: $vgpr0
4480; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4481; GFX1064-NEXT:    s_cbranch_execz BB23_2
4482; GFX1064-NEXT:  ; %bb.1:
4483; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4484; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4485; GFX1064-NEXT:    s_mov_b32 s3, s7
4486; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4487; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4488; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4489; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4490; GFX1064-NEXT:    buffer_gl0_inv
4491; GFX1064-NEXT:  BB23_2:
4492; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4493; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4494; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4495; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4496; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4497; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4498; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4499; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4500; GFX1064-NEXT:    s_endpgm
4501;
4502; GFX1032-LABEL: umin_i32_varying:
4503; GFX1032:       ; %bb.0: ; %entry
4504; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4505; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4506; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4507; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4508; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4509; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4510; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4511; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4512; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4513; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4514; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4515; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4516; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4517; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4518; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4519; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4520; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4521; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4522; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4523; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4524; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4525; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4526; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4527; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4528; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4529; GFX1032-NEXT:    s_mov_b32 s2, -1
4530; GFX1032-NEXT:    ; implicit-def: $vgpr0
4531; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4532; GFX1032-NEXT:    s_cbranch_execz BB23_2
4533; GFX1032-NEXT:  ; %bb.1:
4534; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4535; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4536; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4537; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4538; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4539; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4540; GFX1032-NEXT:    buffer_gl0_inv
4541; GFX1032-NEXT:  BB23_2:
4542; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4543; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4544; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4545; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4546; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4547; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4548; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4549; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4550; GFX1032-NEXT:    s_endpgm
4551entry:
  ; The atomic operand is the workitem id, so it differs from lane to lane.
4552  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is the value the atomic returns.
4553  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Write the returned value through the kernel's output pointer.
4554  store i32 %old, i32 addrspace(1)* %out
4555  ret void
4556}
4557
; umin_i64_constant: unsigned-min atomicrmw (acq_rel) on the 64-bit
; addrspace(3) global @local_var64 with the constant operand 5. The value
; returned by the atomic is stored through %out.
;
; All run lines expect the same overall shape in the checks below:
;  - v_mbcnt_lo (plus v_mbcnt_hi except on the GFX1032 run) over exec, compared
;    against 0, guards the block containing the single ds_min_rtn_u64 via
;    s_and_saveexec / s_cbranch_execz.
;  - After exec is restored, the atomic result is read with v_readfirstlane,
;    and each lane forms its stored value from two v_cndmask instructions on
;    vcc (choosing between 5:0 and -1:-1 for the low:high halves) followed by
;    a v_cmp_lt_u64 compare-and-select.
;  - The GFX1064 and GFX1032 runs additionally expect s_waitcnt_vscnt before
;    the DS instruction and buffer_gl0_inv after it.
4558define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4559;
4560;
4561; GFX7LESS-LABEL: umin_i64_constant:
4562; GFX7LESS:       ; %bb.0: ; %entry
4563; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4564; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4565; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4566; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4567; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4568; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4569; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4570; GFX7LESS-NEXT:  ; %bb.1:
4571; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4572; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4573; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4574; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4575; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4576; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4577; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4578; GFX7LESS-NEXT:  BB24_2:
4579; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4580; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4581; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4582; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4583; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4584; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4585; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4586; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4587; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4588; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4589; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4590; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4591; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4592; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4593; GFX7LESS-NEXT:    s_endpgm
4594;
4595; GFX8-LABEL: umin_i64_constant:
4596; GFX8:       ; %bb.0: ; %entry
4597; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4598; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4599; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4600; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4601; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4602; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4603; GFX8-NEXT:    s_cbranch_execz BB24_2
4604; GFX8-NEXT:  ; %bb.1:
4605; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4606; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4607; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4608; GFX8-NEXT:    s_mov_b32 m0, -1
4609; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4610; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4611; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4612; GFX8-NEXT:  BB24_2:
4613; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4614; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4615; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4616; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4617; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4618; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4619; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4620; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4621; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4622; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4623; GFX8-NEXT:    s_mov_b32 s2, -1
4624; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4625; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4626; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4627; GFX8-NEXT:    s_endpgm
4628;
4629; GFX9-LABEL: umin_i64_constant:
4630; GFX9:       ; %bb.0: ; %entry
4631; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4632; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4633; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4634; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4635; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4636; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4637; GFX9-NEXT:    s_cbranch_execz BB24_2
4638; GFX9-NEXT:  ; %bb.1:
4639; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4640; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4641; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4642; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4643; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4644; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4645; GFX9-NEXT:  BB24_2:
4646; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4648; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4649; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4650; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4651; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4652; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4653; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4654; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4655; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4656; GFX9-NEXT:    s_mov_b32 s2, -1
4657; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4658; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4659; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4660; GFX9-NEXT:    s_endpgm
4661;
4662; GFX1064-LABEL: umin_i64_constant:
4663; GFX1064:       ; %bb.0: ; %entry
4664; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4665; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4666; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4667; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4668; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4669; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4670; GFX1064-NEXT:    s_cbranch_execz BB24_2
4671; GFX1064-NEXT:  ; %bb.1:
4672; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4673; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4674; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4675; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4676; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4677; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4678; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4679; GFX1064-NEXT:    buffer_gl0_inv
4680; GFX1064-NEXT:  BB24_2:
4681; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4682; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4683; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4684; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4685; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4686; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4687; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4688; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4689; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4690; GFX1064-NEXT:    s_mov_b32 s2, -1
4691; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4692; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4693; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4694; GFX1064-NEXT:    s_endpgm
4695;
4696; GFX1032-LABEL: umin_i64_constant:
4697; GFX1032:       ; %bb.0: ; %entry
4698; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4699; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4700; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4701; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4702; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4703; GFX1032-NEXT:    s_cbranch_execz BB24_2
4704; GFX1032-NEXT:  ; %bb.1:
4705; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4706; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4707; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4708; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4709; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4710; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4711; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4712; GFX1032-NEXT:    buffer_gl0_inv
4713; GFX1032-NEXT:  BB24_2:
4714; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4715; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4716; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4717; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4718; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4719; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4720; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4721; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4722; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4723; GFX1032-NEXT:    s_mov_b32 s2, -1
4724; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4725; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4726; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4727; GFX1032-NEXT:    s_endpgm
4728entry:
  ; The operand is the same constant (5) for every lane; %old is the value the
  ; atomic returns.
4729  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Write the returned value through the kernel's output pointer.
4730  store i64 %old, i64 addrspace(1)* %out
4731  ret void
4732}
4733