1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsic called by the *_varying tests in this file to obtain an addend that
; comes from the work-item id rather than from a constant or a kernel argument.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; addrspace(3) globals used as the memory operand of the atomicrmw
; instructions. @local_var32 is the target of the i32 tests; @local_var64 is
; presumably the target of the i64 tests further down (TODO confirm).
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; add_i32_constant: atomically adds the constant 5 to @local_var32 (an
; addrspace(3) global) with acq_rel ordering and stores the returned old value
; to %out.
;
; In the expected output below, every configuration counts the bits of the
; saved exec mask with s_bcnt1, multiplies that count by 5, and issues a single
; ds_add_rtn_u32 from the lane whose mbcnt result is 0. After the branch
; rejoins, v_readfirstlane plus v_mad_u32_u24 (mbcnt result * 5 + value read
; back) rebuilds the per-lane value that is written by buffer_store_dword.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
32; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7LESS-NEXT:    s_mov_b32 m0, -1
34; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:  BB0_2:
38; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
39; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
41; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
43; GFX7LESS-NEXT:    s_mov_b32 s2, -1
44; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX7LESS-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
56; GFX8-NEXT:    s_cbranch_execz BB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
61; GFX8-NEXT:    v_mov_b32_e32 v2, s2
62; GFX8-NEXT:    s_mov_b32 m0, -1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:  BB0_2:
67; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
71; GFX8-NEXT:    s_mov_b32 s3, 0xf000
72; GFX8-NEXT:    s_mov_b32 s2, -1
73; GFX8-NEXT:    s_nop 1
74; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
80; GFX9-NEXT:    s_mov_b64 s[2:3], exec
81; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
82; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
83; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
84; GFX9-NEXT:    ; implicit-def: $vgpr1
85; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
86; GFX9-NEXT:    s_cbranch_execz BB0_2
87; GFX9-NEXT:  ; %bb.1:
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
91; GFX9-NEXT:    v_mov_b32_e32 v2, s2
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:  BB0_2:
96; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
100; GFX9-NEXT:    s_mov_b32 s3, 0xf000
101; GFX9-NEXT:    s_mov_b32 s2, -1
102; GFX9-NEXT:    s_nop 1
103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
104; GFX9-NEXT:    s_endpgm
105;
106; GFX1064-LABEL: add_i32_constant:
107; GFX1064:       ; %bb.0: ; %entry
108; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
110; GFX1064-NEXT:    ; implicit-def: $vgpr1
111; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
112; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
115; GFX1064-NEXT:    s_cbranch_execz BB0_2
116; GFX1064-NEXT:  ; %bb.1:
117; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
118; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
119; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
120; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
121; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
122; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
124; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX1064-NEXT:    buffer_gl0_inv
126; GFX1064-NEXT:  BB0_2:
127; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
135; GFX1064-NEXT:    s_endpgm
136;
137; GFX1032-LABEL: add_i32_constant:
138; GFX1032:       ; %bb.0: ; %entry
139; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
140; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
141; GFX1032-NEXT:    ; implicit-def: $vgpr1
142; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
143; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
144; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
145; GFX1032-NEXT:    s_cbranch_execz BB0_2
146; GFX1032-NEXT:  ; %bb.1:
147; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
148; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
149; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
150; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
151; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
153; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
154; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX1032-NEXT:    buffer_gl0_inv
156; GFX1032-NEXT:  BB0_2:
157; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
158; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
159; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
160; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
161; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
162; GFX1032-NEXT:    s_mov_b32 s2, -1
163; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
165; GFX1032-NEXT:    s_endpgm
; IR under test: one atomic add of the literal 5 whose result is stored, so the
; returned old value is live.
166entry:
167  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
168  store i32 %old, i32 addrspace(1)* %out
169  ret void
170}
171
; add_i32_uniform: atomically adds the kernel argument %additive to
; @local_var32 (an addrspace(3) global) with acq_rel ordering and stores the
; returned old value to %out.
;
; In the expected output below, the argument is fetched with s_load_dword, the
; bit count of the saved exec mask (s_bcnt1) is multiplied by it with
; s_mul_i32, and a single ds_add_rtn_u32 is issued from the lane whose mbcnt
; result is 0. After the branch rejoins, each lane's value is rebuilt with
; v_mul_lo_u32 (argument * mbcnt result) followed by an add of the
; v_readfirstlane value.
172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
173;
174;
175; GFX7LESS-LABEL: add_i32_uniform:
176; GFX7LESS:       ; %bb.0: ; %entry
177; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
178; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
179; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
180; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
181; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
182; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
183; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
184; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
185; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
186; GFX7LESS-NEXT:  ; %bb.1:
187; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
188; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
190; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
191; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
192; GFX7LESS-NEXT:    s_mov_b32 m0, -1
193; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:  BB1_2:
197; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
198; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
200; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
201; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
202; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
203; GFX7LESS-NEXT:    s_mov_b32 s6, -1
204; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
205; GFX7LESS-NEXT:    s_endpgm
206;
207; GFX8-LABEL: add_i32_uniform:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
210; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
211; GFX8-NEXT:    s_mov_b64 s[2:3], exec
212; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
213; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
214; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
215; GFX8-NEXT:    ; implicit-def: $vgpr1
216; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
217; GFX8-NEXT:    s_cbranch_execz BB1_2
218; GFX8-NEXT:  ; %bb.1:
219; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX8-NEXT:    s_mul_i32 s1, s0, s1
222; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
223; GFX8-NEXT:    v_mov_b32_e32 v2, s1
224; GFX8-NEXT:    s_mov_b32 m0, -1
225; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:  BB1_2:
229; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
230; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
232; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
233; GFX8-NEXT:    s_mov_b32 s7, 0xf000
234; GFX8-NEXT:    s_mov_b32 s6, -1
235; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
236; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
237; GFX8-NEXT:    s_endpgm
238;
239; GFX9-LABEL: add_i32_uniform:
240; GFX9:       ; %bb.0: ; %entry
241; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
242; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
243; GFX9-NEXT:    s_mov_b64 s[6:7], exec
244; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
245; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
246; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX9-NEXT:    ; implicit-def: $vgpr1
248; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
249; GFX9-NEXT:    s_cbranch_execz BB1_2
250; GFX9-NEXT:  ; %bb.1:
251; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    s_mul_i32 s3, s2, s3
254; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
255; GFX9-NEXT:    v_mov_b32_e32 v2, s3
256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:  BB1_2:
260; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
261; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
263; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
264; GFX9-NEXT:    s_mov_b32 s7, 0xf000
265; GFX9-NEXT:    s_mov_b32 s6, -1
266; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
267; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
268; GFX9-NEXT:    s_endpgm
269;
270; GFX1064-LABEL: add_i32_uniform:
271; GFX1064:       ; %bb.0: ; %entry
272; GFX1064-NEXT:    s_clause 0x1
273; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
274; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
275; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
276; GFX1064-NEXT:    ; implicit-def: $vgpr1
277; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
278; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
279; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
280; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
281; GFX1064-NEXT:    s_cbranch_execz BB1_2
282; GFX1064-NEXT:  ; %bb.1:
283; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
284; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
285; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
287; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
288; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
289; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
291; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX1064-NEXT:    buffer_gl0_inv
293; GFX1064-NEXT:  BB1_2:
294; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
295; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
296; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
298; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
299; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
300; GFX1064-NEXT:    s_mov_b32 s6, -1
301; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
302; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
303; GFX1064-NEXT:    s_endpgm
304;
305; GFX1032-LABEL: add_i32_uniform:
306; GFX1032:       ; %bb.0: ; %entry
307; GFX1032-NEXT:    s_clause 0x1
308; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
309; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
310; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
311; GFX1032-NEXT:    ; implicit-def: $vgpr1
312; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
313; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
314; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
315; GFX1032-NEXT:    s_cbranch_execz BB1_2
316; GFX1032-NEXT:  ; %bb.1:
317; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
318; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
319; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
321; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
322; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
323; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
324; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
325; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX1032-NEXT:    buffer_gl0_inv
327; GFX1032-NEXT:  BB1_2:
328; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
329; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
330; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
332; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
333; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
334; GFX1032-NEXT:    s_mov_b32 s6, -1
335; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
336; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
337; GFX1032-NEXT:    s_endpgm
; IR under test: one atomic add whose addend is the kernel argument and whose
; result is stored, so the returned old value is live.
338entry:
339  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
340  store i32 %old, i32 addrspace(1)* %out
341  ret void
342}
343
; add_i32_varying: atomically adds the value returned by
; llvm.amdgcn.workitem.id.x to @local_var32 (an addrspace(3) global) with
; acq_rel ordering and stores the returned old value to %out.
;
; In the expected output below, the GFX7LESS run issues ds_add_rtn_u32 directly
; with v0 as the data operand and performs no cross-lane work. The other runs
; write 0 into the lanes selected by the inverted exec mask, combine lanes with
; a chain of *_dpp adds, read the total with v_readlane (lane 63, or lane 31 in
; the wavefrontsize32 run), and issue a single ds_add_rtn_u32 from the lane
; whose mbcnt result is 0. After the branch rejoins, the v_readfirstlane value
; is added to a one-lane-shifted copy of the scan (v_mov_b32_dpp with
; wave_shr:1 or row_shr:1) to form the value that is stored.
344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
345;
346;
347; GFX7LESS-LABEL: add_i32_varying:
348; GFX7LESS:       ; %bb.0: ; %entry
349; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
350; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
351; GFX7LESS-NEXT:    s_mov_b32 m0, -1
352; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
354; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
356; GFX7LESS-NEXT:    s_mov_b32 s2, -1
357; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
358; GFX7LESS-NEXT:    s_endpgm
359;
360; GFX8-LABEL: add_i32_varying:
361; GFX8:       ; %bb.0: ; %entry
362; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
363; GFX8-NEXT:    v_mov_b32_e32 v2, v0
364; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
365; GFX8-NEXT:    v_mov_b32_e32 v1, 0
366; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
367; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
368; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
369; GFX8-NEXT:    s_not_b64 exec, exec
370; GFX8-NEXT:    v_mov_b32_e32 v2, 0
371; GFX8-NEXT:    s_not_b64 exec, exec
372; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
373; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
374; GFX8-NEXT:    s_nop 1
375; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
376; GFX8-NEXT:    s_nop 1
377; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
378; GFX8-NEXT:    s_nop 1
379; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
380; GFX8-NEXT:    s_nop 1
381; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
382; GFX8-NEXT:    s_nop 1
383; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
384; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
385; GFX8-NEXT:    s_nop 0
386; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
387; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
388; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
389; GFX8-NEXT:    ; implicit-def: $vgpr0
390; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
391; GFX8-NEXT:    s_cbranch_execz BB2_2
392; GFX8-NEXT:  ; %bb.1:
393; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
394; GFX8-NEXT:    v_mov_b32_e32 v3, s4
395; GFX8-NEXT:    s_mov_b32 m0, -1
396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:  BB2_2:
400; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
403; GFX8-NEXT:    v_mov_b32_e32 v0, v1
404; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
405; GFX8-NEXT:    s_mov_b32 s3, 0xf000
406; GFX8-NEXT:    s_mov_b32 s2, -1
407; GFX8-NEXT:    s_nop 0
408; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
409; GFX8-NEXT:    s_endpgm
410;
411; GFX9-LABEL: add_i32_varying:
412; GFX9:       ; %bb.0: ; %entry
413; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
414; GFX9-NEXT:    v_mov_b32_e32 v2, v0
415; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
416; GFX9-NEXT:    v_mov_b32_e32 v1, 0
417; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
418; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
419; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
420; GFX9-NEXT:    s_not_b64 exec, exec
421; GFX9-NEXT:    v_mov_b32_e32 v2, 0
422; GFX9-NEXT:    s_not_b64 exec, exec
423; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
424; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
425; GFX9-NEXT:    s_nop 1
426; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
427; GFX9-NEXT:    s_nop 1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
435; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
436; GFX9-NEXT:    s_nop 0
437; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
438; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
439; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
440; GFX9-NEXT:    ; implicit-def: $vgpr0
441; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
442; GFX9-NEXT:    s_cbranch_execz BB2_2
443; GFX9-NEXT:  ; %bb.1:
444; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
445; GFX9-NEXT:    v_mov_b32_e32 v3, s4
446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:  BB2_2:
450; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
453; GFX9-NEXT:    v_mov_b32_e32 v0, v1
454; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
455; GFX9-NEXT:    s_mov_b32 s3, 0xf000
456; GFX9-NEXT:    s_mov_b32 s2, -1
457; GFX9-NEXT:    s_nop 0
458; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
459; GFX9-NEXT:    s_endpgm
460;
461; GFX1064-LABEL: add_i32_varying:
462; GFX1064:       ; %bb.0: ; %entry
463; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
464; GFX1064-NEXT:    s_not_b64 exec, exec
465; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
466; GFX1064-NEXT:    s_not_b64 exec, exec
467; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
468; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
469; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
470; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
472; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
473; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
474; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
475; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
476; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
477; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
478; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
479; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
480; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
481; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
482; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
483; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
484; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
485; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
486; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
487; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
488; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
489; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
490; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
491; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
492; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
493; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
494; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
495; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
496; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
497; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
498; GFX1064-NEXT:    s_mov_b32 s2, -1
499; GFX1064-NEXT:    ; implicit-def: $vgpr0
500; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
501; GFX1064-NEXT:    s_cbranch_execz BB2_2
502; GFX1064-NEXT:  ; %bb.1:
503; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
504; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
505; GFX1064-NEXT:    s_mov_b32 s3, s7
506; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
507; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
508; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
509; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX1064-NEXT:    buffer_gl0_inv
511; GFX1064-NEXT:  BB2_2:
512; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
513; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
514; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
515; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
516; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
517; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
518; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
520; GFX1064-NEXT:    s_endpgm
521;
522; GFX1032-LABEL: add_i32_varying:
523; GFX1032:       ; %bb.0: ; %entry
524; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
525; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
526; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
527; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
528; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
529; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
530; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
531; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
532; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
533; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
534; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
535; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
536; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
537; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
538; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
539; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
540; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
541; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
542; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
543; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
544; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
545; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
546; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
547; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
548; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
549; GFX1032-NEXT:    s_mov_b32 s2, -1
550; GFX1032-NEXT:    ; implicit-def: $vgpr0
551; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
552; GFX1032-NEXT:    s_cbranch_execz BB2_2
553; GFX1032-NEXT:  ; %bb.1:
554; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
555; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
556; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
557; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
558; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
559; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX1032-NEXT:    buffer_gl0_inv
561; GFX1032-NEXT:  BB2_2:
562; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
563; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
564; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
565; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
566; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
567; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
568; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
570; GFX1032-NEXT:    s_endpgm
; IR under test: the addend comes from the work-item id intrinsic, and the
; atomic's result is stored, so the returned old value is live.
571entry:
572  %lane = call i32 @llvm.amdgcn.workitem.id.x()
573  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
574  store i32 %old, i32 addrspace(1)* %out
575  ret void
576}
577
; add_i32_varying_nouse: atomically adds the value returned by
; llvm.amdgcn.workitem.id.x to @local_var32 (an addrspace(3) global) with
; acq_rel ordering. The kernel takes no arguments and the atomic's result
; (%old) is never used.
;
; In the expected output below, every run uses ds_add_u32 rather than
; ds_add_rtn_u32, and there is no buffer_store_dword. The GFX7LESS run passes
; v0 straight to the ds_add_u32. The other runs write 0 into the lanes selected
; by the inverted exec mask, combine lanes with a chain of *_dpp adds, read the
; total with v_readlane (lane 63, or lane 31 in the wavefrontsize32 run), and
; issue a single ds_add_u32 from the lane whose mbcnt result is 0.
578define amdgpu_kernel void @add_i32_varying_nouse() {
579; GFX7LESS-LABEL: add_i32_varying_nouse:
580; GFX7LESS:       ; %bb.0: ; %entry
581; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
582; GFX7LESS-NEXT:    s_mov_b32 m0, -1
583; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX7LESS-NEXT:    ds_add_u32 v1, v0
585; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX7LESS-NEXT:    s_endpgm
587;
588; GFX8-LABEL: add_i32_varying_nouse:
589; GFX8:       ; %bb.0: ; %entry
590; GFX8-NEXT:    v_mov_b32_e32 v1, v0
591; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
592; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
593; GFX8-NEXT:    s_not_b64 exec, exec
594; GFX8-NEXT:    v_mov_b32_e32 v1, 0
595; GFX8-NEXT:    s_not_b64 exec, exec
596; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
597; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
598; GFX8-NEXT:    s_nop 1
599; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
600; GFX8-NEXT:    s_nop 1
601; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
602; GFX8-NEXT:    s_nop 1
603; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
604; GFX8-NEXT:    s_nop 1
605; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
606; GFX8-NEXT:    s_nop 1
607; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
608; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
609; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
610; GFX8-NEXT:    s_mov_b32 s0, s2
611; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
612; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
613; GFX8-NEXT:    s_cbranch_execz BB3_2
614; GFX8-NEXT:  ; %bb.1:
615; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
616; GFX8-NEXT:    v_mov_b32_e32 v2, s0
617; GFX8-NEXT:    s_mov_b32 m0, -1
618; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX8-NEXT:    ds_add_u32 v0, v2
620; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX8-NEXT:  BB3_2:
622; GFX8-NEXT:    s_endpgm
623;
624; GFX9-LABEL: add_i32_varying_nouse:
625; GFX9:       ; %bb.0: ; %entry
626; GFX9-NEXT:    v_mov_b32_e32 v1, v0
627; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
628; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
629; GFX9-NEXT:    s_not_b64 exec, exec
630; GFX9-NEXT:    v_mov_b32_e32 v1, 0
631; GFX9-NEXT:    s_not_b64 exec, exec
632; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
633; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
634; GFX9-NEXT:    s_nop 1
635; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX9-NEXT:    s_nop 1
637; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
638; GFX9-NEXT:    s_nop 1
639; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
640; GFX9-NEXT:    s_nop 1
641; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
642; GFX9-NEXT:    s_nop 1
643; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
644; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
645; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
646; GFX9-NEXT:    s_mov_b32 s0, s2
647; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
648; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
649; GFX9-NEXT:    s_cbranch_execz BB3_2
650; GFX9-NEXT:  ; %bb.1:
651; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
652; GFX9-NEXT:    v_mov_b32_e32 v2, s0
653; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX9-NEXT:    ds_add_u32 v0, v2
655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-NEXT:  BB3_2:
657; GFX9-NEXT:    s_endpgm
658;
659; GFX1064-LABEL: add_i32_varying_nouse:
660; GFX1064:       ; %bb.0: ; %entry
661; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
662; GFX1064-NEXT:    s_not_b64 exec, exec
663; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
664; GFX1064-NEXT:    s_not_b64 exec, exec
665; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
666; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
671; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
672; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
673; GFX1064-NEXT:    v_readlane_b32 s2, v1, 31
674; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
675; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
676; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
677; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
678; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
679; GFX1064-NEXT:    v_readlane_b32 s2, v1, 63
680; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
681; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
682; GFX1064-NEXT:    s_mov_b32 s0, s2
683; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
684; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
685; GFX1064-NEXT:    s_cbranch_execz BB3_2
686; GFX1064-NEXT:  ; %bb.1:
687; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
688; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
689; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
690; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
691; GFX1064-NEXT:    ds_add_u32 v0, v3
692; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX1064-NEXT:    buffer_gl0_inv
694; GFX1064-NEXT:  BB3_2:
695; GFX1064-NEXT:    s_endpgm
696;
697; GFX1032-LABEL: add_i32_varying_nouse:
698; GFX1032:       ; %bb.0: ; %entry
699; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
700; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
701; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
702; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
703; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
704; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
705; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
707; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
708; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
709; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
710; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
711; GFX1032-NEXT:    v_readlane_b32 s1, v1, 31
712; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
713; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
714; GFX1032-NEXT:    s_mov_b32 s0, s1
715; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
716; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
717; GFX1032-NEXT:    s_cbranch_execz BB3_2
718; GFX1032-NEXT:  ; %bb.1:
719; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
720; GFX1032-NEXT:    v_mov_b32_e32 v3, s0
721; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
722; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
723; GFX1032-NEXT:    ds_add_u32 v0, v3
724; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
725; GFX1032-NEXT:    buffer_gl0_inv
726; GFX1032-NEXT:  BB3_2:
727; GFX1032-NEXT:    s_endpgm
; IR under test: %old is deliberately left unused so that the non-returning
; form of the atomic is exercised.
728entry:
729  %lane = call i32 @llvm.amdgcn.workitem.id.x()
730  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
731  ret void
732}
733
734define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; 64-bit atomic add of the constant 5 to the addrspace(3) global @local_var64;
; the value returned by the atomic is stored to %out.
;
; Shape of the expected output below, common to every RUN configuration:
;   * v_mbcnt_* is computed from the saved exec mask and compared against 0, so
;     only the lane with a zero count enters the block holding the atomic.
;   * That lane issues one ds_add_rtn_u64 whose operand is the s_bcnt1 count of
;     the saved exec mask multiplied by 5.
;   * After the join, the atomic result is read with v_readfirstlane and each
;     lane adds 5 times its own v_mbcnt value before the buffer store.
735;
736;
737; GFX7LESS-LABEL: add_i64_constant:
738; GFX7LESS:       ; %bb.0: ; %entry
739; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
740; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
741; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
742; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
743; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
744; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
745; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
746; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
747; GFX7LESS-NEXT:  ; %bb.1:
748; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
749; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
750; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
751; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
752; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
753; GFX7LESS-NEXT:    s_mov_b32 m0, -1
754; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
756; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX7LESS-NEXT:  BB4_2:
758; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
759; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
761; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
762; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
763; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
764; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
765; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
766; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
767; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
768; GFX7LESS-NEXT:    s_mov_b32 s2, -1
769; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
770; GFX7LESS-NEXT:    s_endpgm
771;
772; GFX8-LABEL: add_i64_constant:
773; GFX8:       ; %bb.0: ; %entry
774; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
775; GFX8-NEXT:    s_mov_b64 s[4:5], exec
776; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
777; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
778; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
779; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
780; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
781; GFX8-NEXT:    s_cbranch_execz BB4_2
782; GFX8-NEXT:  ; %bb.1:
783; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
784; GFX8-NEXT:    s_mul_i32 s4, s4, 5
785; GFX8-NEXT:    v_mov_b32_e32 v1, s4
786; GFX8-NEXT:    v_mov_b32_e32 v2, 0
787; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
788; GFX8-NEXT:    s_mov_b32 m0, -1
789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
791; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX8-NEXT:  BB4_2:
793; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
794; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
795; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
796; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
797; GFX8-NEXT:    v_mov_b32_e32 v1, s2
798; GFX8-NEXT:    v_mov_b32_e32 v2, s3
799; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
800; GFX8-NEXT:    s_mov_b32 s3, 0xf000
801; GFX8-NEXT:    s_mov_b32 s2, -1
802; GFX8-NEXT:    s_nop 2
803; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
804; GFX8-NEXT:    s_endpgm
805;
806; GFX9-LABEL: add_i64_constant:
807; GFX9:       ; %bb.0: ; %entry
808; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
809; GFX9-NEXT:    s_mov_b64 s[4:5], exec
810; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
811; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
812; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
813; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
814; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
815; GFX9-NEXT:    s_cbranch_execz BB4_2
816; GFX9-NEXT:  ; %bb.1:
817; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
818; GFX9-NEXT:    s_mul_i32 s4, s4, 5
819; GFX9-NEXT:    v_mov_b32_e32 v1, s4
820; GFX9-NEXT:    v_mov_b32_e32 v2, 0
821; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
822; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
825; GFX9-NEXT:  BB4_2:
826; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
827; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
829; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
830; GFX9-NEXT:    v_mov_b32_e32 v1, s2
831; GFX9-NEXT:    v_mov_b32_e32 v2, s3
832; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
833; GFX9-NEXT:    s_mov_b32 s3, 0xf000
834; GFX9-NEXT:    s_mov_b32 s2, -1
835; GFX9-NEXT:    s_nop 2
836; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
837; GFX9-NEXT:    s_endpgm
838;
839; GFX1064-LABEL: add_i64_constant:
840; GFX1064:       ; %bb.0: ; %entry
841; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
842; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
843; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
844; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
845; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
846; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
847; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
848; GFX1064-NEXT:    s_cbranch_execz BB4_2
849; GFX1064-NEXT:  ; %bb.1:
850; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
851; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
852; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
853; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
854; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
855; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
856; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
857; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
858; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX1064-NEXT:    buffer_gl0_inv
860; GFX1064-NEXT:  BB4_2:
861; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
862; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
863; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
864; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
865; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
866; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
867; GFX1064-NEXT:    s_mov_b32 s2, -1
868; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
870; GFX1064-NEXT:    s_endpgm
871;
872; GFX1032-LABEL: add_i64_constant:
873; GFX1032:       ; %bb.0: ; %entry
874; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
875; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
876; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
877; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
878; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
879; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
880; GFX1032-NEXT:    s_cbranch_execz BB4_2
881; GFX1032-NEXT:  ; %bb.1:
882; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
883; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
884; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
885; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
886; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
887; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
888; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
889; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
890; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
891; GFX1032-NEXT:    buffer_gl0_inv
892; GFX1032-NEXT:  BB4_2:
893; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
894; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
895; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
896; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
897; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
898; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
899; GFX1032-NEXT:    s_mov_b32 s2, -1
900; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
902; GFX1032-NEXT:    s_endpgm
903entry:
  ; acq_rel add of the immediate 5; %old is the value the atomic returns.
904  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Using %old keeps the returning (ds_add_rtn_u64) form in the output above.
905  store i64 %old, i64 addrspace(1)* %out
906  ret void
907}
908
909define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
; 64-bit atomic add of the kernel argument %additive to the addrspace(3) global
; @local_var64; the value returned by the atomic is stored to %out.
;
; Shape of the expected output below, common to every RUN configuration:
;   * Both arguments arrive through one s_load_dwordx4; %additive ends up in
;     s[2:3].
;   * Only the lane whose v_mbcnt result is 0 enters the block with the atomic.
;   * That lane issues one ds_add_rtn_u64 whose operand is %additive times the
;     s_bcnt1 count of the saved exec mask. The high half of the product uses
;     v_mul_hi_u32 in the first two configurations and s_mul_hi_u32 in the
;     remaining three.
;   * After the join, each lane multiplies %additive by its own v_mbcnt value
;     and adds the v_readfirstlane copy of the atomic result before the store.
910;
911;
912; GFX7LESS-LABEL: add_i64_uniform:
913; GFX7LESS:       ; %bb.0: ; %entry
914; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
915; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
916; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
917; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
918; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
919; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
920; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
921; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
922; GFX7LESS-NEXT:  ; %bb.1:
923; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
924; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
925; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
927; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
928; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
929; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
930; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
931; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
932; GFX7LESS-NEXT:    s_mov_b32 m0, -1
933; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
935; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX7LESS-NEXT:  BB5_2:
937; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
938; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
939; GFX7LESS-NEXT:    s_mov_b32 s6, -1
940; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX7LESS-NEXT:    s_mov_b32 s4, s0
942; GFX7LESS-NEXT:    s_mov_b32 s5, s1
943; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
944; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
945; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
946; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
947; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
948; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
949; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
950; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
951; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
952; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
953; GFX7LESS-NEXT:    s_endpgm
954;
955; GFX8-LABEL: add_i64_uniform:
956; GFX8:       ; %bb.0: ; %entry
957; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
958; GFX8-NEXT:    s_mov_b64 s[6:7], exec
959; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
960; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
961; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
962; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
963; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
964; GFX8-NEXT:    s_cbranch_execz BB5_2
965; GFX8-NEXT:  ; %bb.1:
966; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
967; GFX8-NEXT:    v_mov_b32_e32 v1, s6
968; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
969; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
970; GFX8-NEXT:    s_mul_i32 s7, s3, s6
971; GFX8-NEXT:    s_mul_i32 s6, s2, s6
972; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
973; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
974; GFX8-NEXT:    v_mov_b32_e32 v1, s6
975; GFX8-NEXT:    s_mov_b32 m0, -1
976; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
978; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX8-NEXT:  BB5_2:
980; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
981; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX8-NEXT:    s_mov_b32 s4, s0
983; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
984; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
985; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
986; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
987; GFX8-NEXT:    s_mov_b32 s5, s1
988; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
989; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
990; GFX8-NEXT:    v_mov_b32_e32 v2, s1
991; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
992; GFX8-NEXT:    s_mov_b32 s7, 0xf000
993; GFX8-NEXT:    s_mov_b32 s6, -1
994; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
995; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
996; GFX8-NEXT:    s_endpgm
997;
998; GFX9-LABEL: add_i64_uniform:
999; GFX9:       ; %bb.0: ; %entry
1000; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1001; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1002; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1003; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1004; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1005; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1006; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1007; GFX9-NEXT:    s_cbranch_execz BB5_2
1008; GFX9-NEXT:  ; %bb.1:
1009; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1010; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1012; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1013; GFX9-NEXT:    s_add_i32 s8, s8, s7
1014; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1015; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1016; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1017; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1018; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1020; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1021; GFX9-NEXT:  BB5_2:
1022; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1023; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1025; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1026; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1027; GFX9-NEXT:    s_mov_b32 s4, s0
1028; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1029; GFX9-NEXT:    s_mov_b32 s5, s1
1030; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1031; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1032; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1033; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1034; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1035; GFX9-NEXT:    s_mov_b32 s6, -1
1036; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1037; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1038; GFX9-NEXT:    s_endpgm
1039;
1040; GFX1064-LABEL: add_i64_uniform:
1041; GFX1064:       ; %bb.0: ; %entry
1042; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1043; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1044; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1045; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1046; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1047; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1048; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1049; GFX1064-NEXT:    s_cbranch_execz BB5_2
1050; GFX1064-NEXT:  ; %bb.1:
1051; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1052; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1053; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1055; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1056; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1057; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1058; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1059; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1060; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1061; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1062; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1063; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX1064-NEXT:    buffer_gl0_inv
1065; GFX1064-NEXT:  BB5_2:
1066; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1067; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1068; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1070; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1071; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1072; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1073; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1074; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1075; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1076; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
1077; GFX1064-NEXT:    s_mov_b32 s2, -1
1078; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1079; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1080; GFX1064-NEXT:    s_endpgm
1081;
1082; GFX1032-LABEL: add_i64_uniform:
1083; GFX1032:       ; %bb.0: ; %entry
1084; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1085; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1086; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1087; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1088; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1089; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1090; GFX1032-NEXT:    s_cbranch_execz BB5_2
1091; GFX1032-NEXT:  ; %bb.1:
1092; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1093; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1094; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1095; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1096; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1097; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1098; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1099; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1100; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1101; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1102; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1103; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1104; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1105; GFX1032-NEXT:    buffer_gl0_inv
1106; GFX1032-NEXT:  BB5_2:
1107; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1108; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1109; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1111; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1112; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1113; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1114; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1115; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1116; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1117; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
1118; GFX1032-NEXT:    s_mov_b32 s2, -1
1119; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1120; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1121; GFX1032-NEXT:    s_endpgm
1122entry:
  ; acq_rel add of the kernel argument; %old is the value the atomic returns.
1123  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
  ; Using %old keeps the returning (ds_add_rtn_u64) form in the output above.
1124  store i64 %old, i64 addrspace(1)* %out
1125  ret void
1126}
1127
1128define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; 64-bit atomic add to the addrspace(3) global @local_var64 whose operand is the
; zero-extended llvm.amdgcn.workitem.id.x value, so the operand is not a single
; constant or kernel argument as in the two preceding i64 tests. The value
; returned by the atomic is stored to %out.
;
; The expected output below is the same shape for every RUN configuration: the
; high operand half (v1) is set to 0 and a ds_add_rtn_u64 is issued directly,
; followed by the buffer store. No v_mbcnt lane selection, no s_cbranch_execz
; and no v_readfirstlane appear, i.e. the recorded output shows no
; single-lane rewrite for this case.
1129;
1130;
1131; GFX7LESS-LABEL: add_i64_varying:
1132; GFX7LESS:       ; %bb.0: ; %entry
1133; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1134; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1135; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1136; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1137; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1139; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1141; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1142; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1143; GFX7LESS-NEXT:    s_endpgm
1144;
1145; GFX8-LABEL: add_i64_varying:
1146; GFX8:       ; %bb.0: ; %entry
1147; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1148; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1149; GFX8-NEXT:    s_mov_b32 m0, -1
1150; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1151; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1152; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1153; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1155; GFX8-NEXT:    s_mov_b32 s2, -1
1156; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1157; GFX8-NEXT:    s_endpgm
1158;
1159; GFX9-LABEL: add_i64_varying:
1160; GFX9:       ; %bb.0: ; %entry
1161; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1162; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1163; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1168; GFX9-NEXT:    s_mov_b32 s2, -1
1169; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1170; GFX9-NEXT:    s_endpgm
1171;
1172; GFX1064-LABEL: add_i64_varying:
1173; GFX1064:       ; %bb.0: ; %entry
1174; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1175; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1176; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1177; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1178; GFX1064-NEXT:    s_mov_b32 s2, -1
1179; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1180; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1181; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1182; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1183; GFX1064-NEXT:    buffer_gl0_inv
1184; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1185; GFX1064-NEXT:    s_endpgm
1186;
1187; GFX1032-LABEL: add_i64_varying:
1188; GFX1032:       ; %bb.0: ; %entry
1189; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1190; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1191; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1192; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1193; GFX1032-NEXT:    s_mov_b32 s2, -1
1194; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1195; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1196; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1197; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1198; GFX1032-NEXT:    buffer_gl0_inv
1199; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1200; GFX1032-NEXT:    s_endpgm
1201entry:
  ; Operand source: the workitem id intrinsic declared at the top of the file.
1202  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Widen to i64 so the atomic below is a 64-bit operation.
1203  %zext = zext i32 %lane to i64
  ; acq_rel add of the widened id; %old is the value the atomic returns.
1204  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1205  store i64 %old, i64 addrspace(1)* %out
1206  ret void
1207}
1208
1209define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; 32-bit atomic sub of the constant 5 from the addrspace(3) global
; @local_var32; the value returned by the atomic is stored to %out.
;
; Shape of the expected output below, common to every RUN configuration:
;   * Only the lane whose v_mbcnt result is 0 enters the block with the atomic.
;   * That lane issues one ds_sub_rtn_u32 whose operand is the s_bcnt1 count of
;     the saved exec mask multiplied by 5.
;   * After the join, the atomic result is read with v_readfirstlane and each
;     lane subtracts 5 times its own v_mbcnt value before the buffer store.
1210;
1211;
1212; GFX7LESS-LABEL: sub_i32_constant:
1213; GFX7LESS:       ; %bb.0: ; %entry
1214; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1215; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1216; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1217; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1218; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1219; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1220; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1221; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
1222; GFX7LESS-NEXT:  ; %bb.1:
1223; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1224; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1225; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1226; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1227; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1228; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1230; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX7LESS-NEXT:  BB7_2:
1232; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1233; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1234; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1235; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1236; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1237; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1238; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1239; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1240; GFX7LESS-NEXT:    s_endpgm
1241;
1242; GFX8-LABEL: sub_i32_constant:
1243; GFX8:       ; %bb.0: ; %entry
1244; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1245; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1246; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1247; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1248; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1249; GFX8-NEXT:    ; implicit-def: $vgpr1
1250; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1251; GFX8-NEXT:    s_cbranch_execz BB7_2
1252; GFX8-NEXT:  ; %bb.1:
1253; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1254; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1255; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1256; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1257; GFX8-NEXT:    s_mov_b32 m0, -1
1258; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1260; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX8-NEXT:  BB7_2:
1262; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1263; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1264; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1265; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1266; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1267; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1268; GFX8-NEXT:    s_mov_b32 s2, -1
1269; GFX8-NEXT:    s_nop 0
1270; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1271; GFX8-NEXT:    s_endpgm
1272;
1273; GFX9-LABEL: sub_i32_constant:
1274; GFX9:       ; %bb.0: ; %entry
1275; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1276; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1277; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1278; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1279; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1280; GFX9-NEXT:    ; implicit-def: $vgpr1
1281; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1282; GFX9-NEXT:    s_cbranch_execz BB7_2
1283; GFX9-NEXT:  ; %bb.1:
1284; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1285; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1286; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1287; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1288; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1290; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX9-NEXT:  BB7_2:
1292; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1293; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1295; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1296; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1297; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1298; GFX9-NEXT:    s_mov_b32 s2, -1
1299; GFX9-NEXT:    s_nop 0
1300; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1301; GFX9-NEXT:    s_endpgm
1302;
1303; GFX1064-LABEL: sub_i32_constant:
1304; GFX1064:       ; %bb.0: ; %entry
1305; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1306; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1307; GFX1064-NEXT:    ; implicit-def: $vgpr1
1308; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1309; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1310; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1311; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1312; GFX1064-NEXT:    s_cbranch_execz BB7_2
1313; GFX1064-NEXT:  ; %bb.1:
1314; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1315; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1316; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1317; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1318; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1319; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1320; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1321; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX1064-NEXT:    buffer_gl0_inv
1323; GFX1064-NEXT:  BB7_2:
1324; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1325; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1326; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1327; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1328; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1329; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1330; GFX1064-NEXT:    s_mov_b32 s2, -1
1331; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1332; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1333; GFX1064-NEXT:    s_endpgm
1334;
1335; GFX1032-LABEL: sub_i32_constant:
1336; GFX1032:       ; %bb.0: ; %entry
1337; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1338; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1339; GFX1032-NEXT:    ; implicit-def: $vgpr1
1340; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1341; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1342; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1343; GFX1032-NEXT:    s_cbranch_execz BB7_2
1344; GFX1032-NEXT:  ; %bb.1:
1345; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1346; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1347; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1348; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1349; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1350; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1351; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1352; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX1032-NEXT:    buffer_gl0_inv
1354; GFX1032-NEXT:  BB7_2:
1355; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1356; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1357; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1358; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1359; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1360; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1361; GFX1032-NEXT:    s_mov_b32 s2, -1
1362; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1363; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1364; GFX1032-NEXT:    s_endpgm
1365entry:
  ; acq_rel sub of the immediate 5; %old is the value the atomic returns.
1366  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Using %old keeps the returning (ds_sub_rtn_u32) form in the output above.
1367  store i32 %old, i32 addrspace(1)* %out
1368  ret void
1369}
1370
1371define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1372;
1373;
1374; GFX7LESS-LABEL: sub_i32_uniform:
1375; GFX7LESS:       ; %bb.0: ; %entry
1376; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1377; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1378; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
1379; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1380; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1381; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1382; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1383; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1384; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1385; GFX7LESS-NEXT:  ; %bb.1:
1386; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1387; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1389; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1390; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1391; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1392; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1393; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1394; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX7LESS-NEXT:  BB8_2:
1396; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1397; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1398; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1399; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1400; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1401; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1402; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1403; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1404; GFX7LESS-NEXT:    s_endpgm
1405;
1406; GFX8-LABEL: sub_i32_uniform:
1407; GFX8:       ; %bb.0: ; %entry
1408; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1409; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1410; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1411; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1412; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1413; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1414; GFX8-NEXT:    ; implicit-def: $vgpr1
1415; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1416; GFX8-NEXT:    s_cbranch_execz BB8_2
1417; GFX8-NEXT:  ; %bb.1:
1418; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1419; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1420; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1421; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1422; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1423; GFX8-NEXT:    s_mov_b32 m0, -1
1424; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1425; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1426; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX8-NEXT:  BB8_2:
1428; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1429; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1430; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1431; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1432; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1433; GFX8-NEXT:    s_mov_b32 s6, -1
1434; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1435; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1436; GFX8-NEXT:    s_endpgm
1437;
1438; GFX9-LABEL: sub_i32_uniform:
1439; GFX9:       ; %bb.0: ; %entry
1440; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1441; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1442; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1443; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1444; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1445; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1446; GFX9-NEXT:    ; implicit-def: $vgpr1
1447; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1448; GFX9-NEXT:    s_cbranch_execz BB8_2
1449; GFX9-NEXT:  ; %bb.1:
1450; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1452; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1453; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1454; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1455; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1456; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1457; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX9-NEXT:  BB8_2:
1459; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1460; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1461; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1462; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1463; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1464; GFX9-NEXT:    s_mov_b32 s6, -1
1465; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1466; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1467; GFX9-NEXT:    s_endpgm
1468;
1469; GFX1064-LABEL: sub_i32_uniform:
1470; GFX1064:       ; %bb.0: ; %entry
1471; GFX1064-NEXT:    s_clause 0x1
1472; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1473; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
1474; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1475; GFX1064-NEXT:    ; implicit-def: $vgpr1
1476; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1477; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1478; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1479; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1480; GFX1064-NEXT:    s_cbranch_execz BB8_2
1481; GFX1064-NEXT:  ; %bb.1:
1482; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1483; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1484; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1485; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
1486; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
1487; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1488; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1489; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1490; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX1064-NEXT:    buffer_gl0_inv
1492; GFX1064-NEXT:  BB8_2:
1493; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1494; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1495; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1496; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1497; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1498; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1499; GFX1064-NEXT:    s_mov_b32 s6, -1
1500; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1501; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1502; GFX1064-NEXT:    s_endpgm
1503;
1504; GFX1032-LABEL: sub_i32_uniform:
1505; GFX1032:       ; %bb.0: ; %entry
1506; GFX1032-NEXT:    s_clause 0x1
1507; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1508; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1509; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1510; GFX1032-NEXT:    ; implicit-def: $vgpr1
1511; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1512; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1513; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1514; GFX1032-NEXT:    s_cbranch_execz BB8_2
1515; GFX1032-NEXT:  ; %bb.1:
1516; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1517; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1518; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1520; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1521; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1522; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1523; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1524; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX1032-NEXT:    buffer_gl0_inv
1526; GFX1032-NEXT:  BB8_2:
1527; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1528; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1529; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1530; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1531; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1532; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1533; GFX1032-NEXT:    s_mov_b32 s6, -1
1534; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1535; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1536; GFX1032-NEXT:    s_endpgm
1537entry:
1538  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
1539  store i32 %old, i32 addrspace(1)* %out
1540  ret void
1541}
1542
1543define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1544; Atomic sub on the local (addrspace(3)) variable @local_var32 whose operand is the workitem id; the value returned by the atomic is stored to %out.
1545; Per the checks below, the GFX8 and later runs sum the operand across lanes with DPP adds, let only the lanes whose mbcnt result is 0 issue one ds_sub_rtn_u32, and then subtract each lane's shifted scan value from the first-lane result; the GFX7LESS run issues ds_sub_rtn_u32 directly with v0.
1546; GFX7LESS-LABEL: sub_i32_varying:
1547; GFX7LESS:       ; %bb.0: ; %entry
1548; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1549; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1550; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1551; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1553; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1555; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1556; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1557; GFX7LESS-NEXT:    s_endpgm
1558;
1559; GFX8-LABEL: sub_i32_varying:
1560; GFX8:       ; %bb.0: ; %entry
1561; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1562; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1563; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1564; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1565; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1566; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1567; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1568; GFX8-NEXT:    s_not_b64 exec, exec
1569; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1570; GFX8-NEXT:    s_not_b64 exec, exec
1571; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1572; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1573; GFX8-NEXT:    s_nop 1
1574; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1575; GFX8-NEXT:    s_nop 1
1576; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1577; GFX8-NEXT:    s_nop 1
1578; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1579; GFX8-NEXT:    s_nop 1
1580; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1581; GFX8-NEXT:    s_nop 1
1582; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1583; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1584; GFX8-NEXT:    s_nop 0
1585; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1586; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1587; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1588; GFX8-NEXT:    ; implicit-def: $vgpr0
1589; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1590; GFX8-NEXT:    s_cbranch_execz BB9_2
1591; GFX8-NEXT:  ; %bb.1:
1592; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1593; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1594; GFX8-NEXT:    s_mov_b32 m0, -1
1595; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1596; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1597; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX8-NEXT:  BB9_2:
1599; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1602; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1603; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1604; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1605; GFX8-NEXT:    s_mov_b32 s2, -1
1606; GFX8-NEXT:    s_nop 0
1607; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1608; GFX8-NEXT:    s_endpgm
1609;
1610; GFX9-LABEL: sub_i32_varying:
1611; GFX9:       ; %bb.0: ; %entry
1612; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1613; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1614; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1615; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1616; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1617; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1618; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1619; GFX9-NEXT:    s_not_b64 exec, exec
1620; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1621; GFX9-NEXT:    s_not_b64 exec, exec
1622; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1623; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1624; GFX9-NEXT:    s_nop 1
1625; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1626; GFX9-NEXT:    s_nop 1
1627; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1628; GFX9-NEXT:    s_nop 1
1629; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1630; GFX9-NEXT:    s_nop 1
1631; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1632; GFX9-NEXT:    s_nop 1
1633; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1634; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1635; GFX9-NEXT:    s_nop 0
1636; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1637; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1638; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1639; GFX9-NEXT:    ; implicit-def: $vgpr0
1640; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1641; GFX9-NEXT:    s_cbranch_execz BB9_2
1642; GFX9-NEXT:  ; %bb.1:
1643; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1644; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1645; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX9-NEXT:  BB9_2:
1649; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1650; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1652; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1653; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1654; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1655; GFX9-NEXT:    s_mov_b32 s2, -1
1656; GFX9-NEXT:    s_nop 0
1657; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1658; GFX9-NEXT:    s_endpgm
1659;
1660; GFX1064-LABEL: sub_i32_varying:
1661; GFX1064:       ; %bb.0: ; %entry
1662; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1663; GFX1064-NEXT:    s_not_b64 exec, exec
1664; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1665; GFX1064-NEXT:    s_not_b64 exec, exec
1666; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1667; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1668; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1669; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1670; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1671; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1672; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1673; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1674; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1675; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1676; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1677; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1678; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
1679; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1680; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1681; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1682; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1683; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
1684; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
1685; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1686; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1687; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1688; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
1689; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
1690; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
1691; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1692; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
1693; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1694; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
1695; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1696; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1697; GFX1064-NEXT:    s_mov_b32 s2, -1
1698; GFX1064-NEXT:    ; implicit-def: $vgpr0
1699; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1700; GFX1064-NEXT:    s_cbranch_execz BB9_2
1701; GFX1064-NEXT:  ; %bb.1:
1702; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1703; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
1704; GFX1064-NEXT:    s_mov_b32 s3, s7
1705; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1706; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1707; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
1708; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1709; GFX1064-NEXT:    buffer_gl0_inv
1710; GFX1064-NEXT:  BB9_2:
1711; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1712; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1713; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
1714; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1715; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1716; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1717; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1718; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1719; GFX1064-NEXT:    s_endpgm
1720;
1721; GFX1032-LABEL: sub_i32_varying:
1722; GFX1032:       ; %bb.0: ; %entry
1723; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1724; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1725; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1726; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1727; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1728; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1729; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1730; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1731; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1732; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1733; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1734; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1735; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1736; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1737; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1738; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1739; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1740; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1741; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1742; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1743; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1744; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1745; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1746; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1747; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1748; GFX1032-NEXT:    s_mov_b32 s2, -1
1749; GFX1032-NEXT:    ; implicit-def: $vgpr0
1750; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1751; GFX1032-NEXT:    s_cbranch_execz BB9_2
1752; GFX1032-NEXT:  ; %bb.1:
1753; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1754; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1755; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1756; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1757; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
1758; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX1032-NEXT:    buffer_gl0_inv
1760; GFX1032-NEXT:  BB9_2:
1761; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1762; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1763; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1764; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1765; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
1766; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1767; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1769; GFX1032-NEXT:    s_endpgm
1770entry:
1771  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id, used below as the atomic operand
1772  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1773  store i32 %old, i32 addrspace(1)* %out
1774  ret void
1775}
1776
1777define amdgpu_kernel void @sub_i32_varying_nouse() {
; Same workitem-id atomic sub on @local_var32 as @sub_i32_varying, but %old has no users and nothing is stored.
; Per the checks below, every run uses the non-returning ds_sub_u32 form, and the GFX8 and later runs end right after the guarded atomic with no readfirstlane or per-lane subtract.
1778; GFX7LESS-LABEL: sub_i32_varying_nouse:
1779; GFX7LESS:       ; %bb.0: ; %entry
1780; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1781; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1782; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1783; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
1784; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1785; GFX7LESS-NEXT:    s_endpgm
1786;
1787; GFX8-LABEL: sub_i32_varying_nouse:
1788; GFX8:       ; %bb.0: ; %entry
1789; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1790; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1791; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1792; GFX8-NEXT:    s_not_b64 exec, exec
1793; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1794; GFX8-NEXT:    s_not_b64 exec, exec
1795; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
1796; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1797; GFX8-NEXT:    s_nop 1
1798; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1799; GFX8-NEXT:    s_nop 1
1800; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1801; GFX8-NEXT:    s_nop 1
1802; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1803; GFX8-NEXT:    s_nop 1
1804; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1805; GFX8-NEXT:    s_nop 1
1806; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1807; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
1808; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
1809; GFX8-NEXT:    s_mov_b32 s0, s2
1810; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1811; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1812; GFX8-NEXT:    s_cbranch_execz BB10_2
1813; GFX8-NEXT:  ; %bb.1:
1814; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1815; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1816; GFX8-NEXT:    s_mov_b32 m0, -1
1817; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX8-NEXT:    ds_sub_u32 v0, v2
1819; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX8-NEXT:  BB10_2:
1821; GFX8-NEXT:    s_endpgm
1822;
1823; GFX9-LABEL: sub_i32_varying_nouse:
1824; GFX9:       ; %bb.0: ; %entry
1825; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1826; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1827; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1828; GFX9-NEXT:    s_not_b64 exec, exec
1829; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1830; GFX9-NEXT:    s_not_b64 exec, exec
1831; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
1832; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1833; GFX9-NEXT:    s_nop 1
1834; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1835; GFX9-NEXT:    s_nop 1
1836; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1837; GFX9-NEXT:    s_nop 1
1838; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1839; GFX9-NEXT:    s_nop 1
1840; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1841; GFX9-NEXT:    s_nop 1
1842; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1843; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
1844; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
1845; GFX9-NEXT:    s_mov_b32 s0, s2
1846; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1847; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1848; GFX9-NEXT:    s_cbranch_execz BB10_2
1849; GFX9-NEXT:  ; %bb.1:
1850; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1851; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1852; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1853; GFX9-NEXT:    ds_sub_u32 v0, v2
1854; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX9-NEXT:  BB10_2:
1856; GFX9-NEXT:    s_endpgm
1857;
1858; GFX1064-LABEL: sub_i32_varying_nouse:
1859; GFX1064:       ; %bb.0: ; %entry
1860; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1861; GFX1064-NEXT:    s_not_b64 exec, exec
1862; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1863; GFX1064-NEXT:    s_not_b64 exec, exec
1864; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1865; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1866; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1867; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1868; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1869; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1870; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1871; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1872; GFX1064-NEXT:    v_readlane_b32 s2, v1, 31
1873; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1874; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1875; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1876; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1877; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1878; GFX1064-NEXT:    v_readlane_b32 s2, v1, 63
1879; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1880; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
1881; GFX1064-NEXT:    s_mov_b32 s0, s2
1882; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1883; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1884; GFX1064-NEXT:    s_cbranch_execz BB10_2
1885; GFX1064-NEXT:  ; %bb.1:
1886; GFX1064-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1887; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
1888; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1889; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1890; GFX1064-NEXT:    ds_sub_u32 v0, v3
1891; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX1064-NEXT:    buffer_gl0_inv
1893; GFX1064-NEXT:  BB10_2:
1894; GFX1064-NEXT:    s_endpgm
1895;
1896; GFX1032-LABEL: sub_i32_varying_nouse:
1897; GFX1032:       ; %bb.0: ; %entry
1898; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1899; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1900; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1901; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1902; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1903; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1904; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1905; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1906; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1907; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1908; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1909; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1910; GFX1032-NEXT:    v_readlane_b32 s1, v1, 31
1911; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1912; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1913; GFX1032-NEXT:    s_mov_b32 s0, s1
1914; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1915; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1916; GFX1032-NEXT:    s_cbranch_execz BB10_2
1917; GFX1032-NEXT:  ; %bb.1:
1918; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1919; GFX1032-NEXT:    v_mov_b32_e32 v3, s0
1920; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1921; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1922; GFX1032-NEXT:    ds_sub_u32 v0, v3
1923; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1924; GFX1032-NEXT:    buffer_gl0_inv
1925; GFX1032-NEXT:  BB10_2:
1926; GFX1032-NEXT:    s_endpgm
1927entry:
1928  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1929  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; %old is never read in this function
1930  ret void
1931}
1932
1933define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
1934; 64-bit atomic sub of the constant 5 on the local (addrspace(3)) variable @local_var64; the value returned by the atomic is stored to %out.
1935; Per the checks below, every run lets only the lanes whose mbcnt result is 0 issue one ds_sub_rtn_u64 of (bit count of the saved exec mask) * 5, and each lane then subtracts 5 * its mbcnt value, as a 64-bit subtract with carry, from the first-lane result.
1936; GFX7LESS-LABEL: sub_i64_constant:
1937; GFX7LESS:       ; %bb.0: ; %entry
1938; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1939; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1940; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1941; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1942; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1943; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1944; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1945; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
1946; GFX7LESS-NEXT:  ; %bb.1:
1947; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1948; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
1949; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1950; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1951; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
1952; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1953; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
1955; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX7LESS-NEXT:  BB11_2:
1957; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1958; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1959; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1960; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1961; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1962; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1963; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1964; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1965; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1966; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1967; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1968; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1969; GFX7LESS-NEXT:    s_endpgm
1970;
1971; GFX8-LABEL: sub_i64_constant:
1972; GFX8:       ; %bb.0: ; %entry
1973; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1974; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1975; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1976; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1977; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1978; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1979; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1980; GFX8-NEXT:    s_cbranch_execz BB11_2
1981; GFX8-NEXT:  ; %bb.1:
1982; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1983; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1984; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1985; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1986; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1987; GFX8-NEXT:    s_mov_b32 m0, -1
1988; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
1990; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX8-NEXT:  BB11_2:
1992; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1993; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1995; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1996; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1997; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1998; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1999; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2000; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2001; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2002; GFX8-NEXT:    s_mov_b32 s2, -1
2003; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2004; GFX8-NEXT:    s_endpgm
2005;
2006; GFX9-LABEL: sub_i64_constant:
2007; GFX9:       ; %bb.0: ; %entry
2008; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2009; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2010; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2011; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2012; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2013; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2014; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2015; GFX9-NEXT:    s_cbranch_execz BB11_2
2016; GFX9-NEXT:  ; %bb.1:
2017; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2018; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2019; GFX9-NEXT:    v_mov_b32_e32 v1, s4
2020; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2021; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2022; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2023; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2024; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX9-NEXT:  BB11_2:
2026; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2027; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2029; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2030; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2031; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2032; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2033; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2034; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2035; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2036; GFX9-NEXT:    s_mov_b32 s2, -1
2037; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2038; GFX9-NEXT:    s_endpgm
2039;
2040; GFX1064-LABEL: sub_i64_constant:
2041; GFX1064:       ; %bb.0: ; %entry
2042; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2043; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2044; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2045; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2046; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2047; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2048; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2049; GFX1064-NEXT:    s_cbranch_execz BB11_2
2050; GFX1064-NEXT:  ; %bb.1:
2051; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2052; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2053; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2054; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2055; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
2056; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2057; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2058; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2059; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2060; GFX1064-NEXT:    buffer_gl0_inv
2061; GFX1064-NEXT:  BB11_2:
2062; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2063; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2064; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2065; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2066; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2067; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2068; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2069; GFX1064-NEXT:    s_mov_b32 s2, -1
2070; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2071; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2072; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2074; GFX1064-NEXT:    s_endpgm
2075;
2076; GFX1032-LABEL: sub_i64_constant:
2077; GFX1032:       ; %bb.0: ; %entry
2078; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2079; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2080; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2081; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2082; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2083; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2084; GFX1032-NEXT:    s_cbranch_execz BB11_2
2085; GFX1032-NEXT:  ; %bb.1:
2086; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2087; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2088; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2089; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2090; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
2091; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2092; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2093; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2094; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2095; GFX1032-NEXT:    buffer_gl0_inv
2096; GFX1032-NEXT:  BB11_2:
2097; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2098; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2099; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2100; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2101; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2102; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2103; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2104; GFX1032-NEXT:    s_mov_b32 s2, -1
2105; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2106; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2107; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2108; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2109; GFX1032-NEXT:    s_endpgm
2110entry:
2111  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2112  store i64 %old, i64 addrspace(1)* %out
2113  ret void
2114}
2115
; sub_i64_uniform: atomicrmw sub on the addrspace(3) global @local_var64 where
; the operand is the kernel argument %subitive, i.e. the same value in every
; lane. In the expected output for all five check prefixes only the lane whose
; v_mbcnt count is 0 executes a single ds_sub_rtn_u64, and its operand is
; %subitive multiplied by the s_bcnt1 population count of the saved exec mask.
; After the branch rejoins, the value returned by the atomic is read with
; v_readfirstlane and each lane subtracts %subitive times its own v_mbcnt count
; from it before the 64-bit store to %out.
2116define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2117;
2118;
2119; GFX7LESS-LABEL: sub_i64_uniform:
2120; GFX7LESS:       ; %bb.0: ; %entry
2121; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2122; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2123; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2124; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2125; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2126; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2127; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2128; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2129; GFX7LESS-NEXT:  ; %bb.1:
2130; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2131; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2132; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2133; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2134; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2135; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2136; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2137; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2138; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2139; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2140; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2141; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2142; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2143; GFX7LESS-NEXT:  BB12_2:
2144; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2145; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2146; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2147; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2149; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2150; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2151; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2152; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2153; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2154; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2155; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2156; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2157; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2158; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2159; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2160; GFX7LESS-NEXT:    s_endpgm
2161;
2162; GFX8-LABEL: sub_i64_uniform:
2163; GFX8:       ; %bb.0: ; %entry
2164; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2165; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2166; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2167; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2168; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2169; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2170; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2171; GFX8-NEXT:    s_cbranch_execz BB12_2
2172; GFX8-NEXT:  ; %bb.1:
2173; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2174; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2175; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2176; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2177; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2178; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2179; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2180; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2181; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2182; GFX8-NEXT:    s_mov_b32 m0, -1
2183; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2184; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2185; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX8-NEXT:  BB12_2:
2187; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2188; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2189; GFX8-NEXT:    s_mov_b32 s4, s0
2190; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2191; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2192; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2193; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2194; GFX8-NEXT:    s_mov_b32 s5, s1
2195; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2196; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2197; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2198; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2199; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2200; GFX8-NEXT:    s_mov_b32 s6, -1
2201; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2202; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2203; GFX8-NEXT:    s_endpgm
2204;
2205; GFX9-LABEL: sub_i64_uniform:
2206; GFX9:       ; %bb.0: ; %entry
2207; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2208; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2209; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2210; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2211; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2212; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2213; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2214; GFX9-NEXT:    s_cbranch_execz BB12_2
2215; GFX9-NEXT:  ; %bb.1:
2216; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2218; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2219; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2220; GFX9-NEXT:    s_add_i32 s8, s8, s7
2221; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2222; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2223; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2224; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2225; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2226; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2228; GFX9-NEXT:  BB12_2:
2229; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2230; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2231; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2232; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2233; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2234; GFX9-NEXT:    s_mov_b32 s4, s0
2235; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2236; GFX9-NEXT:    s_mov_b32 s5, s1
2237; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2238; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2239; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2240; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2241; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2242; GFX9-NEXT:    s_mov_b32 s6, -1
2243; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2244; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2245; GFX9-NEXT:    s_endpgm
2246;
2247; GFX1064-LABEL: sub_i64_uniform:
2248; GFX1064:       ; %bb.0: ; %entry
2249; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2250; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2251; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2252; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2253; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2254; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2255; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2256; GFX1064-NEXT:    s_cbranch_execz BB12_2
2257; GFX1064-NEXT:  ; %bb.1:
2258; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2259; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2260; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2262; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2263; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2264; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2265; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2266; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2267; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2268; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2269; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2270; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2271; GFX1064-NEXT:    buffer_gl0_inv
2272; GFX1064-NEXT:  BB12_2:
2273; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2274; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2275; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2276; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2277; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2278; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2279; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2280; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2281; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2282; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2283; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v0
2284; GFX1064-NEXT:    s_mov_b32 s2, -1
2285; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2286; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2287; GFX1064-NEXT:    s_endpgm
2288;
2289; GFX1032-LABEL: sub_i64_uniform:
2290; GFX1032:       ; %bb.0: ; %entry
2291; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2292; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2293; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2294; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2295; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2296; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2297; GFX1032-NEXT:    s_cbranch_execz BB12_2
2298; GFX1032-NEXT:  ; %bb.1:
2299; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2300; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2301; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2302; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2303; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2304; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2305; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2306; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2307; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2308; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2309; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2310; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2311; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX1032-NEXT:    buffer_gl0_inv
2313; GFX1032-NEXT:  BB12_2:
2314; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2315; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2316; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2318; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2319; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2320; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2321; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2322; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2323; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2324; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v0
2325; GFX1032-NEXT:    s_mov_b32 s2, -1
2326; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2327; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2328; GFX1032-NEXT:    s_endpgm
2329entry:
  ; The subtracted value comes straight from the kernel argument %subitive.
2330  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  ; The value returned by the atomicrmw is what gets written to %out.
2331  store i64 %old, i64 addrspace(1)* %out
2332  ret void
2333}
2334
; sub_i64_varying: the subtracted value is the zero-extended result of
; llvm.amdgcn.workitem.id.x, so it is not a kernel-argument constant like in
; sub_i64_uniform. The expected output for every check prefix is a plain
; ds_sub_rtn_u64 followed by the 64-bit store; none of the v_mbcnt /
; s_and_saveexec / v_readfirstlane sequence checked for sub_i64_uniform appears.
; The "v_mov_b32 v1, 0" in each variant fills the upper half of the v[0:1]
; operand pair passed to the DS instruction.
; NOTE(review): presumably the atomic optimizer leaves 64-bit divergent
; operands alone at this revision; confirm against the pass before relying on it.
2335define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2336;
2337;
2338; GFX7LESS-LABEL: sub_i64_varying:
2339; GFX7LESS:       ; %bb.0: ; %entry
2340; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2341; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2342; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2343; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2344; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2345; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2346; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2347; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2348; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2349; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2350; GFX7LESS-NEXT:    s_endpgm
2351;
2352; GFX8-LABEL: sub_i64_varying:
2353; GFX8:       ; %bb.0: ; %entry
2354; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2355; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2356; GFX8-NEXT:    s_mov_b32 m0, -1
2357; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2358; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2360; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2362; GFX8-NEXT:    s_mov_b32 s2, -1
2363; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2364; GFX8-NEXT:    s_endpgm
2365;
2366; GFX9-LABEL: sub_i64_varying:
2367; GFX9:       ; %bb.0: ; %entry
2368; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2369; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2370; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2371; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2372; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2375; GFX9-NEXT:    s_mov_b32 s2, -1
2376; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2377; GFX9-NEXT:    s_endpgm
2378;
2379; GFX1064-LABEL: sub_i64_varying:
2380; GFX1064:       ; %bb.0: ; %entry
2381; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2382; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2383; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2384; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2385; GFX1064-NEXT:    s_mov_b32 s2, -1
2386; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2387; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2388; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2389; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX1064-NEXT:    buffer_gl0_inv
2391; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2392; GFX1064-NEXT:    s_endpgm
2393;
2394; GFX1032-LABEL: sub_i64_varying:
2395; GFX1032:       ; %bb.0: ; %entry
2396; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2397; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2398; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2399; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2400; GFX1032-NEXT:    s_mov_b32 s2, -1
2401; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2402; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2403; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2404; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2405; GFX1032-NEXT:    buffer_gl0_inv
2406; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2407; GFX1032-NEXT:    s_endpgm
2408entry:
  ; The operand is built from the workitem id, widened to i64 with zext.
2409  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2410  %zext = zext i32 %lane to i64
2411  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; The value returned by the atomicrmw is what gets written to %out.
2412  store i64 %old, i64 addrspace(1)* %out
2413  ret void
2414}
2415
; and_i32_varying: atomicrmw and on the addrspace(3) global @local_var32 with
; the result of llvm.amdgcn.workitem.id.x as the operand. Expected output,
; grouped by RUN line:
;  - first RUN line (no -mcpu): a plain ds_and_rtn_b32 and the store; no
;    wave-level reduction is checked for.
;  - tonga and gfx900: the "s_not_b64 exec / v_mov -1 / s_not_b64 exec" triple
;    writes -1 into the lanes outside the current exec mask, then v_and_b32_dpp
;    steps with row_shr 1, 2, 4, 8 and row_bcast 15, 31 combine the values.
;    Lane 63 is read with v_readlane as the operand for the single
;    ds_and_rtn_b32, which only the lane with a v_mbcnt count of 0 issues. A
;    wave_shr:1 DPP move into a register preset to -1 keeps the shifted
;    per-lane values.
;  - gfx1010 in wave64 and wave32 mode: the row_bcast / wave_shr steps are
;    replaced by v_permlanex16_b32 plus v_readlane / v_writelane pairs.
; In the reduced variants the value returned by the DS atomic is fetched with
; v_readfirstlane and ANDed with the shifted per-lane value before the store
; to %out.
2416define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2417;
2418;
2419; GFX7LESS-LABEL: and_i32_varying:
2420; GFX7LESS:       ; %bb.0: ; %entry
2421; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2422; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2423; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2424; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2425; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2426; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2427; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2428; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2429; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2430; GFX7LESS-NEXT:    s_endpgm
2431;
2432; GFX8-LABEL: and_i32_varying:
2433; GFX8:       ; %bb.0: ; %entry
2434; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2435; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2436; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2437; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2438; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2439; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2440; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2441; GFX8-NEXT:    s_not_b64 exec, exec
2442; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2443; GFX8-NEXT:    s_not_b64 exec, exec
2444; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2445; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2446; GFX8-NEXT:    s_nop 1
2447; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2448; GFX8-NEXT:    s_nop 1
2449; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2450; GFX8-NEXT:    s_nop 1
2451; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2452; GFX8-NEXT:    s_nop 1
2453; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2454; GFX8-NEXT:    s_nop 1
2455; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2456; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2457; GFX8-NEXT:    s_nop 0
2458; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2459; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2460; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2461; GFX8-NEXT:    ; implicit-def: $vgpr0
2462; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2463; GFX8-NEXT:    s_cbranch_execz BB14_2
2464; GFX8-NEXT:  ; %bb.1:
2465; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2466; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2467; GFX8-NEXT:    s_mov_b32 m0, -1
2468; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2469; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2470; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2471; GFX8-NEXT:  BB14_2:
2472; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2473; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2474; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2475; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2476; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2477; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2478; GFX8-NEXT:    s_mov_b32 s2, -1
2479; GFX8-NEXT:    s_nop 0
2480; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2481; GFX8-NEXT:    s_endpgm
2482;
2483; GFX9-LABEL: and_i32_varying:
2484; GFX9:       ; %bb.0: ; %entry
2485; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2486; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2487; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2488; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2489; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2490; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2491; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2492; GFX9-NEXT:    s_not_b64 exec, exec
2493; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2494; GFX9-NEXT:    s_not_b64 exec, exec
2495; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2496; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2497; GFX9-NEXT:    s_nop 1
2498; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2499; GFX9-NEXT:    s_nop 1
2500; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2501; GFX9-NEXT:    s_nop 1
2502; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2503; GFX9-NEXT:    s_nop 1
2504; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2505; GFX9-NEXT:    s_nop 1
2506; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2507; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2508; GFX9-NEXT:    s_nop 0
2509; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2510; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2511; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2512; GFX9-NEXT:    ; implicit-def: $vgpr0
2513; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2514; GFX9-NEXT:    s_cbranch_execz BB14_2
2515; GFX9-NEXT:  ; %bb.1:
2516; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2517; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2518; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2519; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2520; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2521; GFX9-NEXT:  BB14_2:
2522; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2523; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2525; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2526; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2527; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2528; GFX9-NEXT:    s_mov_b32 s2, -1
2529; GFX9-NEXT:    s_nop 0
2530; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2531; GFX9-NEXT:    s_endpgm
2532;
2533; GFX1064-LABEL: and_i32_varying:
2534; GFX1064:       ; %bb.0: ; %entry
2535; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2536; GFX1064-NEXT:    s_not_b64 exec, exec
2537; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2538; GFX1064-NEXT:    s_not_b64 exec, exec
2539; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2540; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2541; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2542; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2543; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2544; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2545; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2546; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2547; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2548; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2549; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2550; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2551; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2552; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2553; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2554; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2555; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2556; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2557; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2558; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2559; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2560; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2561; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2562; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2563; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2564; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2565; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2566; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2567; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2568; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2569; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2570; GFX1064-NEXT:    s_mov_b32 s2, -1
2571; GFX1064-NEXT:    ; implicit-def: $vgpr0
2572; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2573; GFX1064-NEXT:    s_cbranch_execz BB14_2
2574; GFX1064-NEXT:  ; %bb.1:
2575; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2576; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2577; GFX1064-NEXT:    s_mov_b32 s3, s7
2578; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2579; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2580; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2581; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX1064-NEXT:    buffer_gl0_inv
2583; GFX1064-NEXT:  BB14_2:
2584; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2585; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2586; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2587; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2588; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2589; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2590; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2591; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2592; GFX1064-NEXT:    s_endpgm
2593;
2594; GFX1032-LABEL: and_i32_varying:
2595; GFX1032:       ; %bb.0: ; %entry
2596; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2597; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2598; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2599; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2600; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2601; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2602; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2603; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2604; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2605; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2606; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2607; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2608; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2609; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2610; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2611; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2612; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2613; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2614; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2615; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2616; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2617; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2618; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2619; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2620; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2621; GFX1032-NEXT:    s_mov_b32 s2, -1
2622; GFX1032-NEXT:    ; implicit-def: $vgpr0
2623; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2624; GFX1032-NEXT:    s_cbranch_execz BB14_2
2625; GFX1032-NEXT:  ; %bb.1:
2626; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2627; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2628; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2629; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2630; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2631; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2632; GFX1032-NEXT:    buffer_gl0_inv
2633; GFX1032-NEXT:  BB14_2:
2634; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2635; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2636; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2637; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2638; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2639; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2640; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2641; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2642; GFX1032-NEXT:    s_endpgm
2643entry:
  ; The workitem id is used directly as the 32-bit AND operand.
2644  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2645  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; The value returned by the atomicrmw is what gets written to %out.
2646  store i32 %old, i32 addrspace(1)* %out
2647  ret void
2648}
2649
2650define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2651;
2652;
2653; GFX7LESS-LABEL: or_i32_varying:
2654; GFX7LESS:       ; %bb.0: ; %entry
2655; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2656; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2657; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2658; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2660; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2661; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2662; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2663; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2664; GFX7LESS-NEXT:    s_endpgm
2665;
2666; GFX8-LABEL: or_i32_varying:
2667; GFX8:       ; %bb.0: ; %entry
2668; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2669; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2670; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2671; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2672; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2673; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2674; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2675; GFX8-NEXT:    s_not_b64 exec, exec
2676; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2677; GFX8-NEXT:    s_not_b64 exec, exec
2678; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2679; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2680; GFX8-NEXT:    s_nop 1
2681; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2682; GFX8-NEXT:    s_nop 1
2683; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2684; GFX8-NEXT:    s_nop 1
2685; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2686; GFX8-NEXT:    s_nop 1
2687; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2688; GFX8-NEXT:    s_nop 1
2689; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2690; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2691; GFX8-NEXT:    s_nop 0
2692; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2693; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2694; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2695; GFX8-NEXT:    ; implicit-def: $vgpr0
2696; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2697; GFX8-NEXT:    s_cbranch_execz BB15_2
2698; GFX8-NEXT:  ; %bb.1:
2699; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2700; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2701; GFX8-NEXT:    s_mov_b32 m0, -1
2702; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2703; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2704; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2705; GFX8-NEXT:  BB15_2:
2706; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2707; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2708; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2709; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2710; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2711; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2712; GFX8-NEXT:    s_mov_b32 s2, -1
2713; GFX8-NEXT:    s_nop 0
2714; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2715; GFX8-NEXT:    s_endpgm
2716;
2717; GFX9-LABEL: or_i32_varying:
2718; GFX9:       ; %bb.0: ; %entry
2719; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2720; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2721; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2722; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2723; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2724; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2725; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2726; GFX9-NEXT:    s_not_b64 exec, exec
2727; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2728; GFX9-NEXT:    s_not_b64 exec, exec
2729; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2730; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2731; GFX9-NEXT:    s_nop 1
2732; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2733; GFX9-NEXT:    s_nop 1
2734; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2735; GFX9-NEXT:    s_nop 1
2736; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2737; GFX9-NEXT:    s_nop 1
2738; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2739; GFX9-NEXT:    s_nop 1
2740; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2741; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2742; GFX9-NEXT:    s_nop 0
2743; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2744; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2745; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2746; GFX9-NEXT:    ; implicit-def: $vgpr0
2747; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2748; GFX9-NEXT:    s_cbranch_execz BB15_2
2749; GFX9-NEXT:  ; %bb.1:
2750; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2751; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2752; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2753; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2755; GFX9-NEXT:  BB15_2:
2756; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2757; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2758; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2759; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2760; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2761; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2762; GFX9-NEXT:    s_mov_b32 s2, -1
2763; GFX9-NEXT:    s_nop 0
2764; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2765; GFX9-NEXT:    s_endpgm
2766;
2767; GFX1064-LABEL: or_i32_varying:
2768; GFX1064:       ; %bb.0: ; %entry
2769; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2770; GFX1064-NEXT:    s_not_b64 exec, exec
2771; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2772; GFX1064-NEXT:    s_not_b64 exec, exec
2773; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2774; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2775; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2776; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2777; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2778; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2779; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2780; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2781; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2782; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2783; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2784; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2785; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2786; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2787; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2788; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2789; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2790; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2791; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2792; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2793; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2794; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2795; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2796; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2797; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2798; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2799; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2800; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2801; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2802; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2803; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2804; GFX1064-NEXT:    s_mov_b32 s2, -1
2805; GFX1064-NEXT:    ; implicit-def: $vgpr0
2806; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2807; GFX1064-NEXT:    s_cbranch_execz BB15_2
2808; GFX1064-NEXT:  ; %bb.1:
2809; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2810; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2811; GFX1064-NEXT:    s_mov_b32 s3, s7
2812; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2813; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2814; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
2815; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2816; GFX1064-NEXT:    buffer_gl0_inv
2817; GFX1064-NEXT:  BB15_2:
2818; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2819; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2820; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2821; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2822; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2823; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2824; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2825; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2826; GFX1064-NEXT:    s_endpgm
2827;
2828; GFX1032-LABEL: or_i32_varying:
2829; GFX1032:       ; %bb.0: ; %entry
2830; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2831; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2832; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2833; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2834; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2835; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2836; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2837; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2838; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2839; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2840; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2841; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2842; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2843; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2844; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2845; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2846; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2847; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2848; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2849; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2850; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2851; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2852; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2853; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2854; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2855; GFX1032-NEXT:    s_mov_b32 s2, -1
2856; GFX1032-NEXT:    ; implicit-def: $vgpr0
2857; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2858; GFX1032-NEXT:    s_cbranch_execz BB15_2
2859; GFX1032-NEXT:  ; %bb.1:
2860; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2861; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2862; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2863; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2864; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
2865; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX1032-NEXT:    buffer_gl0_inv
2867; GFX1032-NEXT:  BB15_2:
2868; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2869; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2870; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2871; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2872; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
2873; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2874; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2875; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2876; GFX1032-NEXT:    s_endpgm
2877entry:
2878  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2879  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2880  store i32 %old, i32 addrspace(1)* %out
2881  ret void
2882}
2883
2884define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
2885; Every work-item XORs its own workitem.id.x into @local_var32 with an
; acq_rel atomicrmw and stores the value returned by the atomic to %out.
;
; What the autogenerated checks below pin down:
;  * GFX7LESS prefix - the atomic is emitted as-is: one ds_xor_rtn_b32 whose
;    data operand is v0, with no DPP instructions and no exec-guarded branch.
;  * all other prefixes - the per-lane values are first combined with a
;    v_xor_b32_dpp sequence, a single ds_xor_rtn_b32 is issued inside a branch
;    guarded by v_cmp_eq_u32 0 on the v_mbcnt result, and the final per-lane
;    value is formed with v_readfirstlane_b32 followed by v_xor_b32.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
2886;
2887; GFX7LESS-LABEL: xor_i32_varying:
2888; GFX7LESS:       ; %bb.0: ; %entry
2889; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2890; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2891; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2892; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2893; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
2894; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2896; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2897; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2898; GFX7LESS-NEXT:    s_endpgm
2899;
2900; GFX8-LABEL: xor_i32_varying:
2901; GFX8:       ; %bb.0: ; %entry
2902; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2903; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2904; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2905; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2906; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2907; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2908; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2909; GFX8-NEXT:    s_not_b64 exec, exec
2910; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2911; GFX8-NEXT:    s_not_b64 exec, exec
2912; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2913; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2914; GFX8-NEXT:    s_nop 1
2915; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2916; GFX8-NEXT:    s_nop 1
2917; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2918; GFX8-NEXT:    s_nop 1
2919; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2920; GFX8-NEXT:    s_nop 1
2921; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2922; GFX8-NEXT:    s_nop 1
2923; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2924; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2925; GFX8-NEXT:    s_nop 0
2926; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2927; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2928; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2929; GFX8-NEXT:    ; implicit-def: $vgpr0
2930; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2931; GFX8-NEXT:    s_cbranch_execz BB16_2
2932; GFX8-NEXT:  ; %bb.1:
2933; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2934; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2935; GFX8-NEXT:    s_mov_b32 m0, -1
2936; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2937; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2938; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2939; GFX8-NEXT:  BB16_2:
2940; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2941; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2943; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2944; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
2945; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2946; GFX8-NEXT:    s_mov_b32 s2, -1
2947; GFX8-NEXT:    s_nop 0
2948; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2949; GFX8-NEXT:    s_endpgm
2950;
2951; GFX9-LABEL: xor_i32_varying:
2952; GFX9:       ; %bb.0: ; %entry
2953; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2954; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2955; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2956; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2957; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2958; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2959; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2960; GFX9-NEXT:    s_not_b64 exec, exec
2961; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2962; GFX9-NEXT:    s_not_b64 exec, exec
2963; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2964; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2965; GFX9-NEXT:    s_nop 1
2966; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2967; GFX9-NEXT:    s_nop 1
2968; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2969; GFX9-NEXT:    s_nop 1
2970; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2971; GFX9-NEXT:    s_nop 1
2972; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2973; GFX9-NEXT:    s_nop 1
2974; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2975; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2976; GFX9-NEXT:    s_nop 0
2977; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2978; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2979; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2980; GFX9-NEXT:    ; implicit-def: $vgpr0
2981; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2982; GFX9-NEXT:    s_cbranch_execz BB16_2
2983; GFX9-NEXT:  ; %bb.1:
2984; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2985; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2986; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
2988; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2989; GFX9-NEXT:  BB16_2:
2990; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2991; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2992; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2993; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2994; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2995; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2996; GFX9-NEXT:    s_mov_b32 s2, -1
2997; GFX9-NEXT:    s_nop 0
2998; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2999; GFX9-NEXT:    s_endpgm
3000;
3001; GFX1064-LABEL: xor_i32_varying:
3002; GFX1064:       ; %bb.0: ; %entry
3003; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3004; GFX1064-NEXT:    s_not_b64 exec, exec
3005; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3006; GFX1064-NEXT:    s_not_b64 exec, exec
3007; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3008; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3009; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3010; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3011; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3012; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3013; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3014; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3015; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3016; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3017; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3018; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3019; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3020; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3021; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3022; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3023; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3024; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3025; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3026; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3027; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3028; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3029; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3030; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3031; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3032; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3033; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3034; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3035; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3036; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3037; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3038; GFX1064-NEXT:    s_mov_b32 s2, -1
3039; GFX1064-NEXT:    ; implicit-def: $vgpr0
3040; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3041; GFX1064-NEXT:    s_cbranch_execz BB16_2
3042; GFX1064-NEXT:  ; %bb.1:
3043; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3044; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3045; GFX1064-NEXT:    s_mov_b32 s3, s7
3046; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3047; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3048; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3049; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX1064-NEXT:    buffer_gl0_inv
3051; GFX1064-NEXT:  BB16_2:
3052; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3053; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3054; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3055; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3056; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3057; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3058; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3059; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3060; GFX1064-NEXT:    s_endpgm
3061;
3062; GFX1032-LABEL: xor_i32_varying:
3063; GFX1032:       ; %bb.0: ; %entry
3064; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3065; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3066; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3067; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3068; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3069; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3070; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3071; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3072; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3073; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3074; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3075; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3076; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3077; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3078; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3079; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3080; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3081; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3082; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3083; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3084; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3085; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3086; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3087; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3088; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3089; GFX1032-NEXT:    s_mov_b32 s2, -1
3090; GFX1032-NEXT:    ; implicit-def: $vgpr0
3091; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3092; GFX1032-NEXT:    s_cbranch_execz BB16_2
3093; GFX1032-NEXT:  ; %bb.1:
3094; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3095; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3096; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3097; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3098; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3099; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3100; GFX1032-NEXT:    buffer_gl0_inv
3101; GFX1032-NEXT:  BB16_2:
3102; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3103; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3104; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3105; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3106; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3107; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3108; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3109; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3110; GFX1032-NEXT:    s_endpgm
3111entry:
; The atomic operand is the work-item's x id, so it differs from lane to lane
; (the "varying" case in the function name).
3112  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; %old is the value the atomicrmw returns; it is what gets stored to %out.
3113  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3114  store i32 %old, i32 addrspace(1)* %out
3115  ret void
3116}
3117
3118define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3119; Every work-item applies a signed atomicrmw max (acq_rel) of its own
; workitem.id.x to @local_var32 and stores the value returned by the atomic
; to %out.
;
; What the autogenerated checks below pin down:
;  * GFX7LESS prefix - the atomic is emitted as-is: one ds_max_rtn_i32 whose
;    data operand is v0, with no DPP instructions and no exec-guarded branch.
;  * all other prefixes - the per-lane values are first combined with a
;    v_max_i32_dpp sequence, a single ds_max_rtn_i32 is issued inside a branch
;    guarded by v_cmp_eq_u32 0 on the v_mbcnt result, and the final per-lane
;    value is formed with v_readfirstlane_b32 followed by v_max_i32.
;    The value written under the inverted exec mask comes from
;    v_bfrev_b32 ..., 1 rather than the literal 0 used by the xor test.
;    NOTE(review): presumably this is 0x80000000, the signed-minimum
;    identity for max - confirm against the atomic optimizer pass.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
3120;
3121; GFX7LESS-LABEL: max_i32_varying:
3122; GFX7LESS:       ; %bb.0: ; %entry
3123; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3124; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3125; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3126; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3127; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3128; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3129; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3130; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3131; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3132; GFX7LESS-NEXT:    s_endpgm
3133;
3134; GFX8-LABEL: max_i32_varying:
3135; GFX8:       ; %bb.0: ; %entry
3136; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3137; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3138; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3139; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3140; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3141; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3142; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3143; GFX8-NEXT:    s_not_b64 exec, exec
3144; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3145; GFX8-NEXT:    s_not_b64 exec, exec
3146; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3147; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3148; GFX8-NEXT:    s_nop 1
3149; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3150; GFX8-NEXT:    s_nop 1
3151; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3152; GFX8-NEXT:    s_nop 1
3153; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3154; GFX8-NEXT:    s_nop 1
3155; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3156; GFX8-NEXT:    s_nop 1
3157; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3158; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3159; GFX8-NEXT:    s_nop 0
3160; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3161; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3162; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3163; GFX8-NEXT:    ; implicit-def: $vgpr0
3164; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3165; GFX8-NEXT:    s_cbranch_execz BB17_2
3166; GFX8-NEXT:  ; %bb.1:
3167; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3168; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3169; GFX8-NEXT:    s_mov_b32 m0, -1
3170; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3172; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3173; GFX8-NEXT:  BB17_2:
3174; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3175; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3176; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3177; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3178; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3179; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3180; GFX8-NEXT:    s_mov_b32 s2, -1
3181; GFX8-NEXT:    s_nop 0
3182; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3183; GFX8-NEXT:    s_endpgm
3184;
3185; GFX9-LABEL: max_i32_varying:
3186; GFX9:       ; %bb.0: ; %entry
3187; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3188; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3189; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3190; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3191; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3192; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3193; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3194; GFX9-NEXT:    s_not_b64 exec, exec
3195; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3196; GFX9-NEXT:    s_not_b64 exec, exec
3197; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3198; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3199; GFX9-NEXT:    s_nop 1
3200; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3201; GFX9-NEXT:    s_nop 1
3202; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3203; GFX9-NEXT:    s_nop 1
3204; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3205; GFX9-NEXT:    s_nop 1
3206; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3207; GFX9-NEXT:    s_nop 1
3208; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3209; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3210; GFX9-NEXT:    s_nop 0
3211; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3212; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3213; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3214; GFX9-NEXT:    ; implicit-def: $vgpr0
3215; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3216; GFX9-NEXT:    s_cbranch_execz BB17_2
3217; GFX9-NEXT:  ; %bb.1:
3218; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3219; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3220; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3221; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3222; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3223; GFX9-NEXT:  BB17_2:
3224; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3225; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3226; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3227; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3228; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3229; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3230; GFX9-NEXT:    s_mov_b32 s2, -1
3231; GFX9-NEXT:    s_nop 0
3232; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3233; GFX9-NEXT:    s_endpgm
3234;
3235; GFX1064-LABEL: max_i32_varying:
3236; GFX1064:       ; %bb.0: ; %entry
3237; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3238; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3239; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3240; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3241; GFX1064-NEXT:    s_not_b64 exec, exec
3242; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3243; GFX1064-NEXT:    s_not_b64 exec, exec
3244; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3245; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3246; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3247; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3248; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3249; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3250; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3251; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3252; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3253; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3254; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3255; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3256; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3257; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3258; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3259; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3260; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3261; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3262; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3263; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3264; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3265; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3266; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3267; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3268; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3269; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3270; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3271; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3272; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3273; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3274; GFX1064-NEXT:    s_mov_b32 s2, -1
3275; GFX1064-NEXT:    ; implicit-def: $vgpr0
3276; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3277; GFX1064-NEXT:    s_cbranch_execz BB17_2
3278; GFX1064-NEXT:  ; %bb.1:
3279; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3280; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3281; GFX1064-NEXT:    s_mov_b32 s3, s7
3282; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3283; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3284; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3285; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3286; GFX1064-NEXT:    buffer_gl0_inv
3287; GFX1064-NEXT:  BB17_2:
3288; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3289; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3290; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3291; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3292; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3293; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3294; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3295; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3296; GFX1064-NEXT:    s_endpgm
3297;
3298; GFX1032-LABEL: max_i32_varying:
3299; GFX1032:       ; %bb.0: ; %entry
3300; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3301; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3302; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3303; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3304; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3305; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3306; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3307; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3308; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3309; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3310; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3311; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3312; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3313; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3314; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3315; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3316; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3317; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3318; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3319; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3320; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3321; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3322; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3323; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3324; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3325; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3326; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3327; GFX1032-NEXT:    s_mov_b32 s2, -1
3328; GFX1032-NEXT:    ; implicit-def: $vgpr0
3329; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3330; GFX1032-NEXT:    s_cbranch_execz BB17_2
3331; GFX1032-NEXT:  ; %bb.1:
3332; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3333; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3334; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3335; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3336; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3337; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3338; GFX1032-NEXT:    buffer_gl0_inv
3339; GFX1032-NEXT:  BB17_2:
3340; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3341; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3342; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3343; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3344; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3345; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3346; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3347; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3348; GFX1032-NEXT:    s_endpgm
3349entry:
; The atomic operand is the work-item's x id, so it differs from lane to lane
; (the "varying" case in the function name).
3350  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; %old is the value the atomicrmw returns; it is what gets stored to %out.
3351  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3352  store i32 %old, i32 addrspace(1)* %out
3353  ret void
3354}
3355
3356define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3357;
3358;
3359; GFX7LESS-LABEL: max_i64_constant:
3360; GFX7LESS:       ; %bb.0: ; %entry
3361; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3362; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3363; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3364; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3365; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3366; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3367; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3368; GFX7LESS-NEXT:  ; %bb.1:
3369; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3370; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3371; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3372; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3373; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3374; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3375; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3376; GFX7LESS-NEXT:  BB18_2:
3377; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3378; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3379; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3380; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3381; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3382; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3383; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3384; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3385; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3386; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3387; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3388; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3389; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3390; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3391; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3392; GFX7LESS-NEXT:    s_endpgm
3393;
3394; GFX8-LABEL: max_i64_constant:
3395; GFX8:       ; %bb.0: ; %entry
3396; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3397; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3398; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3399; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3400; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3401; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3402; GFX8-NEXT:    s_cbranch_execz BB18_2
3403; GFX8-NEXT:  ; %bb.1:
3404; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3405; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3406; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3407; GFX8-NEXT:    s_mov_b32 m0, -1
3408; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3409; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3410; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3411; GFX8-NEXT:  BB18_2:
3412; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3413; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3414; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3415; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3416; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3417; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3418; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3419; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3420; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3421; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3422; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3423; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3424; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3425; GFX8-NEXT:    s_mov_b32 s2, -1
3426; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3427; GFX8-NEXT:    s_endpgm
3428;
3429; GFX9-LABEL: max_i64_constant:
3430; GFX9:       ; %bb.0: ; %entry
3431; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3432; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3433; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3434; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3435; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3436; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3437; GFX9-NEXT:    s_cbranch_execz BB18_2
3438; GFX9-NEXT:  ; %bb.1:
3439; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3440; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3441; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3442; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3443; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3445; GFX9-NEXT:  BB18_2:
3446; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3447; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3448; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3449; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3450; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3451; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3452; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3453; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3454; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3455; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3456; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3457; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3458; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3459; GFX9-NEXT:    s_mov_b32 s2, -1
3460; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3461; GFX9-NEXT:    s_endpgm
3462;
3463; GFX1064-LABEL: max_i64_constant:
3464; GFX1064:       ; %bb.0: ; %entry
3465; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3466; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3467; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3468; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3469; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3470; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3471; GFX1064-NEXT:    s_cbranch_execz BB18_2
3472; GFX1064-NEXT:  ; %bb.1:
3473; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3474; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3475; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3476; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3477; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3478; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3479; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3480; GFX1064-NEXT:    buffer_gl0_inv
3481; GFX1064-NEXT:  BB18_2:
3482; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3483; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3484; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3485; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3486; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3487; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3488; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3489; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3490; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3491; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3492; GFX1064-NEXT:    s_mov_b32 s2, -1
3493; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3494; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3495; GFX1064-NEXT:    s_endpgm
3496;
3497; GFX1032-LABEL: max_i64_constant:
3498; GFX1032:       ; %bb.0: ; %entry
3499; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3500; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3501; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3502; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3503; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3504; GFX1032-NEXT:    s_cbranch_execz BB18_2
3505; GFX1032-NEXT:  ; %bb.1:
3506; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3507; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3508; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3509; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3510; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3511; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3512; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3513; GFX1032-NEXT:    buffer_gl0_inv
3514; GFX1032-NEXT:  BB18_2:
3515; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3516; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3517; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3518; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3519; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3520; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3521; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3522; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3523; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3524; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3525; GFX1032-NEXT:    s_mov_b32 s2, -1
3526; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3527; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3528; GFX1032-NEXT:    s_endpgm
3529entry:
3530  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3531  store i64 %old, i64 addrspace(1)* %out
3532  ret void
3533}
3534
3535define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; Test: signed 32-bit `atomicrmw min` on the local (addrspace(3)) global
; @local_var32, where every work-item passes its own workitem.id.x as the
; operand (a per-lane, "varying" value). The value returned by the atomic is
; stored to %out.
;
; What the autogenerated checks below show:
;  * For the GFX7LESS prefix, a single ds_min_rtn_i32 is fed directly from v0
;    and no cross-lane reduction code is expected.
;  * For the GFX8, GFX9, GFX1064 and GFX1032 prefixes, the operand is first
;    combined across lanes with v_min_i32_dpp (lanes outside the original exec
;    mask are seeded from `v_bfrev_b32 -2`, i.e. 0x7fffffff), one
;    ds_min_rtn_i32 is issued inside the block guarded by the mbcnt == 0
;    compare, and the per-lane result is rebuilt afterwards with
;    v_readfirstlane_b32 followed by v_min_i32.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
3536;
3537;
3538; GFX7LESS-LABEL: min_i32_varying:
3539; GFX7LESS:       ; %bb.0: ; %entry
3540; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3541; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3542; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3543; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3544; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3545; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3546; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3547; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3548; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3549; GFX7LESS-NEXT:    s_endpgm
3550;
3551; GFX8-LABEL: min_i32_varying:
3552; GFX8:       ; %bb.0: ; %entry
3553; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3554; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3555; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3556; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3557; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3558; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3559; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3560; GFX8-NEXT:    s_not_b64 exec, exec
3561; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3562; GFX8-NEXT:    s_not_b64 exec, exec
3563; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3564; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3565; GFX8-NEXT:    s_nop 1
3566; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3567; GFX8-NEXT:    s_nop 1
3568; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3569; GFX8-NEXT:    s_nop 1
3570; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3571; GFX8-NEXT:    s_nop 1
3572; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3573; GFX8-NEXT:    s_nop 1
3574; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3575; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3576; GFX8-NEXT:    s_nop 0
3577; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3578; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3579; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3580; GFX8-NEXT:    ; implicit-def: $vgpr0
3581; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3582; GFX8-NEXT:    s_cbranch_execz BB19_2
3583; GFX8-NEXT:  ; %bb.1:
3584; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3585; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3586; GFX8-NEXT:    s_mov_b32 m0, -1
3587; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3588; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3589; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3590; GFX8-NEXT:  BB19_2:
3591; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3592; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3593; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3594; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3595; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3596; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3597; GFX8-NEXT:    s_mov_b32 s2, -1
3598; GFX8-NEXT:    s_nop 0
3599; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3600; GFX8-NEXT:    s_endpgm
3601;
3602; GFX9-LABEL: min_i32_varying:
3603; GFX9:       ; %bb.0: ; %entry
3604; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3605; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3606; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3607; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3608; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3609; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3610; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3611; GFX9-NEXT:    s_not_b64 exec, exec
3612; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3613; GFX9-NEXT:    s_not_b64 exec, exec
3614; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3615; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3616; GFX9-NEXT:    s_nop 1
3617; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3618; GFX9-NEXT:    s_nop 1
3619; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3620; GFX9-NEXT:    s_nop 1
3621; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3622; GFX9-NEXT:    s_nop 1
3623; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3624; GFX9-NEXT:    s_nop 1
3625; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3626; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3627; GFX9-NEXT:    s_nop 0
3628; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3629; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3630; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3631; GFX9-NEXT:    ; implicit-def: $vgpr0
3632; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3633; GFX9-NEXT:    s_cbranch_execz BB19_2
3634; GFX9-NEXT:  ; %bb.1:
3635; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3636; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3637; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3638; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3639; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3640; GFX9-NEXT:  BB19_2:
3641; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3642; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3643; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3644; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3645; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3646; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3647; GFX9-NEXT:    s_mov_b32 s2, -1
3648; GFX9-NEXT:    s_nop 0
3649; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3650; GFX9-NEXT:    s_endpgm
3651;
3652; GFX1064-LABEL: min_i32_varying:
3653; GFX1064:       ; %bb.0: ; %entry
3654; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3655; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3656; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3657; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3658; GFX1064-NEXT:    s_not_b64 exec, exec
3659; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3660; GFX1064-NEXT:    s_not_b64 exec, exec
3661; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3662; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3663; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3664; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3665; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3666; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3667; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3668; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3669; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3670; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3671; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3672; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3673; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3674; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3675; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3676; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3677; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3678; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3679; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3680; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3681; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3682; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3683; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3684; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3685; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3686; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3687; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3688; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3689; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3690; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3691; GFX1064-NEXT:    s_mov_b32 s2, -1
3692; GFX1064-NEXT:    ; implicit-def: $vgpr0
3693; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3694; GFX1064-NEXT:    s_cbranch_execz BB19_2
3695; GFX1064-NEXT:  ; %bb.1:
3696; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3697; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3698; GFX1064-NEXT:    s_mov_b32 s3, s7
3699; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3700; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3701; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3702; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3703; GFX1064-NEXT:    buffer_gl0_inv
3704; GFX1064-NEXT:  BB19_2:
3705; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3706; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3707; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3708; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3709; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3710; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3711; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3712; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3713; GFX1064-NEXT:    s_endpgm
3714;
3715; GFX1032-LABEL: min_i32_varying:
3716; GFX1032:       ; %bb.0: ; %entry
3717; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3718; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3719; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3720; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3721; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3722; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3723; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3724; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3725; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3726; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3727; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3728; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3729; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3730; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3731; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3732; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3733; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3734; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3735; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3736; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3737; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3738; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3739; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3740; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3741; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3742; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3743; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3744; GFX1032-NEXT:    s_mov_b32 s2, -1
3745; GFX1032-NEXT:    ; implicit-def: $vgpr0
3746; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3747; GFX1032-NEXT:    s_cbranch_execz BB19_2
3748; GFX1032-NEXT:  ; %bb.1:
3749; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3750; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3751; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3752; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3753; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
3754; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3755; GFX1032-NEXT:    buffer_gl0_inv
3756; GFX1032-NEXT:  BB19_2:
3757; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3758; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3759; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3760; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3761; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3762; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3763; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3764; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3765; GFX1032-NEXT:    s_endpgm
3766entry:
  ; Per-lane operand: this work-item's x id, so the value differs between lanes.
3767  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Signed min with acq_rel ordering on the addrspace(3) global; %old is the
  ; value the atomicrmw returns.
3768  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; The returned value is written out so the result of the atomic is used.
3769  store i32 %old, i32 addrspace(1)* %out
3770  ret void
3771}
3772
3773define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; Test: signed 64-bit `atomicrmw min` on the local (addrspace(3)) global
; @local_var64 with the constant operand 5. The value returned by the atomic
; is stored to %out.
;
; What the autogenerated checks below show, for every check prefix:
;  * one ds_min_rtn_i64 with the value 5 is issued inside the block guarded by
;    the mbcnt == 0 compare;
;  * afterwards the returned value is read with v_readfirstlane_b32, each lane
;    picks (v_cndmask on the compare result still held in vcc) either
;    0x7fffffff:0xffffffff or 5, and the stored result is the signed minimum
;    of that pick and the read-back value (v_cmp_lt_i64 + v_cndmask). The
;    0x7fffffff half is materialized with `v_bfrev_b32 -2` on the targets
;    that do not use the literal directly.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
3774;
3775;
3776; GFX7LESS-LABEL: min_i64_constant:
3777; GFX7LESS:       ; %bb.0: ; %entry
3778; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3779; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3780; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3781; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3782; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3783; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3784; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
3785; GFX7LESS-NEXT:  ; %bb.1:
3786; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3787; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3788; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3789; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3790; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3791; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3792; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3793; GFX7LESS-NEXT:  BB20_2:
3794; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3795; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3796; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3797; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3798; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3799; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3800; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3801; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3802; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3803; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3804; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3805; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3806; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3807; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3808; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3809; GFX7LESS-NEXT:    s_endpgm
3810;
3811; GFX8-LABEL: min_i64_constant:
3812; GFX8:       ; %bb.0: ; %entry
3813; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3814; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3815; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3816; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3817; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3818; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3819; GFX8-NEXT:    s_cbranch_execz BB20_2
3820; GFX8-NEXT:  ; %bb.1:
3821; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3822; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3823; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3824; GFX8-NEXT:    s_mov_b32 m0, -1
3825; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3826; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3827; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3828; GFX8-NEXT:  BB20_2:
3829; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3830; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3831; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3832; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
3833; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3834; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3835; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3836; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3837; GFX8-NEXT:    v_mov_b32_e32 v2, s5
3838; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3839; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3840; GFX8-NEXT:    s_mov_b32 s2, -1
3841; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3842; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3843; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3844; GFX8-NEXT:    s_endpgm
3845;
3846; GFX9-LABEL: min_i64_constant:
3847; GFX9:       ; %bb.0: ; %entry
3848; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3849; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3850; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3851; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3852; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3853; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3854; GFX9-NEXT:    s_cbranch_execz BB20_2
3855; GFX9-NEXT:  ; %bb.1:
3856; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3857; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3858; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3859; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3860; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3861; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3862; GFX9-NEXT:  BB20_2:
3863; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3865; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
3866; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
3867; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
3868; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3869; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3870; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3871; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3872; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3873; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3874; GFX9-NEXT:    s_mov_b32 s2, -1
3875; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3876; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3877; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3878; GFX9-NEXT:    s_endpgm
3879;
3880; GFX1064-LABEL: min_i64_constant:
3881; GFX1064:       ; %bb.0: ; %entry
3882; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3883; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3884; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3885; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3886; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3887; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3888; GFX1064-NEXT:    s_cbranch_execz BB20_2
3889; GFX1064-NEXT:  ; %bb.1:
3890; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3891; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3892; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3893; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3894; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3895; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3896; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3897; GFX1064-NEXT:    buffer_gl0_inv
3898; GFX1064-NEXT:  BB20_2:
3899; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3900; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3901; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3902; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3903; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
3904; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3905; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3906; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3907; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3908; GFX1064-NEXT:    s_mov_b32 s2, -1
3909; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3910; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3911; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3912; GFX1064-NEXT:    s_endpgm
3913;
3914; GFX1032-LABEL: min_i64_constant:
3915; GFX1032:       ; %bb.0: ; %entry
3916; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3917; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3918; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3919; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3920; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3921; GFX1032-NEXT:    s_cbranch_execz BB20_2
3922; GFX1032-NEXT:  ; %bb.1:
3923; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3924; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3925; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3926; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3927; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3928; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3929; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3930; GFX1032-NEXT:    buffer_gl0_inv
3931; GFX1032-NEXT:  BB20_2:
3932; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3933; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3934; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3935; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3936; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
3937; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
3938; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
3939; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3940; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3941; GFX1032-NEXT:    s_mov_b32 s2, -1
3942; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3943; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3944; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3945; GFX1032-NEXT:    s_endpgm
3946entry:
  ; Signed 64-bit min with acq_rel ordering; the operand is the constant 5, so
  ; it is the same for every lane. %old is the value the atomicrmw returns.
3947  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; The returned value is written out so the result of the atomic is used.
3948  store i64 %old, i64 addrspace(1)* %out
3949  ret void
3950}
3951
3952define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
3953;
3954;
3955; GFX7LESS-LABEL: umax_i32_varying:
3956; GFX7LESS:       ; %bb.0: ; %entry
3957; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3958; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3959; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3960; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3961; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
3962; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3963; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3964; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3965; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3966; GFX7LESS-NEXT:    s_endpgm
3967;
3968; GFX8-LABEL: umax_i32_varying:
3969; GFX8:       ; %bb.0: ; %entry
3970; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3971; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3972; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3973; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3974; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3975; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3976; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3977; GFX8-NEXT:    s_not_b64 exec, exec
3978; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3979; GFX8-NEXT:    s_not_b64 exec, exec
3980; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3981; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3982; GFX8-NEXT:    s_nop 1
3983; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3984; GFX8-NEXT:    s_nop 1
3985; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3986; GFX8-NEXT:    s_nop 1
3987; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3988; GFX8-NEXT:    s_nop 1
3989; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3990; GFX8-NEXT:    s_nop 1
3991; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3992; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3993; GFX8-NEXT:    s_nop 0
3994; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3995; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3996; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3997; GFX8-NEXT:    ; implicit-def: $vgpr0
3998; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3999; GFX8-NEXT:    s_cbranch_execz BB21_2
4000; GFX8-NEXT:  ; %bb.1:
4001; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4002; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4003; GFX8-NEXT:    s_mov_b32 m0, -1
4004; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4005; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4006; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4007; GFX8-NEXT:  BB21_2:
4008; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4009; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4010; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4011; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4012; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4013; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4014; GFX8-NEXT:    s_mov_b32 s2, -1
4015; GFX8-NEXT:    s_nop 0
4016; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4017; GFX8-NEXT:    s_endpgm
4018;
4019; GFX9-LABEL: umax_i32_varying:
4020; GFX9:       ; %bb.0: ; %entry
4021; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4022; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4023; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4024; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4025; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4026; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4027; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4028; GFX9-NEXT:    s_not_b64 exec, exec
4029; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4030; GFX9-NEXT:    s_not_b64 exec, exec
4031; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4032; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4033; GFX9-NEXT:    s_nop 1
4034; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4035; GFX9-NEXT:    s_nop 1
4036; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4037; GFX9-NEXT:    s_nop 1
4038; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4039; GFX9-NEXT:    s_nop 1
4040; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4041; GFX9-NEXT:    s_nop 1
4042; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4043; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4044; GFX9-NEXT:    s_nop 0
4045; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4046; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4047; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4048; GFX9-NEXT:    ; implicit-def: $vgpr0
4049; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4050; GFX9-NEXT:    s_cbranch_execz BB21_2
4051; GFX9-NEXT:  ; %bb.1:
4052; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4053; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4054; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4055; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4057; GFX9-NEXT:  BB21_2:
4058; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4059; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4060; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4061; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4062; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4063; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4064; GFX9-NEXT:    s_mov_b32 s2, -1
4065; GFX9-NEXT:    s_nop 0
4066; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4067; GFX9-NEXT:    s_endpgm
4068;
4069; GFX1064-LABEL: umax_i32_varying:
4070; GFX1064:       ; %bb.0: ; %entry
4071; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4072; GFX1064-NEXT:    s_not_b64 exec, exec
4073; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4074; GFX1064-NEXT:    s_not_b64 exec, exec
4075; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4076; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4077; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4078; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4079; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4080; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4081; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4082; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4083; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4084; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4085; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4086; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4087; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4088; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4089; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4090; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4091; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4092; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4093; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4094; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4095; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4096; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4097; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4098; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4099; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4100; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4101; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4102; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4103; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4104; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4105; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4106; GFX1064-NEXT:    s_mov_b32 s2, -1
4107; GFX1064-NEXT:    ; implicit-def: $vgpr0
4108; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4109; GFX1064-NEXT:    s_cbranch_execz BB21_2
4110; GFX1064-NEXT:  ; %bb.1:
4111; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4112; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4113; GFX1064-NEXT:    s_mov_b32 s3, s7
4114; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4115; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4116; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4117; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4118; GFX1064-NEXT:    buffer_gl0_inv
4119; GFX1064-NEXT:  BB21_2:
4120; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4121; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4122; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4123; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4124; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4125; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4127; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4128; GFX1064-NEXT:    s_endpgm
4129;
4130; GFX1032-LABEL: umax_i32_varying:
4131; GFX1032:       ; %bb.0: ; %entry
4132; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4133; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4134; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4135; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4136; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4137; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4138; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4139; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4140; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4141; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4142; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4143; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4144; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4145; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4146; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4147; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4148; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4149; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4150; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4151; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4152; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4153; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4154; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4155; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4156; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4157; GFX1032-NEXT:    s_mov_b32 s2, -1
4158; GFX1032-NEXT:    ; implicit-def: $vgpr0
4159; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4160; GFX1032-NEXT:    s_cbranch_execz BB21_2
4161; GFX1032-NEXT:  ; %bb.1:
4162; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4163; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4164; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4165; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4166; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4167; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4168; GFX1032-NEXT:    buffer_gl0_inv
4169; GFX1032-NEXT:  BB21_2:
4170; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4171; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4172; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4173; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4174; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4175; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4176; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4177; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4178; GFX1032-NEXT:    s_endpgm
4179entry:
4180  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4181  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4182  store i32 %old, i32 addrspace(1)* %out
4183  ret void
4184}
4185
; umax_i64_constant: 64-bit unsigned-max atomicrmw on the LDS global
; @local_var64 with the uniform constant operand 5; the value returned by the
; atomic is stored to %out.
;
; For every RUN configuration the assertions below expect the same shape:
;   * v_mbcnt_* plus v_cmp_eq_u32 against 0 feed an s_and_saveexec, so the
;     single ds_max_rtn_u64 (operand v[0:1] = 5, 0) sits in a guarded block;
;   * after exec is restored, both halves of the atomic result are read with
;     v_readfirstlane and merged per lane through v_cndmask / v_cmp_gt_u64.
; NOTE(review): the "5, 0" select on the saved compare result looks like it
; gives the lane that issued the atomic the umax identity (0) and every other
; lane the constant 5 - confirm against the atomic optimizer pass.
; The checks are autogenerated (see the NOTE on the first line of this file);
; regenerate them with the script instead of editing them by hand.
4186define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4187;
4188;
4189; GFX7LESS-LABEL: umax_i64_constant:
4190; GFX7LESS:       ; %bb.0: ; %entry
4191; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4192; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4193; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4194; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4195; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4196; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4197; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4198; GFX7LESS-NEXT:  ; %bb.1:
4199; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4200; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4201; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4202; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4203; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4204; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4205; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4206; GFX7LESS-NEXT:  BB22_2:
4207; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4208; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4209; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4210; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4211; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4212; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4213; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4214; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4215; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4216; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4217; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4218; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4219; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4220; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4221; GFX7LESS-NEXT:    s_endpgm
4222;
4223; GFX8-LABEL: umax_i64_constant:
4224; GFX8:       ; %bb.0: ; %entry
4225; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4226; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4227; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4228; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4229; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4230; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4231; GFX8-NEXT:    s_cbranch_execz BB22_2
4232; GFX8-NEXT:  ; %bb.1:
4233; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4234; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4235; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4236; GFX8-NEXT:    s_mov_b32 m0, -1
4237; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4238; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4239; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4240; GFX8-NEXT:  BB22_2:
4241; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4242; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4243; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4244; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4245; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4246; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4247; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4248; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4249; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4250; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4251; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4252; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4253; GFX8-NEXT:    s_mov_b32 s2, -1
4254; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4255; GFX8-NEXT:    s_endpgm
4256;
4257; GFX9-LABEL: umax_i64_constant:
4258; GFX9:       ; %bb.0: ; %entry
4259; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4260; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4261; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4262; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4263; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4264; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4265; GFX9-NEXT:    s_cbranch_execz BB22_2
4266; GFX9-NEXT:  ; %bb.1:
4267; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4268; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4269; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4271; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4273; GFX9-NEXT:  BB22_2:
4274; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4276; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4277; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4278; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4279; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4280; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4281; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4282; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4283; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4284; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4285; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4286; GFX9-NEXT:    s_mov_b32 s2, -1
4287; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4288; GFX9-NEXT:    s_endpgm
4289;
4290; GFX1064-LABEL: umax_i64_constant:
4291; GFX1064:       ; %bb.0: ; %entry
4292; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4293; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4294; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4295; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4296; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4297; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4298; GFX1064-NEXT:    s_cbranch_execz BB22_2
4299; GFX1064-NEXT:  ; %bb.1:
4300; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4301; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4302; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4303; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4304; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4305; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4306; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4307; GFX1064-NEXT:    buffer_gl0_inv
4308; GFX1064-NEXT:  BB22_2:
4309; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4310; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4311; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4312; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4313; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4314; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4315; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4316; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4317; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4318; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4319; GFX1064-NEXT:    s_mov_b32 s2, -1
4320; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4321; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4322; GFX1064-NEXT:    s_endpgm
4323;
4324; GFX1032-LABEL: umax_i64_constant:
4325; GFX1032:       ; %bb.0: ; %entry
4326; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4327; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4328; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4329; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4330; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4331; GFX1032-NEXT:    s_cbranch_execz BB22_2
4332; GFX1032-NEXT:  ; %bb.1:
4333; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4334; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4335; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4336; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4337; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4338; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4339; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4340; GFX1032-NEXT:    buffer_gl0_inv
4341; GFX1032-NEXT:  BB22_2:
4342; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4343; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4344; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4345; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4346; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4347; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4348; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4349; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4350; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4351; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4352; GFX1032-NEXT:    s_mov_b32 s2, -1
4353; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4354; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4355; GFX1032-NEXT:    s_endpgm
4356entry:
  ; Constant (uniform) operand; %old is the value the atomic returns.
4357  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4358  store i64 %old, i64 addrspace(1)* %out
4359  ret void
4360}
4361
; umin_i32_varying: 32-bit unsigned-min atomicrmw on the LDS global
; @local_var32 whose operand is the work-item id (a per-lane value); the value
; returned by the atomic is stored to %out.
;
; What the assertions below pin down:
;   * the run without -mcpu expects the atomic untouched: one ds_min_rtn_u32
;     taking v0 directly, with no exec masking around it;
;   * the tonga and gfx900 runs expect a v_min_u32_dpp chain (row_shr 1/2/4/8,
;     then row_bcast 15 and 31) executed with exec forced to -1 and with the
;     originally inactive lanes preloaded with -1, a v_readlane of lane 63,
;     and a single ds_min_rtn_u32 guarded by s_and_saveexec;
;   * the gfx1010 runs expect row_shr steps combined with v_permlanex16 and
;     v_readlane / v_writelane fix-ups; the guarded atomic's operand comes
;     from lane 63 in wave64 mode and lane 31 in wave32 mode.
; In the optimized variants the atomic result is read with v_readfirstlane and
; folded into a per-lane VGPR with v_min_u32 before the store.
; NOTE(review): that per-lane VGPR appears to hold an exclusive prefix-min of
; the lane operands (it is produced by a one-lane DPP shift) - confirm against
; the atomic optimizer pass.
; The checks are autogenerated (see the NOTE on the first line of this file);
; regenerate them with the script instead of editing them by hand.
4362define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4363;
4364;
4365; GFX7LESS-LABEL: umin_i32_varying:
4366; GFX7LESS:       ; %bb.0: ; %entry
4367; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4368; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4369; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4370; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4371; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4372; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4373; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4374; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4375; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4376; GFX7LESS-NEXT:    s_endpgm
4377;
4378; GFX8-LABEL: umin_i32_varying:
4379; GFX8:       ; %bb.0: ; %entry
4380; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4381; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4382; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4383; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4384; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4385; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4386; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4387; GFX8-NEXT:    s_not_b64 exec, exec
4388; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4389; GFX8-NEXT:    s_not_b64 exec, exec
4390; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4391; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4392; GFX8-NEXT:    s_nop 1
4393; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4394; GFX8-NEXT:    s_nop 1
4395; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4396; GFX8-NEXT:    s_nop 1
4397; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4398; GFX8-NEXT:    s_nop 1
4399; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4400; GFX8-NEXT:    s_nop 1
4401; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4402; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4403; GFX8-NEXT:    s_nop 0
4404; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4405; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4406; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4407; GFX8-NEXT:    ; implicit-def: $vgpr0
4408; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4409; GFX8-NEXT:    s_cbranch_execz BB23_2
4410; GFX8-NEXT:  ; %bb.1:
4411; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4412; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4413; GFX8-NEXT:    s_mov_b32 m0, -1
4414; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4415; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4416; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4417; GFX8-NEXT:  BB23_2:
4418; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4419; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4420; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4421; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4422; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4423; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4424; GFX8-NEXT:    s_mov_b32 s2, -1
4425; GFX8-NEXT:    s_nop 0
4426; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4427; GFX8-NEXT:    s_endpgm
4428;
4429; GFX9-LABEL: umin_i32_varying:
4430; GFX9:       ; %bb.0: ; %entry
4431; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4432; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4433; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4434; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4435; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4436; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4437; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4438; GFX9-NEXT:    s_not_b64 exec, exec
4439; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4440; GFX9-NEXT:    s_not_b64 exec, exec
4441; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4442; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4443; GFX9-NEXT:    s_nop 1
4444; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4445; GFX9-NEXT:    s_nop 1
4446; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4447; GFX9-NEXT:    s_nop 1
4448; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4449; GFX9-NEXT:    s_nop 1
4450; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4451; GFX9-NEXT:    s_nop 1
4452; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4453; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4454; GFX9-NEXT:    s_nop 0
4455; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4456; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4457; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4458; GFX9-NEXT:    ; implicit-def: $vgpr0
4459; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4460; GFX9-NEXT:    s_cbranch_execz BB23_2
4461; GFX9-NEXT:  ; %bb.1:
4462; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4463; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4465; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4466; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX9-NEXT:  BB23_2:
4468; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4469; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4470; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4471; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4472; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4473; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4474; GFX9-NEXT:    s_mov_b32 s2, -1
4475; GFX9-NEXT:    s_nop 0
4476; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4477; GFX9-NEXT:    s_endpgm
4478;
4479; GFX1064-LABEL: umin_i32_varying:
4480; GFX1064:       ; %bb.0: ; %entry
4481; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4482; GFX1064-NEXT:    s_not_b64 exec, exec
4483; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4484; GFX1064-NEXT:    s_not_b64 exec, exec
4485; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4486; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4487; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4488; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4489; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4490; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4491; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4492; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4493; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4494; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4495; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4496; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4497; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4498; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4499; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4500; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4501; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4502; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4503; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4504; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4505; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4506; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4507; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4508; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4509; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4510; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4511; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4512; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4513; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4514; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4515; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4516; GFX1064-NEXT:    s_mov_b32 s2, -1
4517; GFX1064-NEXT:    ; implicit-def: $vgpr0
4518; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4519; GFX1064-NEXT:    s_cbranch_execz BB23_2
4520; GFX1064-NEXT:  ; %bb.1:
4521; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4522; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4523; GFX1064-NEXT:    s_mov_b32 s3, s7
4524; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4525; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4526; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4527; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4528; GFX1064-NEXT:    buffer_gl0_inv
4529; GFX1064-NEXT:  BB23_2:
4530; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4531; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4532; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4533; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4534; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4535; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4536; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4537; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4538; GFX1064-NEXT:    s_endpgm
4539;
4540; GFX1032-LABEL: umin_i32_varying:
4541; GFX1032:       ; %bb.0: ; %entry
4542; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4543; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4544; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4545; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4546; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4547; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4548; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4549; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4550; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4551; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4552; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4553; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4554; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4555; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4556; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4557; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4558; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4559; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4560; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4561; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4562; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4563; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4564; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4565; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4566; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4567; GFX1032-NEXT:    s_mov_b32 s2, -1
4568; GFX1032-NEXT:    ; implicit-def: $vgpr0
4569; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4570; GFX1032-NEXT:    s_cbranch_execz BB23_2
4571; GFX1032-NEXT:  ; %bb.1:
4572; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4573; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4574; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4575; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4576; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4577; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4578; GFX1032-NEXT:    buffer_gl0_inv
4579; GFX1032-NEXT:  BB23_2:
4580; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4581; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4582; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4583; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4584; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4585; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4586; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4587; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4588; GFX1032-NEXT:    s_endpgm
4589entry:
  ; The work-item id makes the atomic operand differ from lane to lane.
4590  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4591  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4592  store i32 %old, i32 addrspace(1)* %out
4593  ret void
4594}
4595
; umin_i64_constant: 64-bit unsigned-min atomicrmw on the LDS global
; @local_var64 with the uniform constant operand 5; the value returned by the
; atomic is stored to %out.
;
; For every RUN configuration the assertions below expect the same shape:
;   * v_mbcnt_* plus v_cmp_eq_u32 against 0 feed an s_and_saveexec, so the
;     single ds_min_rtn_u64 (operand v[0:1] = 5, 0) sits in a guarded block;
;   * after exec is restored, both halves of the atomic result are read with
;     v_readfirstlane and merged per lane through v_cndmask / v_cmp_lt_u64.
; NOTE(review): the "0, -1" and "5, -1" selects on the saved compare result
; look like they give the lane that issued the atomic the umin identity
; (all ones) and every other lane the constant 5 - confirm against the atomic
; optimizer pass.
; The checks are autogenerated (see the NOTE on the first line of this file);
; regenerate them with the script instead of editing them by hand.
4596define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4597;
4598;
4599; GFX7LESS-LABEL: umin_i64_constant:
4600; GFX7LESS:       ; %bb.0: ; %entry
4601; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4602; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4603; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4604; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4605; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4606; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4607; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4608; GFX7LESS-NEXT:  ; %bb.1:
4609; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4610; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4611; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4612; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4613; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4614; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4615; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4616; GFX7LESS-NEXT:  BB24_2:
4617; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4618; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4619; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4620; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4621; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4622; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4623; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4624; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4625; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4626; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4627; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4628; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4629; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4630; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4631; GFX7LESS-NEXT:    s_endpgm
4632;
4633; GFX8-LABEL: umin_i64_constant:
4634; GFX8:       ; %bb.0: ; %entry
4635; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4636; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4637; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4638; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4639; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4640; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4641; GFX8-NEXT:    s_cbranch_execz BB24_2
4642; GFX8-NEXT:  ; %bb.1:
4643; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4644; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4645; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4646; GFX8-NEXT:    s_mov_b32 m0, -1
4647; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4648; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4649; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4650; GFX8-NEXT:  BB24_2:
4651; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4652; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4653; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4654; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4655; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4656; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4657; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4658; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4659; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4660; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4661; GFX8-NEXT:    s_mov_b32 s2, -1
4662; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4663; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4664; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4665; GFX8-NEXT:    s_endpgm
4666;
4667; GFX9-LABEL: umin_i64_constant:
4668; GFX9:       ; %bb.0: ; %entry
4669; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4670; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4671; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4672; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4673; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4674; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4675; GFX9-NEXT:    s_cbranch_execz BB24_2
4676; GFX9-NEXT:  ; %bb.1:
4677; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4678; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4679; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4680; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4681; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4682; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4683; GFX9-NEXT:  BB24_2:
4684; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4685; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4686; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4687; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4688; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4689; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4690; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4691; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4692; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4693; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4694; GFX9-NEXT:    s_mov_b32 s2, -1
4695; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4696; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4697; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4698; GFX9-NEXT:    s_endpgm
4699;
4700; GFX1064-LABEL: umin_i64_constant:
4701; GFX1064:       ; %bb.0: ; %entry
4702; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4703; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4704; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4705; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4706; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4707; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4708; GFX1064-NEXT:    s_cbranch_execz BB24_2
4709; GFX1064-NEXT:  ; %bb.1:
4710; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4711; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4712; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4713; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4714; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4715; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4716; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4717; GFX1064-NEXT:    buffer_gl0_inv
4718; GFX1064-NEXT:  BB24_2:
4719; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4720; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4721; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4722; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4723; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4724; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4725; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4726; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4727; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4728; GFX1064-NEXT:    s_mov_b32 s2, -1
4729; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4730; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4731; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4732; GFX1064-NEXT:    s_endpgm
4733;
4734; GFX1032-LABEL: umin_i64_constant:
4735; GFX1032:       ; %bb.0: ; %entry
4736; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4737; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4738; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4739; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4740; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4741; GFX1032-NEXT:    s_cbranch_execz BB24_2
4742; GFX1032-NEXT:  ; %bb.1:
4743; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4744; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4745; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4746; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4747; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4748; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4749; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX1032-NEXT:    buffer_gl0_inv
4751; GFX1032-NEXT:  BB24_2:
4752; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4753; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4754; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4755; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4756; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4757; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4758; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4759; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4760; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4761; GFX1032-NEXT:    s_mov_b32 s2, -1
4762; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4763; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4764; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4765; GFX1032-NEXT:    s_endpgm
4766entry:
  ; Constant (uniform) operand; %old is the value the atomic returns.
4767  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
4768  store i64 %old, i64 addrspace(1)* %out
4769  ret void
4770}
4771