1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
; Work-item id intrinsic. It is not called by @add_i32_constant or
; @add_i32_uniform; presumably the "varying" kernels use it -- TODO confirm.
10declare i32 @llvm.amdgcn.workitem.id.x()
11
; Targets of the atomicrmw operations in this file. Both live in address
; space 3 and are left uninitialised (undef); the tests only inspect the
; generated instruction sequence, never the stored contents.
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; add_i32_constant -- atomically adds the constant 5 to @local_var32 with
; acq_rel ordering and stores the value returned by the atomicrmw to %out.
;
; Shape pinned down by the assertions below, for every run line:
;   * the lane whose v_mbcnt result is 0 is the only one that issues the
;     ds_add_rtn_u32, and its operand is s_bcnt1(exec) * 5 (s_mul_i32);
;   * the returned value is broadcast with v_readfirstlane_b32;
;   * every lane then adds its own mbcnt * 5 (v_mad_u32_u24) before the
;     buffer store.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
177; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
178; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
179; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
180; GFX1164-NEXT:  ; %bb.1:
181; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
182; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
183; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
184; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
185; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
186; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
187; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
188; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
189; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX1164-NEXT:    buffer_gl0_inv
191; GFX1164-NEXT:  .LBB0_2:
192; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
193; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
194; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
195; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
196; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
197; GFX1164-NEXT:    s_mov_b32 s2, -1
198; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
200; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
201; GFX1164-NEXT:    s_endpgm
202;
203; GFX1132-LABEL: add_i32_constant:
204; GFX1132:       ; %bb.0: ; %entry
205; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
206; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
207; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
208; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
209; GFX1132-NEXT:    ; implicit-def: $vgpr1
210; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
211; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
212; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
213; GFX1132-NEXT:  ; %bb.1:
214; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
215; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
216; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
217; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
218; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
219; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
220; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
221; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX1132-NEXT:    buffer_gl0_inv
223; GFX1132-NEXT:  .LBB0_2:
224; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
225; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
226; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
227; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
228; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
229; GFX1132-NEXT:    s_mov_b32 s2, -1
230; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
232; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
233; GFX1132-NEXT:    s_endpgm
; The block label must stay "entry": the assertions above match it as %entry.
234entry:
; The addend is the immediate 5, i.e. identical in every lane; %old receives
; the value the atomicrmw returns.
235  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
; Storing %old keeps the returned value live, which is why the assertions
; contain the ds_add_rtn_u32 form and the per-lane v_mad_u32_u24 fix-up.
236  store i32 %old, i32 addrspace(1)* %out
237  ret void
238}
239
; add_i32_uniform -- atomically adds the kernel argument %additive to
; @local_var32 with acq_rel ordering and stores the value returned by the
; atomicrmw to %out.
;
; Shape pinned down by the assertions below, for every run line:
;   * the lane whose v_mbcnt result is 0 is the only one that issues the
;     ds_add_rtn_u32, and its operand is %additive * s_bcnt1(exec)
;     (s_mul_i32 of the loaded argument and the bit count);
;   * the returned value is broadcast with v_readfirstlane_b32;
;   * every lane then adds %additive * mbcnt to it, either as v_mul_lo_u32
;     followed by an add or as a single v_mad_u64_u32, depending on the
;     run line.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
240define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
241;
242;
243; GFX7LESS-LABEL: add_i32_uniform:
244; GFX7LESS:       ; %bb.0: ; %entry
245; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
246; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
247; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
248; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
249; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
250; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
251; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
252; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
253; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
254; GFX7LESS-NEXT:  ; %bb.1:
255; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
258; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
259; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
260; GFX7LESS-NEXT:    s_mov_b32 m0, -1
261; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
263; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX7LESS-NEXT:  .LBB1_2:
265; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
266; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
268; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
269; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
270; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
271; GFX7LESS-NEXT:    s_mov_b32 s6, -1
272; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
273; GFX7LESS-NEXT:    s_endpgm
274;
275; GFX8-LABEL: add_i32_uniform:
276; GFX8:       ; %bb.0: ; %entry
277; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
278; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
279; GFX8-NEXT:    s_mov_b64 s[2:3], exec
280; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
281; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
282; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
283; GFX8-NEXT:    ; implicit-def: $vgpr1
284; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
285; GFX8-NEXT:    s_cbranch_execz .LBB1_2
286; GFX8-NEXT:  ; %bb.1:
287; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
288; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX8-NEXT:    s_mul_i32 s2, s6, s2
290; GFX8-NEXT:    v_mov_b32_e32 v1, 0
291; GFX8-NEXT:    v_mov_b32_e32 v2, s2
292; GFX8-NEXT:    s_mov_b32 m0, -1
293; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
295; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
296; GFX8-NEXT:  .LBB1_2:
297; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
298; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
300; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
301; GFX8-NEXT:    s_mov_b32 s7, 0xf000
302; GFX8-NEXT:    s_mov_b32 s6, -1
303; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
304; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
305; GFX8-NEXT:    s_endpgm
306;
307; GFX9-LABEL: add_i32_uniform:
308; GFX9:       ; %bb.0: ; %entry
309; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
310; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
311; GFX9-NEXT:    s_mov_b64 s[2:3], exec
312; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
313; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
314; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
315; GFX9-NEXT:    ; implicit-def: $vgpr1
316; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
317; GFX9-NEXT:    s_cbranch_execz .LBB1_2
318; GFX9-NEXT:  ; %bb.1:
319; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
320; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX9-NEXT:    s_mul_i32 s2, s6, s2
322; GFX9-NEXT:    v_mov_b32_e32 v1, 0
323; GFX9-NEXT:    v_mov_b32_e32 v2, s2
324; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-NEXT:  .LBB1_2:
328; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
329; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
331; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
332; GFX9-NEXT:    s_mov_b32 s7, 0xf000
333; GFX9-NEXT:    s_mov_b32 s6, -1
334; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
335; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
336; GFX9-NEXT:    s_endpgm
337;
338; GFX1064-LABEL: add_i32_uniform:
339; GFX1064:       ; %bb.0: ; %entry
340; GFX1064-NEXT:    s_clause 0x1
341; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
342; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
343; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
344; GFX1064-NEXT:    ; implicit-def: $vgpr1
345; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
346; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
347; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
348; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
349; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
350; GFX1064-NEXT:  ; %bb.1:
351; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
352; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
353; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
355; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
356; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
357; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
358; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
359; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX1064-NEXT:    buffer_gl0_inv
361; GFX1064-NEXT:  .LBB1_2:
362; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
363; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
364; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
365; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
366; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
368; GFX1064-NEXT:    s_mov_b32 s6, -1
369; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
370; GFX1064-NEXT:    s_endpgm
371;
372; GFX1032-LABEL: add_i32_uniform:
373; GFX1032:       ; %bb.0: ; %entry
374; GFX1032-NEXT:    s_clause 0x1
375; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
376; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
377; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
378; GFX1032-NEXT:    ; implicit-def: $vgpr1
379; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
380; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
381; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
382; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
383; GFX1032-NEXT:  ; %bb.1:
384; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
385; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
386; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
388; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
389; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
390; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
391; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
392; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX1032-NEXT:    buffer_gl0_inv
394; GFX1032-NEXT:  .LBB1_2:
395; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
396; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
397; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
398; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
399; GFX1032-NEXT:    s_mov_b32 s6, -1
400; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
402; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
403; GFX1032-NEXT:    s_endpgm
404;
405; GFX1164-LABEL: add_i32_uniform:
406; GFX1164:       ; %bb.0: ; %entry
407; GFX1164-NEXT:    s_clause 0x1
408; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
409; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
410; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
411; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
412; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
413; GFX1164-NEXT:    ; implicit-def: $vgpr1
414; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
415; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
416; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
417; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
418; GFX1164-NEXT:  ; %bb.1:
419; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
420; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
421; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
423; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
424; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
425; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
426; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
427; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
428; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX1164-NEXT:    buffer_gl0_inv
430; GFX1164-NEXT:  .LBB1_2:
431; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
432; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
433; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
434; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
436; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
437; GFX1164-NEXT:    s_mov_b32 s6, -1
438; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
439; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
440; GFX1164-NEXT:    s_endpgm
441;
442; GFX1132-LABEL: add_i32_uniform:
443; GFX1132:       ; %bb.0: ; %entry
444; GFX1132-NEXT:    s_clause 0x1
445; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
446; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
447; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
448; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
449; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
450; GFX1132-NEXT:    ; implicit-def: $vgpr1
451; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
452; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
453; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
454; GFX1132-NEXT:  ; %bb.1:
455; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
456; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
458; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
459; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
460; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
461; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
462; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
463; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX1132-NEXT:    buffer_gl0_inv
465; GFX1132-NEXT:  .LBB1_2:
466; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
467; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
468; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
469; GFX1132-NEXT:    s_mov_b32 s6, -1
470; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
472; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
473; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
474; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
475; GFX1132-NEXT:    s_endpgm
; The block label must stay "entry": the assertions above match it as %entry.
476entry:
; The addend is the kernel argument %additive (loaded with a scalar load in
; the assertions above); %old receives the value the atomicrmw returns.
477  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
; Storing %old keeps the returned value live, which is why the assertions
; contain the ds_add_rtn_u32 form and the per-lane multiply/add fix-up.
478  store i32 %old, i32 addrspace(1)* %out
479  ret void
480}
481
482define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
483;
484;
485; GFX7LESS-LABEL: add_i32_varying:
486; GFX7LESS:       ; %bb.0: ; %entry
487; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
488; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
489; GFX7LESS-NEXT:    s_mov_b32 m0, -1
490; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
492; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
494; GFX7LESS-NEXT:    s_mov_b32 s2, -1
495; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
496; GFX7LESS-NEXT:    s_endpgm
497;
498; GFX8-LABEL: add_i32_varying:
499; GFX8:       ; %bb.0: ; %entry
500; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
501; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
502; GFX8-NEXT:    v_mov_b32_e32 v1, 0
503; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
504; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
505; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
506; GFX8-NEXT:    v_mov_b32_e32 v2, v0
507; GFX8-NEXT:    s_not_b64 exec, exec
508; GFX8-NEXT:    v_mov_b32_e32 v2, 0
509; GFX8-NEXT:    s_not_b64 exec, exec
510; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
511; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
512; GFX8-NEXT:    s_nop 1
513; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
514; GFX8-NEXT:    s_nop 1
515; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
516; GFX8-NEXT:    s_nop 1
517; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
518; GFX8-NEXT:    s_nop 1
519; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
520; GFX8-NEXT:    s_nop 1
521; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
522; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
523; GFX8-NEXT:    s_nop 0
524; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
525; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
526; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
527; GFX8-NEXT:    ; implicit-def: $vgpr0
528; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
529; GFX8-NEXT:    s_cbranch_execz .LBB2_2
530; GFX8-NEXT:  ; %bb.1:
531; GFX8-NEXT:    v_mov_b32_e32 v0, 0
532; GFX8-NEXT:    v_mov_b32_e32 v3, s4
533; GFX8-NEXT:    s_mov_b32 m0, -1
534; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
536; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX8-NEXT:  .LBB2_2:
538; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
539; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
541; GFX8-NEXT:    v_mov_b32_e32 v0, v1
542; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
543; GFX8-NEXT:    s_mov_b32 s3, 0xf000
544; GFX8-NEXT:    s_mov_b32 s2, -1
545; GFX8-NEXT:    s_nop 0
546; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
547; GFX8-NEXT:    s_endpgm
548;
549; GFX9-LABEL: add_i32_varying:
550; GFX9:       ; %bb.0: ; %entry
551; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
552; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
553; GFX9-NEXT:    v_mov_b32_e32 v1, 0
554; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
555; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
556; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
557; GFX9-NEXT:    v_mov_b32_e32 v2, v0
558; GFX9-NEXT:    s_not_b64 exec, exec
559; GFX9-NEXT:    v_mov_b32_e32 v2, 0
560; GFX9-NEXT:    s_not_b64 exec, exec
561; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
562; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
563; GFX9-NEXT:    s_nop 1
564; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
565; GFX9-NEXT:    s_nop 1
566; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
567; GFX9-NEXT:    s_nop 1
568; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
569; GFX9-NEXT:    s_nop 1
570; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
571; GFX9-NEXT:    s_nop 1
572; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
573; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
574; GFX9-NEXT:    s_nop 0
575; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
576; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
577; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
578; GFX9-NEXT:    ; implicit-def: $vgpr0
579; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
580; GFX9-NEXT:    s_cbranch_execz .LBB2_2
581; GFX9-NEXT:  ; %bb.1:
582; GFX9-NEXT:    v_mov_b32_e32 v0, 0
583; GFX9-NEXT:    v_mov_b32_e32 v3, s4
584; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX9-NEXT:  .LBB2_2:
588; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
589; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
591; GFX9-NEXT:    v_mov_b32_e32 v0, v1
592; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
593; GFX9-NEXT:    s_mov_b32 s3, 0xf000
594; GFX9-NEXT:    s_mov_b32 s2, -1
595; GFX9-NEXT:    s_nop 0
596; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
597; GFX9-NEXT:    s_endpgm
598;
599; GFX1064-LABEL: add_i32_varying:
600; GFX1064:       ; %bb.0: ; %entry
601; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
602; GFX1064-NEXT:    s_not_b64 exec, exec
603; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
604; GFX1064-NEXT:    s_not_b64 exec, exec
605; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
606; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
607; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
608; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
609; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
610; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
611; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
612; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
613; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
614; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
615; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
616; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
617; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
618; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
619; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
620; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
621; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
622; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
623; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
624; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
625; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
626; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
627; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
628; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
629; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
630; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
631; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
632; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
633; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
634; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
635; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
636; GFX1064-NEXT:    s_mov_b32 s2, -1
637; GFX1064-NEXT:    ; implicit-def: $vgpr0
638; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
639; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
640; GFX1064-NEXT:  ; %bb.1:
641; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
642; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
643; GFX1064-NEXT:    s_mov_b32 s3, s7
644; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
645; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
646; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
647; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX1064-NEXT:    buffer_gl0_inv
649; GFX1064-NEXT:  .LBB2_2:
650; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
651; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
652; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
653; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
654; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
655; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
656; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
658; GFX1064-NEXT:    s_endpgm
659;
660; GFX1032-LABEL: add_i32_varying:
661; GFX1032:       ; %bb.0: ; %entry
662; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
663; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
664; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
665; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
666; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
667; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
671; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
672; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
673; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
674; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
675; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
676; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
677; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
678; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
679; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
680; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
681; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
682; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
683; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
684; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
685; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
686; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
687; GFX1032-NEXT:    s_mov_b32 s2, -1
688; GFX1032-NEXT:    ; implicit-def: $vgpr0
689; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
690; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
691; GFX1032-NEXT:  ; %bb.1:
692; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
693; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
694; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
695; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
696; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
697; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX1032-NEXT:    buffer_gl0_inv
699; GFX1032-NEXT:  .LBB2_2:
700; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
701; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
702; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
703; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
704; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
705; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
706; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
708; GFX1032-NEXT:    s_endpgm
709;
710; GFX1164-LABEL: add_i32_varying:
711; GFX1164:       ; %bb.0: ; %entry
712; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
713; GFX1164-NEXT:    s_not_b64 exec, exec
714; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
715; GFX1164-NEXT:    s_not_b64 exec, exec
716; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
717; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
718; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
719; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
720; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
721; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
722; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
723; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
724; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
725; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
726; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
727; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
728; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
729; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
730; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
731; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
732; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
733; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
734; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
735; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
736; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
737; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
738; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
739; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
740; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
741; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
742; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
743; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
744; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
745; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
746; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
747; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
748; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
749; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
750; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
751; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
752; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
753; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
754; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
755; GFX1164-NEXT:    s_mov_b32 s2, -1
756; GFX1164-NEXT:    ; implicit-def: $vgpr0
757; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
758; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
759; GFX1164-NEXT:  ; %bb.1:
760; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
761; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
762; GFX1164-NEXT:    s_mov_b32 s3, s7
763; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
764; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
765; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
766; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
767; GFX1164-NEXT:    buffer_gl0_inv
768; GFX1164-NEXT:  .LBB2_2:
769; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
770; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
771; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
772; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
773; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
774; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
775; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
777; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
778; GFX1164-NEXT:    s_endpgm
779;
780; GFX1132-LABEL: add_i32_varying:
781; GFX1132:       ; %bb.0: ; %entry
782; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
783; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
784; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
785; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
786; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
787; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
788; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
789; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
790; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
791; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
792; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
793; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
794; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
795; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
796; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
797; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
798; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
799; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
800; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
801; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
802; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
803; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
804; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
805; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
806; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
807; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
808; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
809; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
810; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
811; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
812; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
813; GFX1132-NEXT:    s_mov_b32 s2, -1
814; GFX1132-NEXT:    ; implicit-def: $vgpr0
815; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
816; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
817; GFX1132-NEXT:  ; %bb.1:
818; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
819; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
820; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
821; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
822; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
823; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
824; GFX1132-NEXT:    buffer_gl0_inv
825; GFX1132-NEXT:  .LBB2_2:
826; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
827; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
828; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
829; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
830; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
831; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
832; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
834; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
835; GFX1132-NEXT:    s_endpgm
836entry:
837  %lane = call i32 @llvm.amdgcn.workitem.id.x()
838  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
839  store i32 %old, i32 addrspace(1)* %out
840  ret void
841}
842
; add_i32_varying_nouse: each work-item adds its own workitem.id.x value to the
; addrspace(3) global @local_var32 with an acq_rel atomicrmw add, and the value
; returned by the atomic (%old) is never used.
; What the autogenerated checks below record for this kernel:
;  - the first run line (no -mcpu) keeps a plain per-lane ds_add_u32 of v0;
;  - the tonga and gfx900 runs chain v_add_u32_dpp steps (row_shr, then
;    row_bcast), read lane 63 of the result, and issue one ds_add_u32 inside a
;    branch taken only where the v_mbcnt result compares equal to 0;
;  - the gfx1010 and gfx1100 runs chain row_xmask DPP adds plus
;    v_permlanex16_b32 (with v_permlane64_b32 on the gfx1100 wave64 run) before
;    the same guarded ds_add_u32.
; Every run uses the non-returning ds_add_u32 form for this kernel.
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
843define amdgpu_kernel void @add_i32_varying_nouse() {
844; GFX7LESS-LABEL: add_i32_varying_nouse:
845; GFX7LESS:       ; %bb.0: ; %entry
846; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
847; GFX7LESS-NEXT:    s_mov_b32 m0, -1
848; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX7LESS-NEXT:    ds_add_u32 v1, v0
850; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX7LESS-NEXT:    s_endpgm
852;
853; GFX8-LABEL: add_i32_varying_nouse:
854; GFX8:       ; %bb.0: ; %entry
855; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
856; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
857; GFX8-NEXT:    v_mov_b32_e32 v1, v0
858; GFX8-NEXT:    s_not_b64 exec, exec
859; GFX8-NEXT:    v_mov_b32_e32 v1, 0
860; GFX8-NEXT:    s_not_b64 exec, exec
861; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
862; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
863; GFX8-NEXT:    s_nop 1
864; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
865; GFX8-NEXT:    s_nop 1
866; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX8-NEXT:    s_nop 1
870; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
871; GFX8-NEXT:    s_nop 1
872; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
873; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
874; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
875; GFX8-NEXT:    s_mov_b32 s0, s2
876; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
877; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
878; GFX8-NEXT:    s_cbranch_execz .LBB3_2
879; GFX8-NEXT:  ; %bb.1:
880; GFX8-NEXT:    v_mov_b32_e32 v0, 0
881; GFX8-NEXT:    v_mov_b32_e32 v2, s0
882; GFX8-NEXT:    s_mov_b32 m0, -1
883; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
884; GFX8-NEXT:    ds_add_u32 v0, v2
885; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX8-NEXT:  .LBB3_2:
887; GFX8-NEXT:    s_endpgm
888;
889; GFX9-LABEL: add_i32_varying_nouse:
890; GFX9:       ; %bb.0: ; %entry
891; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
892; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
893; GFX9-NEXT:    v_mov_b32_e32 v1, v0
894; GFX9-NEXT:    s_not_b64 exec, exec
895; GFX9-NEXT:    v_mov_b32_e32 v1, 0
896; GFX9-NEXT:    s_not_b64 exec, exec
897; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
898; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
899; GFX9-NEXT:    s_nop 1
900; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX9-NEXT:    s_nop 1
902; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX9-NEXT:    s_nop 1
904; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
905; GFX9-NEXT:    s_nop 1
906; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
907; GFX9-NEXT:    s_nop 1
908; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
909; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
910; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
911; GFX9-NEXT:    s_mov_b32 s0, s2
912; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
913; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
914; GFX9-NEXT:    s_cbranch_execz .LBB3_2
915; GFX9-NEXT:  ; %bb.1:
916; GFX9-NEXT:    v_mov_b32_e32 v0, 0
917; GFX9-NEXT:    v_mov_b32_e32 v2, s0
918; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX9-NEXT:    ds_add_u32 v0, v2
920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NEXT:  .LBB3_2:
922; GFX9-NEXT:    s_endpgm
923;
924; GFX1064-LABEL: add_i32_varying_nouse:
925; GFX1064:       ; %bb.0: ; %entry
926; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
927; GFX1064-NEXT:    s_not_b64 exec, exec
928; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
929; GFX1064-NEXT:    s_not_b64 exec, exec
930; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
931; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
932; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
933; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
934; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
935; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
936; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
937; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
938; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
939; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
940; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
941; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
942; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
943; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
944; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
945; GFX1064-NEXT:    s_add_i32 s0, s2, s3
946; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
947; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
948; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
949; GFX1064-NEXT:  ; %bb.1:
950; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
951; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
952; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
953; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
954; GFX1064-NEXT:    ds_add_u32 v0, v3
955; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX1064-NEXT:    buffer_gl0_inv
957; GFX1064-NEXT:  .LBB3_2:
958; GFX1064-NEXT:    s_endpgm
959;
960; GFX1032-LABEL: add_i32_varying_nouse:
961; GFX1032:       ; %bb.0: ; %entry
962; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
963; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
964; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
965; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
966; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
967; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
968; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
969; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
970; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
971; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
972; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
973; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
974; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
975; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
976; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
977; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
978; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
979; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
980; GFX1032-NEXT:  ; %bb.1:
981; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
982; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
983; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
984; GFX1032-NEXT:    ds_add_u32 v3, v0
985; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX1032-NEXT:    buffer_gl0_inv
987; GFX1032-NEXT:  .LBB3_2:
988; GFX1032-NEXT:    s_endpgm
989;
990; GFX1164-LABEL: add_i32_varying_nouse:
991; GFX1164:       ; %bb.0: ; %entry
992; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
993; GFX1164-NEXT:    s_not_b64 exec, exec
994; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
995; GFX1164-NEXT:    s_not_b64 exec, exec
996; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
997; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
998; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
999; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1000; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1001; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1002; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1003; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1004; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
1005; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1006; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1007; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1008; GFX1164-NEXT:    v_permlane64_b32 v2, v1
1009; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1010; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1011; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1012; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
1013; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1014; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1015; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1016; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
1017; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1018; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
1019; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
1020; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
1021; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
1022; GFX1164-NEXT:  ; %bb.1:
1023; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1024; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1025; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1026; GFX1164-NEXT:    ds_add_u32 v3, v0
1027; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX1164-NEXT:    buffer_gl0_inv
1029; GFX1164-NEXT:  .LBB3_2:
1030; GFX1164-NEXT:    s_endpgm
1031;
1032; GFX1132-LABEL: add_i32_varying_nouse:
1033; GFX1132:       ; %bb.0: ; %entry
1034; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
1035; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1036; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1037; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1038; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1039; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1040; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1041; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1042; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1043; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1044; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1045; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1046; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1047; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1048; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1049; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1050; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1051; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1052; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1053; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1054; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1055; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1056; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1057; GFX1132-NEXT:  ; %bb.1:
1058; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1059; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1060; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1061; GFX1132-NEXT:    ds_add_u32 v3, v0
1062; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1063; GFX1132-NEXT:    buffer_gl0_inv
1064; GFX1132-NEXT:  .LBB3_2:
1065; GFX1132-NEXT:    s_endpgm
1066entry:
1067  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1068  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1069  ret void
1070}
1071
; add_i64_constant: atomically adds the constant 5 to the i64 addrspace(3)
; global @local_var64 (acq_rel ordering) and stores the value returned by the
; atomicrmw to %out.
; What the autogenerated checks below record, on every run line:
;  - a copy of exec is saved and v_mbcnt over that copy gives each lane a
;    count (held in v2);
;  - only lanes whose count compares equal to 0 enter %bb.1, which multiplies
;    the s_bcnt1 popcount of the saved exec copy by 5 and issues a single
;    ds_add_rtn_u64;
;  - after exec is restored, v_readfirstlane reads the returned 64-bit value
;    and 5 * v2 is added to it (v_mul_*_u24 plus add/addc on the first run
;    line, v_mad_u64_u32 on the others) before the buffer store to %out.
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
1072define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1073;
1074;
1075; GFX7LESS-LABEL: add_i64_constant:
1076; GFX7LESS:       ; %bb.0: ; %entry
1077; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1078; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1079; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1080; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1081; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1082; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1083; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1084; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1085; GFX7LESS-NEXT:  ; %bb.1:
1086; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1087; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1088; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1089; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1090; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1091; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1093; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX7LESS-NEXT:  .LBB4_2:
1095; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1096; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1098; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1099; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1100; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1101; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1102; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1103; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1104; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1105; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1106; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1107; GFX7LESS-NEXT:    s_endpgm
1108;
1109; GFX8-LABEL: add_i64_constant:
1110; GFX8:       ; %bb.0: ; %entry
1111; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1112; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1113; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1114; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1115; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1116; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1117; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1118; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1119; GFX8-NEXT:  ; %bb.1:
1120; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1121; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1122; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1123; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1124; GFX8-NEXT:    s_mov_b32 m0, -1
1125; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1126; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1127; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX8-NEXT:  .LBB4_2:
1129; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1130; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1131; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1132; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1133; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1134; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1135; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1136; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1137; GFX8-NEXT:    s_mov_b32 s2, -1
1138; GFX8-NEXT:    s_nop 2
1139; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1140; GFX8-NEXT:    s_endpgm
1141;
1142; GFX9-LABEL: add_i64_constant:
1143; GFX9:       ; %bb.0: ; %entry
1144; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1145; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1146; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1147; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1148; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1149; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1150; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1151; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1152; GFX9-NEXT:  ; %bb.1:
1153; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1154; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1155; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1156; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1157; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1159; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX9-NEXT:  .LBB4_2:
1161; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1164; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1165; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1166; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1167; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1168; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1169; GFX9-NEXT:    s_mov_b32 s2, -1
1170; GFX9-NEXT:    s_nop 2
1171; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1172; GFX9-NEXT:    s_endpgm
1173;
1174; GFX1064-LABEL: add_i64_constant:
1175; GFX1064:       ; %bb.0: ; %entry
1176; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1177; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1178; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1179; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1180; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1181; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1182; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1183; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1184; GFX1064-NEXT:  ; %bb.1:
1185; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1186; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1187; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1188; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1189; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1190; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1191; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1192; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1193; GFX1064-NEXT:    buffer_gl0_inv
1194; GFX1064-NEXT:  .LBB4_2:
1195; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1196; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1197; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1198; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1199; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1200; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1201; GFX1064-NEXT:    s_mov_b32 s2, -1
1202; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1204; GFX1064-NEXT:    s_endpgm
1205;
1206; GFX1032-LABEL: add_i64_constant:
1207; GFX1032:       ; %bb.0: ; %entry
1208; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1209; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1210; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1211; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1212; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1213; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1214; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1215; GFX1032-NEXT:  ; %bb.1:
1216; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1217; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1218; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1219; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1220; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1221; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1222; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1223; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1224; GFX1032-NEXT:    buffer_gl0_inv
1225; GFX1032-NEXT:  .LBB4_2:
1226; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1227; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1228; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1229; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1230; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1231; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1232; GFX1032-NEXT:    s_mov_b32 s2, -1
1233; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1234; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1235; GFX1032-NEXT:    s_endpgm
1236;
1237; GFX1164-LABEL: add_i64_constant:
1238; GFX1164:       ; %bb.0: ; %entry
1239; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1240; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1241; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1242; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1243; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1244; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1245; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1246; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1247; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1248; GFX1164-NEXT:  ; %bb.1:
1249; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1250; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1251; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1252; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1253; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1254; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1255; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1256; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1257; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX1164-NEXT:    buffer_gl0_inv
1259; GFX1164-NEXT:  .LBB4_2:
1260; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1261; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1262; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1263; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1264; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1265; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1266; GFX1164-NEXT:    s_mov_b32 s2, -1
1267; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1268; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1269; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1270; GFX1164-NEXT:    s_endpgm
1271;
1272; GFX1132-LABEL: add_i64_constant:
1273; GFX1132:       ; %bb.0: ; %entry
1274; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1275; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1276; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1277; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1278; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1279; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1280; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1281; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1282; GFX1132-NEXT:  ; %bb.1:
1283; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1284; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1285; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1286; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1287; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1288; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1289; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1290; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1291; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1292; GFX1132-NEXT:    buffer_gl0_inv
1293; GFX1132-NEXT:  .LBB4_2:
1294; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1295; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1296; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1297; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1298; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1299; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1300; GFX1132-NEXT:    s_mov_b32 s2, -1
1301; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1302; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1303; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1304; GFX1132-NEXT:    s_endpgm
1305entry:
1306  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1307  store i64 %old, i64 addrspace(1)* %out
1308  ret void
1309}
1310
1311define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1312; Uniform i64 add of the kernel argument %additive to @local_var64. Expected code has the lane with mbcnt == 0
1313; issue one ds_add_rtn_u64 of additive * popcount(exec); each lane then adds additive * mbcnt to the readfirstlane'd result.
1314; GFX7LESS-LABEL: add_i64_uniform:
1315; GFX7LESS:       ; %bb.0: ; %entry
1316; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1317; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1318; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1319; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1320; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1321; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1322; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1323; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1324; GFX7LESS-NEXT:  ; %bb.1:
1325; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1326; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1327; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1328; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1329; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1330; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1331; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1332; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1333; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1334; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1335; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1337; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX7LESS-NEXT:  .LBB5_2:
1339; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1340; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1341; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1342; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1344; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1345; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1346; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1347; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1348; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1349; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1350; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1351; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1352; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1353; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1354; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1355; GFX7LESS-NEXT:    s_endpgm
1356;
1357; GFX8-LABEL: add_i64_uniform:
1358; GFX8:       ; %bb.0: ; %entry
1359; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1360; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1361; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1362; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1363; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1364; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1365; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1366; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1367; GFX8-NEXT:  ; %bb.1:
1368; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1369; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1371; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1372; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1373; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1374; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1375; GFX8-NEXT:    s_mov_b32 m0, -1
1376; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1377; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1378; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX8-NEXT:  .LBB5_2:
1380; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1383; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1384; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1385; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1386; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1387; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1388; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1389; GFX8-NEXT:    s_mov_b32 s6, -1
1390; GFX8-NEXT:    s_mov_b32 s4, s0
1391; GFX8-NEXT:    s_mov_b32 s5, s1
1392; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1393; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1394; GFX8-NEXT:    s_endpgm
1395;
1396; GFX9-LABEL: add_i64_uniform:
1397; GFX9:       ; %bb.0: ; %entry
1398; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1399; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1400; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1401; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1402; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1403; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1404; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1405; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1406; GFX9-NEXT:  ; %bb.1:
1407; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1408; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1410; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1411; GFX9-NEXT:    s_add_i32 s8, s8, s7
1412; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1413; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1414; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1415; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1419; GFX9-NEXT:  .LBB5_2:
1420; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1421; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1423; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1424; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1425; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1426; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1427; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1428; GFX9-NEXT:    s_mov_b32 s6, -1
1429; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1430; GFX9-NEXT:    s_mov_b32 s4, s0
1431; GFX9-NEXT:    s_mov_b32 s5, s1
1432; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1433; GFX9-NEXT:    s_endpgm
1434;
1435; GFX1064-LABEL: add_i64_uniform:
1436; GFX1064:       ; %bb.0: ; %entry
1437; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1438; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1439; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1440; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1441; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1442; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1443; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1444; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1445; GFX1064-NEXT:  ; %bb.1:
1446; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1447; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1448; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1449; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1450; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1451; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1452; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1453; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1454; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1455; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1456; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1457; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1458; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX1064-NEXT:    buffer_gl0_inv
1460; GFX1064-NEXT:  .LBB5_2:
1461; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1462; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1463; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1464; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1465; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1466; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1467; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1468; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1469; GFX1064-NEXT:    s_mov_b32 s2, -1
1470; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1471; GFX1064-NEXT:    s_endpgm
1472;
1473; GFX1032-LABEL: add_i64_uniform:
1474; GFX1032:       ; %bb.0: ; %entry
1475; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1476; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1477; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1478; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1479; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1480; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1481; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1482; GFX1032-NEXT:  ; %bb.1:
1483; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1484; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1485; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1486; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1487; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1488; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1489; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1490; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1491; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1492; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1493; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1494; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1495; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1496; GFX1032-NEXT:    buffer_gl0_inv
1497; GFX1032-NEXT:  .LBB5_2:
1498; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1499; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1500; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1501; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1502; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1504; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1505; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1506; GFX1032-NEXT:    s_mov_b32 s2, -1
1507; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1508; GFX1032-NEXT:    s_endpgm
1509;
1510; GFX1164-LABEL: add_i64_uniform:
1511; GFX1164:       ; %bb.0: ; %entry
1512; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1513; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1514; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1515; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1516; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1517; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1518; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1519; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1520; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1521; GFX1164-NEXT:  ; %bb.1:
1522; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1523; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1524; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1526; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1527; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1528; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1529; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1530; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1531; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1532; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1533; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1534; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1535; GFX1164-NEXT:    buffer_gl0_inv
1536; GFX1164-NEXT:  .LBB5_2:
1537; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1538; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1539; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1540; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1541; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1542; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1543; GFX1164-NEXT:    s_mov_b32 s2, -1
1544; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1545; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1546; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1547; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1548; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1549; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1550; GFX1164-NEXT:    s_endpgm
1551;
1552; GFX1132-LABEL: add_i64_uniform:
1553; GFX1132:       ; %bb.0: ; %entry
1554; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1555; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1556; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1557; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1558; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1559; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1560; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1561; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1562; GFX1132-NEXT:  ; %bb.1:
1563; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1564; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1565; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1566; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1567; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1568; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1569; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1570; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1571; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
1572; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1573; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1574; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1575; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX1132-NEXT:    buffer_gl0_inv
1577; GFX1132-NEXT:  .LBB5_2:
1578; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1579; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1580; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1581; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1582; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1583; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1584; GFX1132-NEXT:    s_mov_b32 s2, -1
1585; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1586; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1587; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1588; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1589; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1590; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1591; GFX1132-NEXT:    s_endpgm
1592entry:
1593  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1594  store i64 %old, i64 addrspace(1)* %out
1595  ret void
1596}
1597
1598define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1599; Varying i64 add to @local_var64, where the operand is the zero-extended workitem id. The expected code for every
1600; checked target is a plain ds_add_rtn_u64 with no mbcnt / saveexec single-lane sequence around it.
1601; GFX7LESS-LABEL: add_i64_varying:
1602; GFX7LESS:       ; %bb.0: ; %entry
1603; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1604; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1605; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1606; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1607; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1608; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1610; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1611; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1612; GFX7LESS-NEXT:    s_endpgm
1613;
1614; GFX8-LABEL: add_i64_varying:
1615; GFX8:       ; %bb.0: ; %entry
1616; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1617; GFX8-NEXT:    s_mov_b32 m0, -1
1618; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1619; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1620; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1621; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1623; GFX8-NEXT:    s_mov_b32 s2, -1
1624; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1625; GFX8-NEXT:    s_endpgm
1626;
1627; GFX9-LABEL: add_i64_varying:
1628; GFX9:       ; %bb.0: ; %entry
1629; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1630; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1631; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1633; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1635; GFX9-NEXT:    s_mov_b32 s2, -1
1636; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1637; GFX9-NEXT:    s_endpgm
1638;
1639; GFX10-LABEL: add_i64_varying:
1640; GFX10:       ; %bb.0: ; %entry
1641; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1642; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1643; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1644; GFX10-NEXT:    s_mov_b32 s2, -1
1645; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1646; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1647; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1648; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX10-NEXT:    buffer_gl0_inv
1650; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1651; GFX10-NEXT:    s_endpgm
1652;
1653; GFX11-LABEL: add_i64_varying:
1654; GFX11:       ; %bb.0: ; %entry
1655; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1656; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1657; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1658; GFX11-NEXT:    s_mov_b32 s2, -1
1659; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1660; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1661; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1662; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1663; GFX11-NEXT:    buffer_gl0_inv
1664; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1665; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1666; GFX11-NEXT:    s_endpgm
1667entry:
1668  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1669  %zext = zext i32 %lane to i64
1670  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1671  store i64 %old, i64 addrspace(1)* %out
1672  ret void
1673}
1674
1675define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1676; Constant i32 sub of 5 from @local_var32. Expected code has the lane with mbcnt == 0 issue one ds_sub_rtn_u32 of
1677; popcount(exec) * 5; each lane then subtracts 5 * mbcnt from the readfirstlane'd result before storing it to %out.
1678; GFX7LESS-LABEL: sub_i32_constant:
1679; GFX7LESS:       ; %bb.0: ; %entry
1680; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1681; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1682; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1683; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1684; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1685; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1686; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1687; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1688; GFX7LESS-NEXT:  ; %bb.1:
1689; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1690; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1691; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1692; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1693; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1694; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1696; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX7LESS-NEXT:  .LBB7_2:
1698; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1699; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1700; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1701; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1702; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1703; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1704; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1705; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1706; GFX7LESS-NEXT:    s_endpgm
1707;
1708; GFX8-LABEL: sub_i32_constant:
1709; GFX8:       ; %bb.0: ; %entry
1710; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1711; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1712; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1713; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1714; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1715; GFX8-NEXT:    ; implicit-def: $vgpr1
1716; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1717; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1718; GFX8-NEXT:  ; %bb.1:
1719; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1720; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1721; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1722; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1723; GFX8-NEXT:    s_mov_b32 m0, -1
1724; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1726; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX8-NEXT:  .LBB7_2:
1728; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1729; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1730; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1731; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1732; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1733; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1734; GFX8-NEXT:    s_mov_b32 s2, -1
1735; GFX8-NEXT:    s_nop 0
1736; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1737; GFX8-NEXT:    s_endpgm
1738;
1739; GFX9-LABEL: sub_i32_constant:
1740; GFX9:       ; %bb.0: ; %entry
1741; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1742; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1743; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1744; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1745; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1746; GFX9-NEXT:    ; implicit-def: $vgpr1
1747; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1748; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1749; GFX9-NEXT:  ; %bb.1:
1750; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1751; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1752; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1753; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1756; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1757; GFX9-NEXT:  .LBB7_2:
1758; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1759; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1760; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1761; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1762; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1763; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1764; GFX9-NEXT:    s_mov_b32 s2, -1
1765; GFX9-NEXT:    s_nop 0
1766; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1767; GFX9-NEXT:    s_endpgm
1768;
1769; GFX1064-LABEL: sub_i32_constant:
1770; GFX1064:       ; %bb.0: ; %entry
1771; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1772; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1773; GFX1064-NEXT:    ; implicit-def: $vgpr1
1774; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1775; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1776; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1777; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1778; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1779; GFX1064-NEXT:  ; %bb.1:
1780; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1781; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1782; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1783; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1784; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1785; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1786; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1787; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1788; GFX1064-NEXT:    buffer_gl0_inv
1789; GFX1064-NEXT:  .LBB7_2:
1790; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1791; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1792; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1793; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1794; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1795; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1796; GFX1064-NEXT:    s_mov_b32 s2, -1
1797; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1798; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1799; GFX1064-NEXT:    s_endpgm
1800;
1801; GFX1032-LABEL: sub_i32_constant:
1802; GFX1032:       ; %bb.0: ; %entry
1803; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1804; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1805; GFX1032-NEXT:    ; implicit-def: $vgpr1
1806; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1807; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1808; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1809; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1810; GFX1032-NEXT:  ; %bb.1:
1811; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1812; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1813; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1814; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1815; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1816; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1817; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1818; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1819; GFX1032-NEXT:    buffer_gl0_inv
1820; GFX1032-NEXT:  .LBB7_2:
1821; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1822; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1823; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1824; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1825; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1826; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1827; GFX1032-NEXT:    s_mov_b32 s2, -1
1828; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1830; GFX1032-NEXT:    s_endpgm
1831;
1832; GFX1164-LABEL: sub_i32_constant:
1833; GFX1164:       ; %bb.0: ; %entry
1834; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1835; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1836; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1837; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1838; GFX1164-NEXT:    ; implicit-def: $vgpr1
1839; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1840; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1841; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1842; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1843; GFX1164-NEXT:  ; %bb.1:
1844; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1845; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1846; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1847; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1848; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1849; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1850; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1851; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1852; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1853; GFX1164-NEXT:    buffer_gl0_inv
1854; GFX1164-NEXT:  .LBB7_2:
1855; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1856; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1857; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1858; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1859; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1860; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1861; GFX1164-NEXT:    s_mov_b32 s2, -1
1862; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1864; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1865; GFX1164-NEXT:    s_endpgm
1866;
1867; GFX1132-LABEL: sub_i32_constant:
1868; GFX1132:       ; %bb.0: ; %entry
1869; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1870; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1871; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1872; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1873; GFX1132-NEXT:    ; implicit-def: $vgpr1
1874; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1875; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1876; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1877; GFX1132-NEXT:  ; %bb.1:
1878; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1879; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1880; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1881; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
1882; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1883; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1884; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1885; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX1132-NEXT:    buffer_gl0_inv
1887; GFX1132-NEXT:  .LBB7_2:
1888; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1889; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1890; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1891; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1892; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1893; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1894; GFX1132-NEXT:    s_mov_b32 s2, -1
1895; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1896; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1897; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1898; GFX1132-NEXT:    s_endpgm
1899entry:
1900  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1901  store i32 %old, i32 addrspace(1)* %out
1902  ret void
1903}
1904
1905define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1906;
1907;
1908; GFX7LESS-LABEL: sub_i32_uniform:
1909; GFX7LESS:       ; %bb.0: ; %entry
1910; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1911; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1912; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1913; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1914; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1915; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1916; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1917; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1918; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1919; GFX7LESS-NEXT:  ; %bb.1:
1920; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1921; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1923; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1924; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1925; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1926; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1928; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX7LESS-NEXT:  .LBB8_2:
1930; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1931; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1933; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1934; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1935; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1936; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1937; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1938; GFX7LESS-NEXT:    s_endpgm
1939;
1940; GFX8-LABEL: sub_i32_uniform:
1941; GFX8:       ; %bb.0: ; %entry
1942; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1943; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1944; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1945; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1946; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1947; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1948; GFX8-NEXT:    ; implicit-def: $vgpr1
1949; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1950; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1951; GFX8-NEXT:  ; %bb.1:
1952; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1955; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1956; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1957; GFX8-NEXT:    s_mov_b32 m0, -1
1958; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1959; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1960; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1961; GFX8-NEXT:  .LBB8_2:
1962; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1963; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1964; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1965; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1966; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1967; GFX8-NEXT:    s_mov_b32 s6, -1
1968; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1969; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1970; GFX8-NEXT:    s_endpgm
1971;
1972; GFX9-LABEL: sub_i32_uniform:
1973; GFX9:       ; %bb.0: ; %entry
1974; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1975; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1976; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1977; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1978; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1979; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1980; GFX9-NEXT:    ; implicit-def: $vgpr1
1981; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1982; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1983; GFX9-NEXT:  ; %bb.1:
1984; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1985; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1986; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1987; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1988; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1991; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1992; GFX9-NEXT:  .LBB8_2:
1993; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1995; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1996; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1997; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1998; GFX9-NEXT:    s_mov_b32 s6, -1
1999; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
2000; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2001; GFX9-NEXT:    s_endpgm
2002;
2003; GFX1064-LABEL: sub_i32_uniform:
2004; GFX1064:       ; %bb.0: ; %entry
2005; GFX1064-NEXT:    s_clause 0x1
2006; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2007; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
2008; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2009; GFX1064-NEXT:    ; implicit-def: $vgpr1
2010; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2011; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2012; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2013; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2014; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
2015; GFX1064-NEXT:  ; %bb.1:
2016; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2017; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2018; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
2020; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
2021; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2022; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2023; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2024; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX1064-NEXT:    buffer_gl0_inv
2026; GFX1064-NEXT:  .LBB8_2:
2027; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2028; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
2029; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
2031; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
2032; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2033; GFX1064-NEXT:    s_mov_b32 s6, -1
2034; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2035; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2036; GFX1064-NEXT:    s_endpgm
2037;
2038; GFX1032-LABEL: sub_i32_uniform:
2039; GFX1032:       ; %bb.0: ; %entry
2040; GFX1032-NEXT:    s_clause 0x1
2041; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2042; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
2043; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2044; GFX1032-NEXT:    ; implicit-def: $vgpr1
2045; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
2046; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2047; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2048; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
2049; GFX1032-NEXT:  ; %bb.1:
2050; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
2051; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2052; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
2054; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
2055; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2056; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2057; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2058; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2059; GFX1032-NEXT:    buffer_gl0_inv
2060; GFX1032-NEXT:  .LBB8_2:
2061; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2062; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2063; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2065; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
2066; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2067; GFX1032-NEXT:    s_mov_b32 s6, -1
2068; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2069; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2070; GFX1032-NEXT:    s_endpgm
2071;
2072; GFX1164-LABEL: sub_i32_uniform:
2073; GFX1164:       ; %bb.0: ; %entry
2074; GFX1164-NEXT:    s_clause 0x1
2075; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2076; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2077; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2078; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2079; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2080; GFX1164-NEXT:    ; implicit-def: $vgpr1
2081; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2082; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2083; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2084; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2085; GFX1164-NEXT:  ; %bb.1:
2086; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2087; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2088; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2089; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2090; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2091; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2092; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2093; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2094; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2095; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX1164-NEXT:    buffer_gl0_inv
2097; GFX1164-NEXT:  .LBB8_2:
2098; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2099; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2100; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2101; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2102; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2103; GFX1164-NEXT:    s_mov_b32 s6, -1
2104; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2105; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2106; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2107; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2108; GFX1164-NEXT:    s_endpgm
2109;
2110; GFX1132-LABEL: sub_i32_uniform:
2111; GFX1132:       ; %bb.0: ; %entry
2112; GFX1132-NEXT:    s_clause 0x1
2113; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2114; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2115; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2116; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2117; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2118; GFX1132-NEXT:    ; implicit-def: $vgpr1
2119; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2120; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2121; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2122; GFX1132-NEXT:  ; %bb.1:
2123; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2124; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2125; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2126; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2127; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
2128; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2129; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2130; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2131; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX1132-NEXT:    buffer_gl0_inv
2133; GFX1132-NEXT:  .LBB8_2:
2134; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2135; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2137; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2138; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2139; GFX1132-NEXT:    s_mov_b32 s6, -1
2140; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2141; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2142; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2143; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2144; GFX1132-NEXT:    s_endpgm
2145entry:
2146  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
2147  store i32 %old, i32 addrspace(1)* %out
2148  ret void
2149}
2150
2151define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2152; The subtracted operand is %lane, the result of llvm.amdgcn.workitem.id.x, and
2153; the value returned by the atomicrmw is stored to %out.
2154; GFX7LESS-LABEL: sub_i32_varying:
2155; GFX7LESS:       ; %bb.0: ; %entry
2156; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2157; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2158; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2159; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2161; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2162; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2163; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2164; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2165; GFX7LESS-NEXT:    s_endpgm
2166;
2167; GFX8-LABEL: sub_i32_varying:
2168; GFX8:       ; %bb.0: ; %entry
2169; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2170; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2171; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2172; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2173; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2174; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2175; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2176; GFX8-NEXT:    s_not_b64 exec, exec
2177; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2178; GFX8-NEXT:    s_not_b64 exec, exec
2179; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2180; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2181; GFX8-NEXT:    s_nop 1
2182; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2183; GFX8-NEXT:    s_nop 1
2184; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2185; GFX8-NEXT:    s_nop 1
2186; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2187; GFX8-NEXT:    s_nop 1
2188; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2189; GFX8-NEXT:    s_nop 1
2190; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2191; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2192; GFX8-NEXT:    s_nop 0
2193; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2194; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2195; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2196; GFX8-NEXT:    ; implicit-def: $vgpr0
2197; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2198; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2199; GFX8-NEXT:  ; %bb.1:
2200; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2201; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2202; GFX8-NEXT:    s_mov_b32 m0, -1
2203; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2204; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2205; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2206; GFX8-NEXT:  .LBB9_2:
2207; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2208; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2209; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2210; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2211; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2212; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2213; GFX8-NEXT:    s_mov_b32 s2, -1
2214; GFX8-NEXT:    s_nop 0
2215; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2216; GFX8-NEXT:    s_endpgm
2217;
2218; GFX9-LABEL: sub_i32_varying:
2219; GFX9:       ; %bb.0: ; %entry
2220; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2221; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2222; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2223; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2224; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2225; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2226; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2227; GFX9-NEXT:    s_not_b64 exec, exec
2228; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2229; GFX9-NEXT:    s_not_b64 exec, exec
2230; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2231; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2232; GFX9-NEXT:    s_nop 1
2233; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2234; GFX9-NEXT:    s_nop 1
2235; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2236; GFX9-NEXT:    s_nop 1
2237; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2238; GFX9-NEXT:    s_nop 1
2239; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2240; GFX9-NEXT:    s_nop 1
2241; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2242; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2243; GFX9-NEXT:    s_nop 0
2244; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2245; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2246; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2247; GFX9-NEXT:    ; implicit-def: $vgpr0
2248; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2249; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2250; GFX9-NEXT:  ; %bb.1:
2251; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2252; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2253; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2254; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2256; GFX9-NEXT:  .LBB9_2:
2257; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2259; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2260; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2261; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2262; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2263; GFX9-NEXT:    s_mov_b32 s2, -1
2264; GFX9-NEXT:    s_nop 0
2265; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2266; GFX9-NEXT:    s_endpgm
2267;
2268; GFX1064-LABEL: sub_i32_varying:
2269; GFX1064:       ; %bb.0: ; %entry
2270; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2271; GFX1064-NEXT:    s_not_b64 exec, exec
2272; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2273; GFX1064-NEXT:    s_not_b64 exec, exec
2274; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2275; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2276; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2277; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2278; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2279; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2280; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2281; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2282; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2283; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2284; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2285; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2286; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2287; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2288; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2289; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2290; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2291; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2292; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2293; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2294; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2295; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2296; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2297; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2298; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2299; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2300; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2301; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2302; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2303; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2304; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2305; GFX1064-NEXT:    s_mov_b32 s2, -1
2306; GFX1064-NEXT:    ; implicit-def: $vgpr0
2307; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2308; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2309; GFX1064-NEXT:  ; %bb.1:
2310; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2311; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2312; GFX1064-NEXT:    s_mov_b32 s3, s7
2313; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2314; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2315; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2316; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX1064-NEXT:    buffer_gl0_inv
2318; GFX1064-NEXT:  .LBB9_2:
2319; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2320; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2321; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2322; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2323; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2324; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2325; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2327; GFX1064-NEXT:    s_endpgm
2328;
2329; GFX1032-LABEL: sub_i32_varying:
2330; GFX1032:       ; %bb.0: ; %entry
2331; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2332; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2333; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2334; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2335; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2336; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2337; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2338; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2339; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2340; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2341; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2342; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2343; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2344; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2345; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2346; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2347; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2348; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2349; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2350; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2351; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2352; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2353; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2354; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2355; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2356; GFX1032-NEXT:    s_mov_b32 s2, -1
2357; GFX1032-NEXT:    ; implicit-def: $vgpr0
2358; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2359; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2360; GFX1032-NEXT:  ; %bb.1:
2361; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2362; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2363; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2364; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2365; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2366; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2367; GFX1032-NEXT:    buffer_gl0_inv
2368; GFX1032-NEXT:  .LBB9_2:
2369; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2370; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2371; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2372; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2373; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2374; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2375; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2376; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2377; GFX1032-NEXT:    s_endpgm
2378;
2379; GFX1164-LABEL: sub_i32_varying:
2380; GFX1164:       ; %bb.0: ; %entry
2381; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2382; GFX1164-NEXT:    s_not_b64 exec, exec
2383; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2384; GFX1164-NEXT:    s_not_b64 exec, exec
2385; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2386; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2387; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2388; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2389; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2390; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2391; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2392; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2393; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2394; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2395; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2396; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2397; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2398; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2399; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2400; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2401; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2402; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2403; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2404; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2405; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2406; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2407; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2408; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2409; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2410; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2411; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2412; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2413; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2414; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2415; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2416; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2417; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2418; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2419; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2420; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2421; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2422; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2423; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2424; GFX1164-NEXT:    s_mov_b32 s2, -1
2425; GFX1164-NEXT:    ; implicit-def: $vgpr0
2426; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2427; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2428; GFX1164-NEXT:  ; %bb.1:
2429; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2430; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2431; GFX1164-NEXT:    s_mov_b32 s3, s7
2432; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2433; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2434; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2435; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2436; GFX1164-NEXT:    buffer_gl0_inv
2437; GFX1164-NEXT:  .LBB9_2:
2438; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2439; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2440; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2441; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2442; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2443; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2444; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2445; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2446; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2447; GFX1164-NEXT:    s_endpgm
2448;
2449; GFX1132-LABEL: sub_i32_varying:
2450; GFX1132:       ; %bb.0: ; %entry
2451; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2452; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2453; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2454; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2455; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2456; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2457; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2458; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2459; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2460; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2461; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2462; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2463; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2464; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2465; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2466; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2467; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2468; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2469; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2470; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2471; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2472; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2473; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2474; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2475; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2476; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2477; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2478; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2479; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2480; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2481; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2482; GFX1132-NEXT:    s_mov_b32 s2, -1
2483; GFX1132-NEXT:    ; implicit-def: $vgpr0
2484; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2485; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2486; GFX1132-NEXT:  ; %bb.1:
2487; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2488; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2489; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2490; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2491; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2492; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX1132-NEXT:    buffer_gl0_inv
2494; GFX1132-NEXT:  .LBB9_2:
2495; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2496; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2497; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2498; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2499; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2500; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2501; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2502; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2503; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2504; GFX1132-NEXT:    s_endpgm
2505entry:
2506  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id, used below as the value to subtract
2507  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; atomic subtract on the addrspace(3) global; %old is the value it returns
2508  store i32 %old, i32 addrspace(1)* %out ; write the returned value through the kernel's output pointer
2509  ret void
2510}
2511
2512define amdgpu_kernel void @sub_i32_varying_nouse() {
2513; GFX7LESS-LABEL: sub_i32_varying_nouse:
2514; GFX7LESS:       ; %bb.0: ; %entry
2515; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2516; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2517; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2518; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
2519; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2520; GFX7LESS-NEXT:    s_endpgm
2521;
2522; GFX8-LABEL: sub_i32_varying_nouse:
2523; GFX8:       ; %bb.0: ; %entry
2524; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2525; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2526; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2527; GFX8-NEXT:    s_not_b64 exec, exec
2528; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2529; GFX8-NEXT:    s_not_b64 exec, exec
2530; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
2531; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2532; GFX8-NEXT:    s_nop 1
2533; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2534; GFX8-NEXT:    s_nop 1
2535; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2536; GFX8-NEXT:    s_nop 1
2537; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2538; GFX8-NEXT:    s_nop 1
2539; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2540; GFX8-NEXT:    s_nop 1
2541; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2542; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
2543; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
2544; GFX8-NEXT:    s_mov_b32 s0, s2
2545; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2546; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2547; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2548; GFX8-NEXT:  ; %bb.1:
2549; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2550; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2551; GFX8-NEXT:    s_mov_b32 m0, -1
2552; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX8-NEXT:    ds_sub_u32 v0, v2
2554; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2555; GFX8-NEXT:  .LBB10_2:
2556; GFX8-NEXT:    s_endpgm
2557;
2558; GFX9-LABEL: sub_i32_varying_nouse:
2559; GFX9:       ; %bb.0: ; %entry
2560; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2561; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2562; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2563; GFX9-NEXT:    s_not_b64 exec, exec
2564; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2565; GFX9-NEXT:    s_not_b64 exec, exec
2566; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
2567; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2568; GFX9-NEXT:    s_nop 1
2569; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2570; GFX9-NEXT:    s_nop 1
2571; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2572; GFX9-NEXT:    s_nop 1
2573; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2574; GFX9-NEXT:    s_nop 1
2575; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2576; GFX9-NEXT:    s_nop 1
2577; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2578; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
2579; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
2580; GFX9-NEXT:    s_mov_b32 s0, s2
2581; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2582; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2583; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2584; GFX9-NEXT:  ; %bb.1:
2585; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2586; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2587; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2588; GFX9-NEXT:    ds_sub_u32 v0, v2
2589; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2590; GFX9-NEXT:  .LBB10_2:
2591; GFX9-NEXT:    s_endpgm
2592;
2593; GFX1064-LABEL: sub_i32_varying_nouse:
2594; GFX1064:       ; %bb.0: ; %entry
2595; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2596; GFX1064-NEXT:    s_not_b64 exec, exec
2597; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2598; GFX1064-NEXT:    s_not_b64 exec, exec
2599; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2600; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2601; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2602; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2603; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2604; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2605; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2606; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2607; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2608; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2609; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2610; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
2611; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
2612; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2613; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2614; GFX1064-NEXT:    s_add_i32 s0, s2, s3
2615; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2616; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2617; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2618; GFX1064-NEXT:  ; %bb.1:
2619; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2620; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
2621; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2622; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2623; GFX1064-NEXT:    ds_sub_u32 v0, v3
2624; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2625; GFX1064-NEXT:    buffer_gl0_inv
2626; GFX1064-NEXT:  .LBB10_2:
2627; GFX1064-NEXT:    s_endpgm
2628;
2629; GFX1032-LABEL: sub_i32_varying_nouse:
2630; GFX1032:       ; %bb.0: ; %entry
2631; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2632; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2633; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2634; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2635; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
2636; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2637; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2638; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2639; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2640; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2641; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2642; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2643; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
2644; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2645; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2646; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2647; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2648; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2649; GFX1032-NEXT:  ; %bb.1:
2650; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2651; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2652; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2653; GFX1032-NEXT:    ds_sub_u32 v3, v0
2654; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2655; GFX1032-NEXT:    buffer_gl0_inv
2656; GFX1032-NEXT:  .LBB10_2:
2657; GFX1032-NEXT:    s_endpgm
2658;
2659; GFX1164-LABEL: sub_i32_varying_nouse:
2660; GFX1164:       ; %bb.0: ; %entry
2661; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2662; GFX1164-NEXT:    s_not_b64 exec, exec
2663; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2664; GFX1164-NEXT:    s_not_b64 exec, exec
2665; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2666; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2667; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2668; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2669; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2670; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2671; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2672; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2673; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2674; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2675; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2676; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2677; GFX1164-NEXT:    v_permlane64_b32 v2, v1
2678; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2679; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2680; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2681; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2682; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2683; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2684; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
2685; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
2686; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2687; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
2688; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2689; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
2690; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2691; GFX1164-NEXT:  ; %bb.1:
2692; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2693; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2694; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2695; GFX1164-NEXT:    ds_sub_u32 v3, v0
2696; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX1164-NEXT:    buffer_gl0_inv
2698; GFX1164-NEXT:  .LBB10_2:
2699; GFX1164-NEXT:    s_endpgm
2700;
2701; GFX1132-LABEL: sub_i32_varying_nouse:
2702; GFX1132:       ; %bb.0: ; %entry
2703; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2704; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2705; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2706; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2707; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
2708; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2709; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2710; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2711; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2712; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2713; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2714; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2715; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2716; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2717; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2718; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2719; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
2720; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2721; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2722; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
2723; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
2724; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
2725; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
2726; GFX1132-NEXT:  ; %bb.1:
2727; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2728; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2729; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2730; GFX1132-NEXT:    ds_sub_u32 v3, v0
2731; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2732; GFX1132-NEXT:    buffer_gl0_inv
2733; GFX1132-NEXT:  .LBB10_2:
2734; GFX1132-NEXT:    s_endpgm
2735entry:
2736  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id, used below as the value to subtract
2737  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; atomic subtract on the addrspace(3) global; %old is never read in this function
2738  ret void
2739}
2740
; sub_i64_constant
;
; Kernel under test: a 64-bit `atomicrmw sub` of the constant 5 on the
; addrspace(3) (local) global @local_var64 with acq_rel ordering; the value
; returned by the atomic is stored to %out.
;
; What the checked code shows on every target:
;   * the lane whose mbcnt result is 0 is the only one that executes the
;     atomic, and it subtracts (number of active lanes) * 5 in a single
;     ds_sub_rtn_u64 (s_bcnt1 of the saved exec mask followed by s_mul_i32);
;   * after exec is restored, the atomic's result is broadcast with
;     v_readfirstlane and each lane subtracts 5 * (its mbcnt index) from it
;     to form its own %old before the buffer store.
;
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB11_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[4:5], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB11_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT:    s_mul_i32 s4, s4, 5
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB11_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB11_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT:    s_mul_i32 s4, s4, 5
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB11_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB11_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB11_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB11_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB11_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  ; Constant operand: every active lane subtracts the same value, 5.
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; %old is stored, so the returning (rtn) form of the atomic is required and
  ; the checked code has to rebuild a distinct result for each lane.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
2993
; sub_i64_uniform
;
; Kernel under test: a 64-bit `atomicrmw sub` on the addrspace(3) (local)
; global @local_var64 with acq_rel ordering, where the operand is the kernel
; argument %subitive (the same value in every lane); the value returned by
; the atomic is stored to %out.
;
; What the checked code shows on every target:
;   * the lane whose mbcnt result is 0 is the only one that executes the
;     atomic; it subtracts %subitive * (number of active lanes), built as a
;     64-bit product from 32-bit multiplies of the s_bcnt1 result, in a
;     single ds_sub_rtn_u64;
;   * after exec is restored, the atomic's result is broadcast with
;     v_readfirstlane and each lane subtracts %subitive * (its mbcnt index)
;     from it to form its own %old before the buffer store.
;
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
;
;
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB12_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s4, s0
; GFX7LESS-NEXT:    s_mov_b32 s5, s1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[6:7], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB12_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
; GFX8-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
; GFX8-NEXT:    s_mul_i32 s6, s3, s8
; GFX8-NEXT:    v_mov_b32_e32 v3, 0
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB12_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, s0
; GFX8-NEXT:    s_mov_b32 s5, s1
; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-NEXT:    s_mov_b32 s6, -1
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB12_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s3, s6
; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT:    s_add_i32 s8, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s2, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB12_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
; GFX1064-NEXT:    s_add_i32 s8, s8, s7
; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB12_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
; GFX1032-NEXT:    s_add_i32 s7, s7, s6
; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB12_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: sub_i64_uniform:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
; GFX1164-NEXT:    s_add_i32 s8, s8, s7
; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB12_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: sub_i64_uniform:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
; GFX1132-NEXT:    s_add_i32 s7, s7, s6
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB12_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  ; Uniform operand: %subitive is a kernel argument, identical in all lanes.
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  ; %old is stored, so the returning (rtn) form of the atomic is required and
  ; the checked code has to rebuild a distinct result for each lane.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
3293
; sub_i64_varying
;
; Kernel under test: a 64-bit `atomicrmw sub` on the addrspace(3) (local)
; global @local_var64 with acq_rel ordering, where the operand is the
; zero-extended workitem id (a different value in each lane); the value
; returned by the atomic is stored to %out.
;
; What the checked code shows: on every target each lane issues its own
; ds_sub_rtn_u64 directly. There is no mbcnt / single-lane branch /
; readfirstlane sequence like the one checked for @sub_i64_constant and
; @sub_i64_uniform, i.e. no cross-lane reduction is emitted for this case.
; NOTE(review): presumably the atomic optimizer does not handle divergent
; 64-bit operands at this revision -- confirm against the pass before
; relying on that reading.
;
; The wave32 and wave64 runs produce identical code here, so the checks use
; the shared GFX10 and GFX11 prefixes from the RUN lines rather than the
; per-wave-size ones.
;
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i64_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  ; Divergent operand: each lane subtracts its own workitem id, widened to
  ; i64 with a zero high half.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; %old is stored, so the returning (rtn) form of the atomic is required.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
3370
; and_i32_varying performs `atomicrmw and` on the addrspace(3) global
; @local_var32, using the result of llvm.amdgcn.workitem.id.x as the operand,
; so the operand is not uniform across the wave. The value returned by the
; atomic is stored to %out.
;
; Summary of what the autogenerated checks below show
;  - GFX7LESS has no cross-lane combination. ds_and_rtn_b32 is issued directly
;    with v0 as its data operand.
;  - GFX8 and GFX9 write -1 (the identity for AND) into the lanes outside
;    exec, combine lanes with v_and_b32_dpp (row_shr 1/2/4/8, then row_bcast
;    15/31), read the combined value from lane 63, and let only lanes whose
;    v_mbcnt result equals 0 execute a single ds_and_rtn_b32. The returned
;    value is broadcast with v_readfirstlane_b32 and ANDed with the lane's
;    shifted partial result.
;  - The GFX10 and GFX11 variants have the same overall shape but cross rows
;    with v_permlanex16_b32 plus v_readlane_b32/v_writelane_b32. The wave32
;    runs take the combined value from lane 31, the wave64 runs from lane 63.
;
; The check lines come from utils/update_llc_test_checks.py. Regenerate them
; with that script rather than editing them by hand.
3371define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3372;
3373;
3374; GFX7LESS-LABEL: and_i32_varying:
3375; GFX7LESS:       ; %bb.0: ; %entry
3376; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3377; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3378; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3379; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3380; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3381; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3382; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3383; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3384; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3385; GFX7LESS-NEXT:    s_endpgm
3386;
3387; GFX8-LABEL: and_i32_varying:
3388; GFX8:       ; %bb.0: ; %entry
3389; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3390; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3391; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3392; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3393; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3394; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3395; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3396; GFX8-NEXT:    s_not_b64 exec, exec
3397; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3398; GFX8-NEXT:    s_not_b64 exec, exec
3399; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3400; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3401; GFX8-NEXT:    s_nop 1
3402; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3403; GFX8-NEXT:    s_nop 1
3404; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3405; GFX8-NEXT:    s_nop 1
3406; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3407; GFX8-NEXT:    s_nop 1
3408; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3409; GFX8-NEXT:    s_nop 1
3410; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3411; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3412; GFX8-NEXT:    s_nop 0
3413; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3414; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3415; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3416; GFX8-NEXT:    ; implicit-def: $vgpr0
3417; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3418; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3419; GFX8-NEXT:  ; %bb.1:
3420; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3421; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3422; GFX8-NEXT:    s_mov_b32 m0, -1
3423; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3424; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3425; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3426; GFX8-NEXT:  .LBB14_2:
3427; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3429; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3430; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3431; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3432; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3433; GFX8-NEXT:    s_mov_b32 s2, -1
3434; GFX8-NEXT:    s_nop 0
3435; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3436; GFX8-NEXT:    s_endpgm
3437;
3438; GFX9-LABEL: and_i32_varying:
3439; GFX9:       ; %bb.0: ; %entry
3440; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3441; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3442; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3443; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3444; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3445; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3446; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3447; GFX9-NEXT:    s_not_b64 exec, exec
3448; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3449; GFX9-NEXT:    s_not_b64 exec, exec
3450; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3451; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3452; GFX9-NEXT:    s_nop 1
3453; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3454; GFX9-NEXT:    s_nop 1
3455; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3456; GFX9-NEXT:    s_nop 1
3457; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3458; GFX9-NEXT:    s_nop 1
3459; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3460; GFX9-NEXT:    s_nop 1
3461; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3462; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3463; GFX9-NEXT:    s_nop 0
3464; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3465; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3466; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3467; GFX9-NEXT:    ; implicit-def: $vgpr0
3468; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3469; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3470; GFX9-NEXT:  ; %bb.1:
3471; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3472; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3473; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3474; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3476; GFX9-NEXT:  .LBB14_2:
3477; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3478; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3479; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3480; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3481; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3482; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3483; GFX9-NEXT:    s_mov_b32 s2, -1
3484; GFX9-NEXT:    s_nop 0
3485; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3486; GFX9-NEXT:    s_endpgm
3487;
3488; GFX1064-LABEL: and_i32_varying:
3489; GFX1064:       ; %bb.0: ; %entry
3490; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3491; GFX1064-NEXT:    s_not_b64 exec, exec
3492; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3493; GFX1064-NEXT:    s_not_b64 exec, exec
3494; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3495; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3496; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3497; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3498; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3499; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3500; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3501; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3502; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3503; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3504; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3505; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3506; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3507; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3508; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3509; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3510; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3511; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3512; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3513; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3514; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3515; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3516; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3517; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3518; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3519; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3520; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3521; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3522; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3523; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3524; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3525; GFX1064-NEXT:    s_mov_b32 s2, -1
3526; GFX1064-NEXT:    ; implicit-def: $vgpr0
3527; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3528; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3529; GFX1064-NEXT:  ; %bb.1:
3530; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3531; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3532; GFX1064-NEXT:    s_mov_b32 s3, s7
3533; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3534; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3535; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3536; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3537; GFX1064-NEXT:    buffer_gl0_inv
3538; GFX1064-NEXT:  .LBB14_2:
3539; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3540; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3541; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3542; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3543; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3544; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3545; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3546; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3547; GFX1064-NEXT:    s_endpgm
3548;
3549; GFX1032-LABEL: and_i32_varying:
3550; GFX1032:       ; %bb.0: ; %entry
3551; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3552; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3553; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3554; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3555; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3556; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3557; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3558; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3559; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3560; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3561; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3562; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3563; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3564; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3565; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3566; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3567; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3568; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3569; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3570; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3571; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3572; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3573; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3574; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3575; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3576; GFX1032-NEXT:    s_mov_b32 s2, -1
3577; GFX1032-NEXT:    ; implicit-def: $vgpr0
3578; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3579; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3580; GFX1032-NEXT:  ; %bb.1:
3581; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3582; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3583; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3584; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3585; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3586; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3587; GFX1032-NEXT:    buffer_gl0_inv
3588; GFX1032-NEXT:  .LBB14_2:
3589; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3590; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3591; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3592; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3593; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3594; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3595; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3596; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3597; GFX1032-NEXT:    s_endpgm
3598;
3599; GFX1164-LABEL: and_i32_varying:
3600; GFX1164:       ; %bb.0: ; %entry
3601; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3602; GFX1164-NEXT:    s_not_b64 exec, exec
3603; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3604; GFX1164-NEXT:    s_not_b64 exec, exec
3605; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3606; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3607; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3608; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3609; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3610; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3611; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3612; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3613; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3614; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3615; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3616; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3617; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3618; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3619; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3620; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3621; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3622; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3623; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3624; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3625; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3626; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3627; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3628; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3629; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3630; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3631; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3632; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3633; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3634; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3635; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3636; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3637; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3638; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
3639; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3640; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3641; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3642; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3643; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3644; GFX1164-NEXT:    s_mov_b32 s2, -1
3645; GFX1164-NEXT:    ; implicit-def: $vgpr0
3646; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3647; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3648; GFX1164-NEXT:  ; %bb.1:
3649; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3650; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3651; GFX1164-NEXT:    s_mov_b32 s3, s7
3652; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3653; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3654; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3655; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3656; GFX1164-NEXT:    buffer_gl0_inv
3657; GFX1164-NEXT:  .LBB14_2:
3658; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3659; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3660; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3661; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3662; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3663; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3664; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3665; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3666; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3667; GFX1164-NEXT:    s_endpgm
3668;
3669; GFX1132-LABEL: and_i32_varying:
3670; GFX1132:       ; %bb.0: ; %entry
3671; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3672; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3673; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3674; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3675; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3676; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3677; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3678; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3679; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3680; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3681; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3682; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3683; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3684; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3685; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3686; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3687; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3688; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3689; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3690; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3691; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3692; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3693; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3694; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3695; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3696; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3697; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3698; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3699; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3700; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3701; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3702; GFX1132-NEXT:    s_mov_b32 s2, -1
3703; GFX1132-NEXT:    ; implicit-def: $vgpr0
3704; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3705; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3706; GFX1132-NEXT:  ; %bb.1:
3707; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3708; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3709; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3710; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3711; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3712; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3713; GFX1132-NEXT:    buffer_gl0_inv
3714; GFX1132-NEXT:  .LBB14_2:
3715; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3716; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3717; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3718; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3719; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3720; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3721; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3722; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3723; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3724; GFX1132-NEXT:    s_endpgm
3725entry:
  ; Operand for the atomic is the work-item id in x, so it differs per lane.
3726  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel AND on the addrspace(3) variable. The atomic's result is what gets
  ; stored to %out below.
3727  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3728  store i32 %old, i32 addrspace(1)* %out
3729  ret void
3730}
3731
3732define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3733;
3734;
3735; GFX7LESS-LABEL: or_i32_varying:
3736; GFX7LESS:       ; %bb.0: ; %entry
3737; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3738; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3739; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3740; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3741; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3742; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3743; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3744; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3745; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3746; GFX7LESS-NEXT:    s_endpgm
3747;
3748; GFX8-LABEL: or_i32_varying:
3749; GFX8:       ; %bb.0: ; %entry
3750; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3751; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3752; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3753; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3754; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3755; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3756; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3757; GFX8-NEXT:    s_not_b64 exec, exec
3758; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3759; GFX8-NEXT:    s_not_b64 exec, exec
3760; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3761; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3762; GFX8-NEXT:    s_nop 1
3763; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3764; GFX8-NEXT:    s_nop 1
3765; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3766; GFX8-NEXT:    s_nop 1
3767; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3768; GFX8-NEXT:    s_nop 1
3769; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3770; GFX8-NEXT:    s_nop 1
3771; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3772; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3773; GFX8-NEXT:    s_nop 0
3774; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3775; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3776; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3777; GFX8-NEXT:    ; implicit-def: $vgpr0
3778; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3779; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3780; GFX8-NEXT:  ; %bb.1:
3781; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3782; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3783; GFX8-NEXT:    s_mov_b32 m0, -1
3784; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3785; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3787; GFX8-NEXT:  .LBB15_2:
3788; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3790; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3791; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3792; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3793; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3794; GFX8-NEXT:    s_mov_b32 s2, -1
3795; GFX8-NEXT:    s_nop 0
3796; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3797; GFX8-NEXT:    s_endpgm
3798;
3799; GFX9-LABEL: or_i32_varying:
3800; GFX9:       ; %bb.0: ; %entry
3801; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3802; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3803; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3804; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3805; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3806; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3807; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3808; GFX9-NEXT:    s_not_b64 exec, exec
3809; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3810; GFX9-NEXT:    s_not_b64 exec, exec
3811; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3812; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3813; GFX9-NEXT:    s_nop 1
3814; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3815; GFX9-NEXT:    s_nop 1
3816; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3817; GFX9-NEXT:    s_nop 1
3818; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3819; GFX9-NEXT:    s_nop 1
3820; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3821; GFX9-NEXT:    s_nop 1
3822; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3823; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3824; GFX9-NEXT:    s_nop 0
3825; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3826; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3827; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3828; GFX9-NEXT:    ; implicit-def: $vgpr0
3829; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3830; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3831; GFX9-NEXT:  ; %bb.1:
3832; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3833; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3834; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3835; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3836; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3837; GFX9-NEXT:  .LBB15_2:
3838; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3839; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3840; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3841; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3842; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3843; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3844; GFX9-NEXT:    s_mov_b32 s2, -1
3845; GFX9-NEXT:    s_nop 0
3846; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3847; GFX9-NEXT:    s_endpgm
3848;
3849; GFX1064-LABEL: or_i32_varying:
3850; GFX1064:       ; %bb.0: ; %entry
3851; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3852; GFX1064-NEXT:    s_not_b64 exec, exec
3853; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3854; GFX1064-NEXT:    s_not_b64 exec, exec
3855; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3856; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3857; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3858; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3859; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3860; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3861; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3862; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3863; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3864; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3865; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3866; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3867; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3868; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3869; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3870; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3871; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3872; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3873; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3874; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3875; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3876; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3877; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3878; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3879; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3880; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3881; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3882; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3883; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3884; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3885; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3886; GFX1064-NEXT:    s_mov_b32 s2, -1
3887; GFX1064-NEXT:    ; implicit-def: $vgpr0
3888; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3889; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3890; GFX1064-NEXT:  ; %bb.1:
3891; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3892; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3893; GFX1064-NEXT:    s_mov_b32 s3, s7
3894; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3895; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3896; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3897; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX1064-NEXT:    buffer_gl0_inv
3899; GFX1064-NEXT:  .LBB15_2:
3900; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3901; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3902; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3903; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3904; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3905; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3906; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3907; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3908; GFX1064-NEXT:    s_endpgm
3909;
3910; GFX1032-LABEL: or_i32_varying:
3911; GFX1032:       ; %bb.0: ; %entry
3912; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3913; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3914; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3915; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3916; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3917; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3918; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3919; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3920; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3921; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3922; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3923; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3924; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3925; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3926; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3927; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3928; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3929; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3930; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3931; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3932; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3933; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3934; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3935; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3936; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3937; GFX1032-NEXT:    s_mov_b32 s2, -1
3938; GFX1032-NEXT:    ; implicit-def: $vgpr0
3939; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3940; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3941; GFX1032-NEXT:  ; %bb.1:
3942; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3943; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3944; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3945; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3946; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3947; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3948; GFX1032-NEXT:    buffer_gl0_inv
3949; GFX1032-NEXT:  .LBB15_2:
3950; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3951; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3952; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3953; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3954; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3955; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3956; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3957; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3958; GFX1032-NEXT:    s_endpgm
3959;
3960; GFX1164-LABEL: or_i32_varying:
3961; GFX1164:       ; %bb.0: ; %entry
3962; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3963; GFX1164-NEXT:    s_not_b64 exec, exec
3964; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3965; GFX1164-NEXT:    s_not_b64 exec, exec
3966; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3967; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3968; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3969; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3970; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3971; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3972; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3973; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3974; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3975; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3976; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3977; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3978; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3979; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3980; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3981; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3982; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3983; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3984; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3985; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3986; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3987; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3988; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3989; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3990; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3991; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3992; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3993; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3994; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3995; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3996; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3997; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3998; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3999; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4000; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4001; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4002; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4003; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4004; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4005; GFX1164-NEXT:    s_mov_b32 s2, -1
4006; GFX1164-NEXT:    ; implicit-def: $vgpr0
4007; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4008; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
4009; GFX1164-NEXT:  ; %bb.1:
4010; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4011; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4012; GFX1164-NEXT:    s_mov_b32 s3, s7
4013; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4014; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4015; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
4016; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4017; GFX1164-NEXT:    buffer_gl0_inv
4018; GFX1164-NEXT:  .LBB15_2:
4019; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4020; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4021; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4022; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4023; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
4024; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4025; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4026; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4027; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4028; GFX1164-NEXT:    s_endpgm
4029;
4030; GFX1132-LABEL: or_i32_varying:
4031; GFX1132:       ; %bb.0: ; %entry
4032; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4033; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4034; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4035; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4036; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4037; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4038; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4039; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4040; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4041; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4042; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4043; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4044; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4045; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4046; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4047; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4048; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4049; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4050; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4051; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4052; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4053; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4054; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4055; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4056; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4057; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4058; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4059; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4060; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4061; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4062; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4063; GFX1132-NEXT:    s_mov_b32 s2, -1
4064; GFX1132-NEXT:    ; implicit-def: $vgpr0
4065; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4066; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
4067; GFX1132-NEXT:  ; %bb.1:
4068; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4069; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4070; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4071; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4072; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
4073; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4074; GFX1132-NEXT:    buffer_gl0_inv
4075; GFX1132-NEXT:  .LBB15_2:
4076; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4077; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4078; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4079; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4080; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
4081; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4082; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4083; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4084; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4085; GFX1132-NEXT:    s_endpgm
4086entry:
4087  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4088  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4089  store i32 %old, i32 addrspace(1)* %out
4090  ret void
4091}
4092
; xor_i32_varying: every lane XORs its own workitem id (an operand that varies
; per lane) into the addrspace(3) variable @local_var32 with an acq_rel
; atomicrmw, then stores the atomic's result to %out.
; Expected code, as pinned by the assertions below:
;   * GFX7LESS issues ds_xor_rtn_b32 directly on v0, with no cross-lane
;     reduction.
;   * GFX8 and later prefixes combine the lane values with a v_xor_b32_dpp
;     scan, issue a single ds_xor_rtn_b32 under s_and_saveexec for the lane
;     whose v_mbcnt count compares equal to 0, and rebuild each lane's result
;     with v_readfirstlane_b32 followed by v_xor_b32.
; NOTE(review): the assertions are autogenerated (see the file header);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
4093define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
4094;
4095;
4096; GFX7LESS-LABEL: xor_i32_varying:
4097; GFX7LESS:       ; %bb.0: ; %entry
4098; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4099; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4100; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4101; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4102; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
4103; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4104; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4105; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4106; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4107; GFX7LESS-NEXT:    s_endpgm
4108;
4109; GFX8-LABEL: xor_i32_varying:
4110; GFX8:       ; %bb.0: ; %entry
4111; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4112; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4113; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4114; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4115; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4116; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4117; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4118; GFX8-NEXT:    s_not_b64 exec, exec
4119; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4120; GFX8-NEXT:    s_not_b64 exec, exec
4121; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4122; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4123; GFX8-NEXT:    s_nop 1
4124; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4125; GFX8-NEXT:    s_nop 1
4126; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4127; GFX8-NEXT:    s_nop 1
4128; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4129; GFX8-NEXT:    s_nop 1
4130; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4131; GFX8-NEXT:    s_nop 1
4132; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4133; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4134; GFX8-NEXT:    s_nop 0
4135; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4136; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4137; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4138; GFX8-NEXT:    ; implicit-def: $vgpr0
4139; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4140; GFX8-NEXT:    s_cbranch_execz .LBB16_2
4141; GFX8-NEXT:  ; %bb.1:
4142; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4143; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4144; GFX8-NEXT:    s_mov_b32 m0, -1
4145; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4146; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4147; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4148; GFX8-NEXT:  .LBB16_2:
4149; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4150; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4151; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4152; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4153; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
4154; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4155; GFX8-NEXT:    s_mov_b32 s2, -1
4156; GFX8-NEXT:    s_nop 0
4157; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4158; GFX8-NEXT:    s_endpgm
4159;
4160; GFX9-LABEL: xor_i32_varying:
4161; GFX9:       ; %bb.0: ; %entry
4162; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4163; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4164; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4165; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4166; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4167; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4168; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4169; GFX9-NEXT:    s_not_b64 exec, exec
4170; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4171; GFX9-NEXT:    s_not_b64 exec, exec
4172; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4173; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4174; GFX9-NEXT:    s_nop 1
4175; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4176; GFX9-NEXT:    s_nop 1
4177; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4178; GFX9-NEXT:    s_nop 1
4179; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4180; GFX9-NEXT:    s_nop 1
4181; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4182; GFX9-NEXT:    s_nop 1
4183; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4184; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4185; GFX9-NEXT:    s_nop 0
4186; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4187; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4188; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4189; GFX9-NEXT:    ; implicit-def: $vgpr0
4190; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4191; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4192; GFX9-NEXT:  ; %bb.1:
4193; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4194; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4195; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4196; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4197; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4198; GFX9-NEXT:  .LBB16_2:
4199; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4201; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4202; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4203; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4204; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4205; GFX9-NEXT:    s_mov_b32 s2, -1
4206; GFX9-NEXT:    s_nop 0
4207; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4208; GFX9-NEXT:    s_endpgm
4209;
4210; GFX1064-LABEL: xor_i32_varying:
4211; GFX1064:       ; %bb.0: ; %entry
4212; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4213; GFX1064-NEXT:    s_not_b64 exec, exec
4214; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4215; GFX1064-NEXT:    s_not_b64 exec, exec
4216; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4217; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4218; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4219; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4220; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4221; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4222; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4223; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4224; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4225; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4226; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4227; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4228; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4229; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4230; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4231; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4232; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4233; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4234; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4235; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4236; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4237; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4238; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4239; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4240; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4241; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4242; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4243; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4244; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4245; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4246; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4247; GFX1064-NEXT:    s_mov_b32 s2, -1
4248; GFX1064-NEXT:    ; implicit-def: $vgpr0
4249; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4250; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4251; GFX1064-NEXT:  ; %bb.1:
4252; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4253; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4254; GFX1064-NEXT:    s_mov_b32 s3, s7
4255; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4256; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4257; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4258; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4259; GFX1064-NEXT:    buffer_gl0_inv
4260; GFX1064-NEXT:  .LBB16_2:
4261; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4262; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4263; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4264; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4265; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4266; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4267; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4268; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4269; GFX1064-NEXT:    s_endpgm
4270;
4271; GFX1032-LABEL: xor_i32_varying:
4272; GFX1032:       ; %bb.0: ; %entry
4273; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4274; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4275; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4276; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4277; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4278; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4279; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4280; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4281; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4282; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4283; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4284; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4285; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4286; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4287; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4288; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4289; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4290; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4291; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4292; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4293; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4294; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4295; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4296; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4297; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4298; GFX1032-NEXT:    s_mov_b32 s2, -1
4299; GFX1032-NEXT:    ; implicit-def: $vgpr0
4300; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4301; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4302; GFX1032-NEXT:  ; %bb.1:
4303; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4304; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4305; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4306; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4307; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4308; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4309; GFX1032-NEXT:    buffer_gl0_inv
4310; GFX1032-NEXT:  .LBB16_2:
4311; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4312; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4313; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4314; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4315; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4316; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4317; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4318; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4319; GFX1032-NEXT:    s_endpgm
4320;
4321; GFX1164-LABEL: xor_i32_varying:
4322; GFX1164:       ; %bb.0: ; %entry
4323; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4324; GFX1164-NEXT:    s_not_b64 exec, exec
4325; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4326; GFX1164-NEXT:    s_not_b64 exec, exec
4327; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4328; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4329; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4330; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4331; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4332; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4333; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4334; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4335; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4336; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4337; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4338; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4339; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4340; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4341; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4342; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4343; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4344; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4345; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4346; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4347; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4348; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4349; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4350; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4351; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4352; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4353; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4354; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4355; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4356; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4357; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4358; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4359; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4360; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4361; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4362; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4363; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4364; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4365; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4366; GFX1164-NEXT:    s_mov_b32 s2, -1
4367; GFX1164-NEXT:    ; implicit-def: $vgpr0
4368; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4369; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4370; GFX1164-NEXT:  ; %bb.1:
4371; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4372; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4373; GFX1164-NEXT:    s_mov_b32 s3, s7
4374; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4375; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4376; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4377; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4378; GFX1164-NEXT:    buffer_gl0_inv
4379; GFX1164-NEXT:  .LBB16_2:
4380; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4381; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4382; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4383; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4384; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4385; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4386; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4387; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4388; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4389; GFX1164-NEXT:    s_endpgm
4390;
4391; GFX1132-LABEL: xor_i32_varying:
4392; GFX1132:       ; %bb.0: ; %entry
4393; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4394; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4395; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4396; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4397; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4398; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4399; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4400; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4401; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4402; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4403; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4404; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4405; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4406; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4407; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4408; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4409; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4410; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4411; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4412; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4413; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4414; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4415; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4416; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4417; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4418; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4419; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4420; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4421; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4422; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4423; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4424; GFX1132-NEXT:    s_mov_b32 s2, -1
4425; GFX1132-NEXT:    ; implicit-def: $vgpr0
4426; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4427; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4428; GFX1132-NEXT:  ; %bb.1:
4429; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4430; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4431; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4432; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4433; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4434; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4435; GFX1132-NEXT:    buffer_gl0_inv
4436; GFX1132-NEXT:  .LBB16_2:
4437; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4438; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4439; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4440; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4441; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4442; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4443; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4445; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4446; GFX1132-NEXT:    s_endpgm
4447entry:
  ; Per-lane operand: the workitem id in x, so the value differs between lanes.
4448  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; The atomicrmw result is what gets written to %out below.
4449  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4450  store i32 %old, i32 addrspace(1)* %out
4451  ret void
4452}
4453
4454define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
4455;
4456;
4457; GFX7LESS-LABEL: max_i32_varying:
4458; GFX7LESS:       ; %bb.0: ; %entry
4459; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4460; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4461; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4462; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4463; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
4464; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4465; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4466; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4467; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4468; GFX7LESS-NEXT:    s_endpgm
4469;
4470; GFX8-LABEL: max_i32_varying:
4471; GFX8:       ; %bb.0: ; %entry
4472; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4473; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4474; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4475; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4476; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4477; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4478; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4479; GFX8-NEXT:    s_not_b64 exec, exec
4480; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
4481; GFX8-NEXT:    s_not_b64 exec, exec
4482; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4483; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4484; GFX8-NEXT:    s_nop 1
4485; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4486; GFX8-NEXT:    s_nop 1
4487; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4488; GFX8-NEXT:    s_nop 1
4489; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4490; GFX8-NEXT:    s_nop 1
4491; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4492; GFX8-NEXT:    s_nop 1
4493; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4494; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4495; GFX8-NEXT:    s_nop 0
4496; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4497; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4498; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4499; GFX8-NEXT:    ; implicit-def: $vgpr0
4500; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4501; GFX8-NEXT:    s_cbranch_execz .LBB17_2
4502; GFX8-NEXT:  ; %bb.1:
4503; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4504; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4505; GFX8-NEXT:    s_mov_b32 m0, -1
4506; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4507; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
4508; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4509; GFX8-NEXT:  .LBB17_2:
4510; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4511; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4513; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4514; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
4515; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4516; GFX8-NEXT:    s_mov_b32 s2, -1
4517; GFX8-NEXT:    s_nop 0
4518; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4519; GFX8-NEXT:    s_endpgm
4520;
4521; GFX9-LABEL: max_i32_varying:
4522; GFX9:       ; %bb.0: ; %entry
4523; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4524; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4525; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4526; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4527; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4528; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4529; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4530; GFX9-NEXT:    s_not_b64 exec, exec
4531; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
4532; GFX9-NEXT:    s_not_b64 exec, exec
4533; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4534; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4535; GFX9-NEXT:    s_nop 1
4536; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4537; GFX9-NEXT:    s_nop 1
4538; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4539; GFX9-NEXT:    s_nop 1
4540; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4541; GFX9-NEXT:    s_nop 1
4542; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4543; GFX9-NEXT:    s_nop 1
4544; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4545; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4546; GFX9-NEXT:    s_nop 0
4547; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4548; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4549; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4550; GFX9-NEXT:    ; implicit-def: $vgpr0
4551; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4552; GFX9-NEXT:    s_cbranch_execz .LBB17_2
4553; GFX9-NEXT:  ; %bb.1:
4554; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4555; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4557; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
4558; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4559; GFX9-NEXT:  .LBB17_2:
4560; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4561; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4562; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4563; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4564; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
4565; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4566; GFX9-NEXT:    s_mov_b32 s2, -1
4567; GFX9-NEXT:    s_nop 0
4568; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4569; GFX9-NEXT:    s_endpgm
4570;
4571; GFX1064-LABEL: max_i32_varying:
4572; GFX1064:       ; %bb.0: ; %entry
4573; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4574; GFX1064-NEXT:    s_not_b64 exec, exec
4575; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
4576; GFX1064-NEXT:    s_not_b64 exec, exec
4577; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4578; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4579; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
4580; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4581; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4582; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4583; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4584; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4585; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4586; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4587; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4588; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4589; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4590; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4591; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4592; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4593; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4594; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4595; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4596; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4597; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4598; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4599; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4600; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4601; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4602; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4603; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4604; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4605; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4606; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4607; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4608; GFX1064-NEXT:    s_mov_b32 s2, -1
4609; GFX1064-NEXT:    ; implicit-def: $vgpr0
4610; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4611; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
4612; GFX1064-NEXT:  ; %bb.1:
4613; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4614; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4615; GFX1064-NEXT:    s_mov_b32 s3, s7
4616; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4617; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4618; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
4619; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4620; GFX1064-NEXT:    buffer_gl0_inv
4621; GFX1064-NEXT:  .LBB17_2:
4622; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4623; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4624; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4625; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4626; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
4627; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4628; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4629; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4630; GFX1064-NEXT:    s_endpgm
4631;
4632; GFX1032-LABEL: max_i32_varying:
4633; GFX1032:       ; %bb.0: ; %entry
4634; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4635; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4636; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
4637; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4638; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4639; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4640; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4641; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4642; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4643; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4644; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4645; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4646; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4647; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4648; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4649; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
4650; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4651; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4652; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4653; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4654; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4655; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4656; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4657; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4658; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4659; GFX1032-NEXT:    s_mov_b32 s2, -1
4660; GFX1032-NEXT:    ; implicit-def: $vgpr0
4661; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4662; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
4663; GFX1032-NEXT:  ; %bb.1:
4664; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4665; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4666; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4667; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4668; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
4669; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4670; GFX1032-NEXT:    buffer_gl0_inv
4671; GFX1032-NEXT:  .LBB17_2:
4672; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4673; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4674; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4675; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4676; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
4677; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4678; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4679; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4680; GFX1032-NEXT:    s_endpgm
4681;
4682; GFX1164-LABEL: max_i32_varying:
4683; GFX1164:       ; %bb.0: ; %entry
4684; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4685; GFX1164-NEXT:    s_not_b64 exec, exec
4686; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
4687; GFX1164-NEXT:    s_not_b64 exec, exec
4688; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4689; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4690; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4691; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
4692; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4693; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4694; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4695; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4696; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4697; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4698; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4699; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4700; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4701; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4702; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4703; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4704; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4705; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4706; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4707; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4708; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4709; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4710; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4711; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4712; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4713; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4714; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4715; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4716; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4717; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4718; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4719; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4720; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4721; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4722; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4723; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4724; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4725; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4726; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4727; GFX1164-NEXT:    s_mov_b32 s2, -1
4728; GFX1164-NEXT:    ; implicit-def: $vgpr0
4729; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4730; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
4731; GFX1164-NEXT:  ; %bb.1:
4732; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4733; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4734; GFX1164-NEXT:    s_mov_b32 s3, s7
4735; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4736; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4737; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
4738; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4739; GFX1164-NEXT:    buffer_gl0_inv
4740; GFX1164-NEXT:  .LBB17_2:
4741; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4742; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4743; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4744; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4745; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
4746; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4747; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4748; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4749; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4750; GFX1164-NEXT:    s_endpgm
4751;
4752; GFX1132-LABEL: max_i32_varying:
4753; GFX1132:       ; %bb.0: ; %entry
4754; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4755; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4756; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
4757; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4758; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4759; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4760; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4761; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4762; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4763; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4764; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4765; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4766; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4767; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4768; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4769; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4770; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4771; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4772; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4773; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
4774; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4775; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4776; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4777; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4778; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4779; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4780; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4781; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4782; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4783; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4784; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4785; GFX1132-NEXT:    s_mov_b32 s2, -1
4786; GFX1132-NEXT:    ; implicit-def: $vgpr0
4787; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4788; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
4789; GFX1132-NEXT:  ; %bb.1:
4790; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4791; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4792; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4793; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4794; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
4795; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4796; GFX1132-NEXT:    buffer_gl0_inv
4797; GFX1132-NEXT:  .LBB17_2:
4798; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4799; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4800; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4801; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4802; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
4803; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4804; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4805; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4806; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4807; GFX1132-NEXT:    s_endpgm
4808entry:
4809  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4810  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4811  store i32 %old, i32 addrspace(1)* %out
4812  ret void
4813}
4814
; max_i64_constant: signed 64-bit `atomicrmw max` (acq_rel) of the constant 5 on
; the addrspace(3) global @local_var64; the returned old value is stored to %out.
;
; In each checked configuration below the expected code:
;   * uses v_mbcnt_* and a compare against 0 to enable only the lane(s) whose
;     mbcnt result is 0,
;   * has that path issue a single ds_max_rtn_i64 with the operand 5,
;   * reads the returned value back with v_readfirstlane_b32, and
;   * combines it per lane, via v_cmp_gt_i64 + v_cndmask, with a value selected
;     on the same compare result: high word 0x80000000 / low word 0 where the
;     compare was true, and 5 otherwise.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
4815define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4816;
4817;
4818; GFX7LESS-LABEL: max_i64_constant:
4819; GFX7LESS:       ; %bb.0: ; %entry
4820; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4821; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4822; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4823; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4824; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4825; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4826; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4827; GFX7LESS-NEXT:  ; %bb.1:
4828; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4829; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4830; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4831; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4832; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4833; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4834; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4835; GFX7LESS-NEXT:  .LBB18_2:
4836; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4837; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4839; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4840; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4841; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4842; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4843; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4844; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4845; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4846; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4847; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4848; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4849; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4850; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4851; GFX7LESS-NEXT:    s_endpgm
4852;
4853; GFX8-LABEL: max_i64_constant:
4854; GFX8:       ; %bb.0: ; %entry
4855; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4856; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4857; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4858; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4859; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4860; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4861; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4862; GFX8-NEXT:  ; %bb.1:
4863; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4864; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4865; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4866; GFX8-NEXT:    s_mov_b32 m0, -1
4867; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4868; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4869; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4870; GFX8-NEXT:  .LBB18_2:
4871; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4872; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4873; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4874; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4875; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4876; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4877; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4878; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4879; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4880; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4881; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4882; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4883; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4884; GFX8-NEXT:    s_mov_b32 s2, -1
4885; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4886; GFX8-NEXT:    s_endpgm
4887;
4888; GFX9-LABEL: max_i64_constant:
4889; GFX9:       ; %bb.0: ; %entry
4890; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4891; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4892; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4893; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4894; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4895; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4896; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4897; GFX9-NEXT:  ; %bb.1:
4898; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4899; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4900; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4901; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4902; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4903; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4904; GFX9-NEXT:  .LBB18_2:
4905; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4906; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4907; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4908; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4909; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4910; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4911; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4912; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4913; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4914; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4915; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4916; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4917; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4918; GFX9-NEXT:    s_mov_b32 s2, -1
4919; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4920; GFX9-NEXT:    s_endpgm
4921;
4922; GFX1064-LABEL: max_i64_constant:
4923; GFX1064:       ; %bb.0: ; %entry
4924; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4925; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4926; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4927; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4928; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4929; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4930; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4931; GFX1064-NEXT:  ; %bb.1:
4932; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4933; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4934; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4935; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4936; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4937; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4938; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4939; GFX1064-NEXT:    buffer_gl0_inv
4940; GFX1064-NEXT:  .LBB18_2:
4941; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4942; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4943; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4944; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4945; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4946; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4947; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4948; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4949; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4950; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4951; GFX1064-NEXT:    s_mov_b32 s2, -1
4952; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4953; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4954; GFX1064-NEXT:    s_endpgm
4955;
4956; GFX1032-LABEL: max_i64_constant:
4957; GFX1032:       ; %bb.0: ; %entry
4958; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4959; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4960; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4961; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4962; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4963; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4964; GFX1032-NEXT:  ; %bb.1:
4965; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4966; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4967; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4968; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4969; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4970; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4971; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4972; GFX1032-NEXT:    buffer_gl0_inv
4973; GFX1032-NEXT:  .LBB18_2:
4974; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4975; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4976; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4977; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4978; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4979; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4980; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4981; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4982; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4983; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4984; GFX1032-NEXT:    s_mov_b32 s2, -1
4985; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4986; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4987; GFX1032-NEXT:    s_endpgm
4988;
4989; GFX1164-LABEL: max_i64_constant:
4990; GFX1164:       ; %bb.0: ; %entry
4991; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4992; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4993; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4994; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4995; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4996; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
4997; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4998; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
4999; GFX1164-NEXT:  ; %bb.1:
5000; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5001; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5002; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5003; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5004; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5005; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
5006; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5007; GFX1164-NEXT:    buffer_gl0_inv
5008; GFX1164-NEXT:  .LBB18_2:
5009; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5010; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5011; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5012; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
5013; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5014; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5015; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
5016; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5017; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5018; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5019; GFX1164-NEXT:    s_mov_b32 s2, -1
5020; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5021; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5022; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5023; GFX1164-NEXT:    s_endpgm
5024;
5025; GFX1132-LABEL: max_i64_constant:
5026; GFX1132:       ; %bb.0: ; %entry
5027; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5028; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5029; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5030; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5031; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5032; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5033; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
5034; GFX1132-NEXT:  ; %bb.1:
5035; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5036; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
5037; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5038; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5039; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
5040; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5041; GFX1132-NEXT:    buffer_gl0_inv
5042; GFX1132-NEXT:  .LBB18_2:
5043; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5044; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5045; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5046; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
5047; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5048; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5049; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
5050; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5051; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5052; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5053; GFX1132-NEXT:    s_mov_b32 s2, -1
5054; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5055; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5056; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5057; GFX1132-NEXT:    s_endpgm
5058entry:
5059  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
5060  store i64 %old, i64 addrspace(1)* %out
5061  ret void
5062}
5063
5064define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
5065;
5066;
5067; GFX7LESS-LABEL: min_i32_varying:
5068; GFX7LESS:       ; %bb.0: ; %entry
5069; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5070; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5071; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5072; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5073; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
5074; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5075; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5076; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5077; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5078; GFX7LESS-NEXT:    s_endpgm
5079;
5080; GFX8-LABEL: min_i32_varying:
5081; GFX8:       ; %bb.0: ; %entry
5082; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5083; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5084; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5085; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5086; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
5087; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5088; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5089; GFX8-NEXT:    s_not_b64 exec, exec
5090; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
5091; GFX8-NEXT:    s_not_b64 exec, exec
5092; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5093; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5094; GFX8-NEXT:    s_nop 1
5095; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5096; GFX8-NEXT:    s_nop 1
5097; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5098; GFX8-NEXT:    s_nop 1
5099; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5100; GFX8-NEXT:    s_nop 1
5101; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5102; GFX8-NEXT:    s_nop 1
5103; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5104; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5105; GFX8-NEXT:    s_nop 0
5106; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5107; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5108; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5109; GFX8-NEXT:    ; implicit-def: $vgpr0
5110; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5111; GFX8-NEXT:    s_cbranch_execz .LBB19_2
5112; GFX8-NEXT:  ; %bb.1:
5113; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5114; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5115; GFX8-NEXT:    s_mov_b32 m0, -1
5116; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5117; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
5118; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5119; GFX8-NEXT:  .LBB19_2:
5120; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5121; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5122; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5123; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5124; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
5125; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5126; GFX8-NEXT:    s_mov_b32 s2, -1
5127; GFX8-NEXT:    s_nop 0
5128; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5129; GFX8-NEXT:    s_endpgm
5130;
5131; GFX9-LABEL: min_i32_varying:
5132; GFX9:       ; %bb.0: ; %entry
5133; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5134; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5135; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5136; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5137; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
5138; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5139; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5140; GFX9-NEXT:    s_not_b64 exec, exec
5141; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
5142; GFX9-NEXT:    s_not_b64 exec, exec
5143; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5144; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5145; GFX9-NEXT:    s_nop 1
5146; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5147; GFX9-NEXT:    s_nop 1
5148; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5149; GFX9-NEXT:    s_nop 1
5150; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5151; GFX9-NEXT:    s_nop 1
5152; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5153; GFX9-NEXT:    s_nop 1
5154; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5155; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5156; GFX9-NEXT:    s_nop 0
5157; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5158; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5159; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5160; GFX9-NEXT:    ; implicit-def: $vgpr0
5161; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5162; GFX9-NEXT:    s_cbranch_execz .LBB19_2
5163; GFX9-NEXT:  ; %bb.1:
5164; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5165; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5166; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5167; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
5168; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5169; GFX9-NEXT:  .LBB19_2:
5170; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5171; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5172; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5173; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5174; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
5175; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5176; GFX9-NEXT:    s_mov_b32 s2, -1
5177; GFX9-NEXT:    s_nop 0
5178; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5179; GFX9-NEXT:    s_endpgm
5180;
5181; GFX1064-LABEL: min_i32_varying:
5182; GFX1064:       ; %bb.0: ; %entry
5183; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5184; GFX1064-NEXT:    s_not_b64 exec, exec
5185; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
5186; GFX1064-NEXT:    s_not_b64 exec, exec
5187; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5188; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5189; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
5190; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5191; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5192; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5193; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5194; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5195; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5196; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5197; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5198; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5199; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5200; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5201; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5202; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5203; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5204; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5205; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5206; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5207; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5208; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5209; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5210; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5211; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5212; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5213; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5214; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5215; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5216; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5217; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5218; GFX1064-NEXT:    s_mov_b32 s2, -1
5219; GFX1064-NEXT:    ; implicit-def: $vgpr0
5220; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5221; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
5222; GFX1064-NEXT:  ; %bb.1:
5223; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5224; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5225; GFX1064-NEXT:    s_mov_b32 s3, s7
5226; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5227; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5228; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
5229; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5230; GFX1064-NEXT:    buffer_gl0_inv
5231; GFX1064-NEXT:  .LBB19_2:
5232; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5233; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5234; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5235; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5236; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
5237; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5238; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5239; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5240; GFX1064-NEXT:    s_endpgm
5241;
5242; GFX1032-LABEL: min_i32_varying:
5243; GFX1032:       ; %bb.0: ; %entry
5244; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5245; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5246; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
5247; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5248; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5249; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5250; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5251; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5252; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5253; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5254; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5255; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5256; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5257; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5258; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5259; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
5260; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5261; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5262; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5263; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5264; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5265; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5266; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5267; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5268; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5269; GFX1032-NEXT:    s_mov_b32 s2, -1
5270; GFX1032-NEXT:    ; implicit-def: $vgpr0
5271; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5272; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
5273; GFX1032-NEXT:  ; %bb.1:
5274; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5275; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5276; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5277; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5278; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
5279; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5280; GFX1032-NEXT:    buffer_gl0_inv
5281; GFX1032-NEXT:  .LBB19_2:
5282; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5283; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5284; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5285; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5286; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
5287; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5288; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5289; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5290; GFX1032-NEXT:    s_endpgm
5291;
5292; GFX1164-LABEL: min_i32_varying:
5293; GFX1164:       ; %bb.0: ; %entry
5294; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5295; GFX1164-NEXT:    s_not_b64 exec, exec
5296; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
5297; GFX1164-NEXT:    s_not_b64 exec, exec
5298; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5299; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5300; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5301; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
5302; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5303; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5304; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5305; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5306; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5307; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5308; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5309; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5310; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5311; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5312; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5313; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5314; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5315; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5316; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5317; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5318; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5319; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5320; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5321; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5322; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5323; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5324; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5325; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5326; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5327; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5328; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5329; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5330; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5331; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5332; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5333; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5334; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5335; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5336; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5337; GFX1164-NEXT:    s_mov_b32 s2, -1
5338; GFX1164-NEXT:    ; implicit-def: $vgpr0
5339; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5340; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
5341; GFX1164-NEXT:  ; %bb.1:
5342; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5343; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5344; GFX1164-NEXT:    s_mov_b32 s3, s7
5345; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5346; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5347; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
5348; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5349; GFX1164-NEXT:    buffer_gl0_inv
5350; GFX1164-NEXT:  .LBB19_2:
5351; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5352; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5353; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5354; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5355; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
5356; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5357; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5358; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5359; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5360; GFX1164-NEXT:    s_endpgm
5361;
5362; GFX1132-LABEL: min_i32_varying:
5363; GFX1132:       ; %bb.0: ; %entry
5364; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5365; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5366; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
5367; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5368; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5369; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5370; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5371; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5372; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5373; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5374; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5375; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5376; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5377; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5378; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5379; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5380; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5381; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5382; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5383; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
5384; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5385; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5386; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5387; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5388; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5389; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5390; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5391; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5392; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5393; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5394; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5395; GFX1132-NEXT:    s_mov_b32 s2, -1
5396; GFX1132-NEXT:    ; implicit-def: $vgpr0
5397; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5398; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
5399; GFX1132-NEXT:  ; %bb.1:
5400; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5401; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5402; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5403; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5404; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
5405; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5406; GFX1132-NEXT:    buffer_gl0_inv
5407; GFX1132-NEXT:  .LBB19_2:
5408; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5409; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5410; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5411; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5412; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
5413; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5414; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5415; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5416; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5417; GFX1132-NEXT:    s_endpgm
5418entry:
5419  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5420  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5421  store i32 %old, i32 addrspace(1)* %out
5422  ret void
5423}
5424
; min_i64_constant - signed 64-bit `atomicrmw min` of the constant 5 on the
; local (addrspace(3)) global @local_var64; the returned old value is stored
; to %out.
;
; What the expected code below shows, for every target configuration:
;   * v_mbcnt_* followed by a compare against 0 selects the lanes whose mbcnt
;     result is 0 (presumably just the first active lane - confirm against the
;     ISA docs); only those lanes execute the single ds_min_rtn_i64, with the
;     64-bit operand 5.
;   * v_readfirstlane_b32 copies the returned old value into a pair of SGPRs.
;   * Every lane then forms the signed minimum (v_cmp_lt_i64 + v_cndmask) of
;     that old value and a per-lane constant selected by the same compare
;     result: 0x7fffffffffffffff for the lane(s) that issued the atomic,
;     5 for all other lanes.
;   * The wave32 configurations (GFX1032 and GFX1132 checks) use only
;     v_mbcnt_lo and 32-bit exec masks; the wave64 ones also use v_mbcnt_hi.
;
; Reviewer note - the check lines are autogenerated (see the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5425define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
5426;
5427;
5428; GFX7LESS-LABEL: min_i64_constant:
5429; GFX7LESS:       ; %bb.0: ; %entry
5430; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5431; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5432; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5433; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5434; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5435; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5436; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
5437; GFX7LESS-NEXT:  ; %bb.1:
5438; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5439; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5440; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5441; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5442; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5443; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5444; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5445; GFX7LESS-NEXT:  .LBB20_2:
5446; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5447; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5448; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5449; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5450; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
5451; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5452; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5453; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5454; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
5455; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
5456; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5457; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5458; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5459; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5460; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5461; GFX7LESS-NEXT:    s_endpgm
5462;
5463; GFX8-LABEL: min_i64_constant:
5464; GFX8:       ; %bb.0: ; %entry
5465; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5466; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5467; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5468; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5469; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5470; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5471; GFX8-NEXT:    s_cbranch_execz .LBB20_2
5472; GFX8-NEXT:  ; %bb.1:
5473; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5474; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5475; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5476; GFX8-NEXT:    s_mov_b32 m0, -1
5477; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5478; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5479; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5480; GFX8-NEXT:  .LBB20_2:
5481; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5482; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5483; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5484; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
5485; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5486; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5487; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5488; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5489; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5490; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5491; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5492; GFX8-NEXT:    s_mov_b32 s2, -1
5493; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5494; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5495; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5496; GFX8-NEXT:    s_endpgm
5497;
5498; GFX9-LABEL: min_i64_constant:
5499; GFX9:       ; %bb.0: ; %entry
5500; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5501; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5502; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5503; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5504; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5505; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5506; GFX9-NEXT:    s_cbranch_execz .LBB20_2
5507; GFX9-NEXT:  ; %bb.1:
5508; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5509; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5510; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5511; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5512; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5513; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5514; GFX9-NEXT:  .LBB20_2:
5515; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5517; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5518; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
5519; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5520; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5521; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5522; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5523; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5524; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5525; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5526; GFX9-NEXT:    s_mov_b32 s2, -1
5527; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5528; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5529; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5530; GFX9-NEXT:    s_endpgm
5531;
5532; GFX1064-LABEL: min_i64_constant:
5533; GFX1064:       ; %bb.0: ; %entry
5534; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5535; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5536; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5537; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5538; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5539; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5540; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
5541; GFX1064-NEXT:  ; %bb.1:
5542; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5543; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5544; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5545; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5546; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5547; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5548; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5549; GFX1064-NEXT:    buffer_gl0_inv
5550; GFX1064-NEXT:  .LBB20_2:
5551; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5552; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5553; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5554; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5555; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5556; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5557; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5558; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5559; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5560; GFX1064-NEXT:    s_mov_b32 s2, -1
5561; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5562; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5563; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5564; GFX1064-NEXT:    s_endpgm
5565;
5566; GFX1032-LABEL: min_i64_constant:
5567; GFX1032:       ; %bb.0: ; %entry
5568; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5569; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5570; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5571; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5572; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5573; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
5574; GFX1032-NEXT:  ; %bb.1:
5575; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5576; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5577; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5578; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5579; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5580; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5581; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5582; GFX1032-NEXT:    buffer_gl0_inv
5583; GFX1032-NEXT:  .LBB20_2:
5584; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5585; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5586; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5587; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5588; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5589; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5590; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5591; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5592; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5593; GFX1032-NEXT:    s_mov_b32 s2, -1
5594; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5595; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5596; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5597; GFX1032-NEXT:    s_endpgm
5598;
5599; GFX1164-LABEL: min_i64_constant:
5600; GFX1164:       ; %bb.0: ; %entry
5601; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5602; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5603; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5604; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5605; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5606; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5607; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5608; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
5609; GFX1164-NEXT:  ; %bb.1:
5610; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5611; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5612; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5613; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5614; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5615; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5616; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5617; GFX1164-NEXT:    buffer_gl0_inv
5618; GFX1164-NEXT:  .LBB20_2:
5619; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5620; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5621; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5622; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5623; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5624; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5625; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5626; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5627; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5628; GFX1164-NEXT:    s_mov_b32 s2, -1
5629; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5630; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5631; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5632; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5633; GFX1164-NEXT:    s_endpgm
5634;
5635; GFX1132-LABEL: min_i64_constant:
5636; GFX1132:       ; %bb.0: ; %entry
5637; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5638; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5639; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5640; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5641; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5642; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5643; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
5644; GFX1132-NEXT:  ; %bb.1:
5645; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5646; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
5647; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5648; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5649; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5650; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5651; GFX1132-NEXT:    buffer_gl0_inv
5652; GFX1132-NEXT:  .LBB20_2:
5653; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5654; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5655; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5656; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5657; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5658; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5659; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5660; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5661; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5662; GFX1132-NEXT:    s_mov_b32 s2, -1
5663; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5664; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5665; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5666; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5667; GFX1132-NEXT:    s_endpgm
; IR under test. The atomic operand is the constant 5 rather than a per-lane
; value, and none of the expected sequences above contain a DPP cross-lane
; reduction.
5668entry:
5669  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
5670  store i64 %old, i64 addrspace(1)* %out
5671  ret void
5672}
5673
5674define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5675;
5676;
5677; GFX7LESS-LABEL: umax_i32_varying:
5678; GFX7LESS:       ; %bb.0: ; %entry
5679; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5680; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5681; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5682; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5683; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5684; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5685; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5686; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5687; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5688; GFX7LESS-NEXT:    s_endpgm
5689;
5690; GFX8-LABEL: umax_i32_varying:
5691; GFX8:       ; %bb.0: ; %entry
5692; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5693; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5694; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5695; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5696; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5697; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5698; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5699; GFX8-NEXT:    s_not_b64 exec, exec
5700; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5701; GFX8-NEXT:    s_not_b64 exec, exec
5702; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5703; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5704; GFX8-NEXT:    s_nop 1
5705; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5706; GFX8-NEXT:    s_nop 1
5707; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5708; GFX8-NEXT:    s_nop 1
5709; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5710; GFX8-NEXT:    s_nop 1
5711; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5712; GFX8-NEXT:    s_nop 1
5713; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5714; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5715; GFX8-NEXT:    s_nop 0
5716; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5717; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5718; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5719; GFX8-NEXT:    ; implicit-def: $vgpr0
5720; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5721; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5722; GFX8-NEXT:  ; %bb.1:
5723; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5724; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5725; GFX8-NEXT:    s_mov_b32 m0, -1
5726; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5727; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5728; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5729; GFX8-NEXT:  .LBB21_2:
5730; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5731; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5732; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5733; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5734; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5735; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5736; GFX8-NEXT:    s_mov_b32 s2, -1
5737; GFX8-NEXT:    s_nop 0
5738; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5739; GFX8-NEXT:    s_endpgm
5740;
5741; GFX9-LABEL: umax_i32_varying:
5742; GFX9:       ; %bb.0: ; %entry
5743; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5744; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5745; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5746; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5747; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5748; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5749; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5750; GFX9-NEXT:    s_not_b64 exec, exec
5751; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5752; GFX9-NEXT:    s_not_b64 exec, exec
5753; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5754; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5755; GFX9-NEXT:    s_nop 1
5756; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5757; GFX9-NEXT:    s_nop 1
5758; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5759; GFX9-NEXT:    s_nop 1
5760; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5761; GFX9-NEXT:    s_nop 1
5762; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5763; GFX9-NEXT:    s_nop 1
5764; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5765; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5766; GFX9-NEXT:    s_nop 0
5767; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5768; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5769; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5770; GFX9-NEXT:    ; implicit-def: $vgpr0
5771; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5772; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5773; GFX9-NEXT:  ; %bb.1:
5774; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5775; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5776; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5777; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5778; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5779; GFX9-NEXT:  .LBB21_2:
5780; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5781; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5782; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5783; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5784; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5785; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5786; GFX9-NEXT:    s_mov_b32 s2, -1
5787; GFX9-NEXT:    s_nop 0
5788; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5789; GFX9-NEXT:    s_endpgm
5790;
5791; GFX1064-LABEL: umax_i32_varying:
5792; GFX1064:       ; %bb.0: ; %entry
5793; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5794; GFX1064-NEXT:    s_not_b64 exec, exec
5795; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5796; GFX1064-NEXT:    s_not_b64 exec, exec
5797; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5798; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5799; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5800; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5801; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5802; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5803; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5804; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5805; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5806; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5807; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5808; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5809; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5810; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5811; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5812; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5813; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5814; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5815; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5816; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5817; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5818; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5819; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5820; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5821; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5822; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5823; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5824; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5825; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5826; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5827; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5828; GFX1064-NEXT:    s_mov_b32 s2, -1
5829; GFX1064-NEXT:    ; implicit-def: $vgpr0
5830; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5831; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5832; GFX1064-NEXT:  ; %bb.1:
5833; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5834; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5835; GFX1064-NEXT:    s_mov_b32 s3, s7
5836; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5837; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5838; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5839; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5840; GFX1064-NEXT:    buffer_gl0_inv
5841; GFX1064-NEXT:  .LBB21_2:
5842; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5843; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5844; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5845; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5846; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5847; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5848; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5849; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5850; GFX1064-NEXT:    s_endpgm
5851;
5852; GFX1032-LABEL: umax_i32_varying:
5853; GFX1032:       ; %bb.0: ; %entry
5854; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5855; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5856; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5857; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5858; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5859; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5860; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5861; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5862; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5863; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5864; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5865; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5866; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5867; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5868; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5869; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5870; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5871; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5872; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5873; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5874; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5875; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5876; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5877; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5878; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5879; GFX1032-NEXT:    s_mov_b32 s2, -1
5880; GFX1032-NEXT:    ; implicit-def: $vgpr0
5881; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5882; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5883; GFX1032-NEXT:  ; %bb.1:
5884; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5885; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5886; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5887; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5888; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5889; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5890; GFX1032-NEXT:    buffer_gl0_inv
5891; GFX1032-NEXT:  .LBB21_2:
5892; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5893; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5894; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5895; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5896; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5897; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5898; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5899; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5900; GFX1032-NEXT:    s_endpgm
5901;
5902; GFX1164-LABEL: umax_i32_varying:
5903; GFX1164:       ; %bb.0: ; %entry
5904; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5905; GFX1164-NEXT:    s_not_b64 exec, exec
5906; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5907; GFX1164-NEXT:    s_not_b64 exec, exec
5908; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5909; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5910; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5911; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5912; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5913; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5914; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5915; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5916; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5917; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5918; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5919; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5920; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5921; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5922; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5923; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5924; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5925; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5926; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5927; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5928; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5929; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5930; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5931; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5932; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5933; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5934; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5935; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5936; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5937; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5938; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5939; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5940; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5941; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5942; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5943; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5944; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5945; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5946; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5947; GFX1164-NEXT:    s_mov_b32 s2, -1
5948; GFX1164-NEXT:    ; implicit-def: $vgpr0
5949; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5950; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5951; GFX1164-NEXT:  ; %bb.1:
5952; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5953; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5954; GFX1164-NEXT:    s_mov_b32 s3, s7
5955; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5956; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5957; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5958; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5959; GFX1164-NEXT:    buffer_gl0_inv
5960; GFX1164-NEXT:  .LBB21_2:
5961; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5962; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5963; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5964; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5965; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5966; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5967; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5968; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5969; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5970; GFX1164-NEXT:    s_endpgm
5971;
5972; GFX1132-LABEL: umax_i32_varying:
5973; GFX1132:       ; %bb.0: ; %entry
5974; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5975; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5976; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5977; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5978; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5979; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5980; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5981; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5982; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5983; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5984; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5985; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5986; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5987; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5988; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5989; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5990; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5991; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5992; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5993; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5994; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5995; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5996; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5997; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5998; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5999; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6000; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6001; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6002; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6003; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6004; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6005; GFX1132-NEXT:    s_mov_b32 s2, -1
6006; GFX1132-NEXT:    ; implicit-def: $vgpr0
6007; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6008; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
6009; GFX1132-NEXT:  ; %bb.1:
6010; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6011; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6012; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6013; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6014; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
6015; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6016; GFX1132-NEXT:    buffer_gl0_inv
6017; GFX1132-NEXT:  .LBB21_2:
6018; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6019; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6020; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6021; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6022; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
6023; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6024; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6025; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6026; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6027; GFX1132-NEXT:    s_endpgm
6028entry:
6029  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6030  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6031  store i32 %old, i32 addrspace(1)* %out
6032  ret void
6033}
6034
6035define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
6036; 64-bit unsigned max on local (addrspace(3)) @local_var64 with the constant operand 5; the old value is stored to %out.
6037; Every run below expects ds_max_rtn_u64 guarded by an mbcnt == 0 compare, followed by a v_readfirstlane of the result.
6038; GFX7LESS-LABEL: umax_i64_constant:
6039; GFX7LESS:       ; %bb.0: ; %entry
6040; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6041; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6042; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6043; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6044; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6045; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6046; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
6047; GFX7LESS-NEXT:  ; %bb.1:
6048; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6049; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6050; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6051; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6052; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6053; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6054; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6055; GFX7LESS-NEXT:  .LBB22_2:
6056; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6057; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6058; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6059; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6060; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6061; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6062; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6063; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6064; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
6065; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6066; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
6067; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6068; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6069; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6070; GFX7LESS-NEXT:    s_endpgm
6071;
6072; GFX8-LABEL: umax_i64_constant:
6073; GFX8:       ; %bb.0: ; %entry
6074; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6075; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6076; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6077; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6078; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6079; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6080; GFX8-NEXT:    s_cbranch_execz .LBB22_2
6081; GFX8-NEXT:  ; %bb.1:
6082; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6083; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6084; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6085; GFX8-NEXT:    s_mov_b32 m0, -1
6086; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6087; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6088; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6089; GFX8-NEXT:  .LBB22_2:
6090; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6091; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6092; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6093; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
6094; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6095; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6096; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6097; GFX8-NEXT:    v_mov_b32_e32 v2, s2
6098; GFX8-NEXT:    v_mov_b32_e32 v1, s3
6099; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6100; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6101; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6102; GFX8-NEXT:    s_mov_b32 s2, -1
6103; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6104; GFX8-NEXT:    s_endpgm
6105;
6106; GFX9-LABEL: umax_i64_constant:
6107; GFX9:       ; %bb.0: ; %entry
6108; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6109; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6110; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6111; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6112; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6113; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6114; GFX9-NEXT:    s_cbranch_execz .LBB22_2
6115; GFX9-NEXT:  ; %bb.1:
6116; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6117; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6118; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6119; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6120; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6122; GFX9-NEXT:  .LBB22_2:
6123; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6124; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6125; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6126; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
6127; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6128; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6129; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6130; GFX9-NEXT:    v_mov_b32_e32 v2, s2
6131; GFX9-NEXT:    v_mov_b32_e32 v1, s3
6132; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6133; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6134; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6135; GFX9-NEXT:    s_mov_b32 s2, -1
6136; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6137; GFX9-NEXT:    s_endpgm
6138;
6139; GFX1064-LABEL: umax_i64_constant:
6140; GFX1064:       ; %bb.0: ; %entry
6141; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6142; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6143; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6144; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6145; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6146; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6147; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
6148; GFX1064-NEXT:  ; %bb.1:
6149; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6150; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6151; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6152; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6153; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6154; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6155; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6156; GFX1064-NEXT:    buffer_gl0_inv
6157; GFX1064-NEXT:  .LBB22_2:
6158; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6159; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6160; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6161; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6162; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6163; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6164; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6165; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6166; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6167; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6168; GFX1064-NEXT:    s_mov_b32 s2, -1
6169; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6170; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6171; GFX1064-NEXT:    s_endpgm
6172;
6173; GFX1032-LABEL: umax_i64_constant:
6174; GFX1032:       ; %bb.0: ; %entry
6175; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6176; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6177; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6178; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6179; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6180; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
6181; GFX1032-NEXT:  ; %bb.1:
6182; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6183; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6184; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6185; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6186; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6187; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6188; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6189; GFX1032-NEXT:    buffer_gl0_inv
6190; GFX1032-NEXT:  .LBB22_2:
6191; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6192; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6193; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6194; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6195; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6196; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6197; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6198; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6199; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6200; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6201; GFX1032-NEXT:    s_mov_b32 s2, -1
6202; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6203; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6204; GFX1032-NEXT:    s_endpgm
6205;
6206; GFX1164-LABEL: umax_i64_constant:
6207; GFX1164:       ; %bb.0: ; %entry
6208; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6209; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6210; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6211; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6212; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6213; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6214; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6215; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
6216; GFX1164-NEXT:  ; %bb.1:
6217; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6218; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6219; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6220; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6221; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6222; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6223; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6224; GFX1164-NEXT:    buffer_gl0_inv
6225; GFX1164-NEXT:  .LBB22_2:
6226; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6227; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6228; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6229; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6230; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6231; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6232; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6233; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6234; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6235; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6236; GFX1164-NEXT:    s_mov_b32 s2, -1
6237; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6238; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6239; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6240; GFX1164-NEXT:    s_endpgm
6241;
6242; GFX1132-LABEL: umax_i64_constant:
6243; GFX1132:       ; %bb.0: ; %entry
6244; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6245; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6246; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6247; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6248; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6249; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6250; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6251; GFX1132-NEXT:  ; %bb.1:
6252; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6253; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
6254; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6255; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6256; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6257; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6258; GFX1132-NEXT:    buffer_gl0_inv
6259; GFX1132-NEXT:  .LBB22_2:
6260; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6261; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6262; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6263; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6264; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6265; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6266; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6267; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6268; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6269; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6270; GFX1132-NEXT:    s_mov_b32 s2, -1
6271; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6272; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6273; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6274; GFX1132-NEXT:    s_endpgm
6275entry:
6276  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
6277  store i64 %old, i64 addrspace(1)* %out
6278  ret void
6279}
6280
6281define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6282; 32-bit unsigned min on local (addrspace(3)) @local_var32 with a per-lane operand (workitem id x); the old value is stored to %out.
6283; The GFX7LESS run expects a bare ds_min_rtn_u32; every other run expects a v_min_u32_dpp sequence seeded with -1 before it.
6284; GFX7LESS-LABEL: umin_i32_varying:
6285; GFX7LESS:       ; %bb.0: ; %entry
6286; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6287; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6288; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6289; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6290; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6291; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6292; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6293; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6294; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6295; GFX7LESS-NEXT:    s_endpgm
6296;
6297; GFX8-LABEL: umin_i32_varying:
6298; GFX8:       ; %bb.0: ; %entry
6299; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6300; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6301; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6302; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6303; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6304; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6305; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6306; GFX8-NEXT:    s_not_b64 exec, exec
6307; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6308; GFX8-NEXT:    s_not_b64 exec, exec
6309; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6310; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6311; GFX8-NEXT:    s_nop 1
6312; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6313; GFX8-NEXT:    s_nop 1
6314; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6315; GFX8-NEXT:    s_nop 1
6316; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6317; GFX8-NEXT:    s_nop 1
6318; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6319; GFX8-NEXT:    s_nop 1
6320; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6321; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6322; GFX8-NEXT:    s_nop 0
6323; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6324; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6325; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6326; GFX8-NEXT:    ; implicit-def: $vgpr0
6327; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6328; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6329; GFX8-NEXT:  ; %bb.1:
6330; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6331; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6332; GFX8-NEXT:    s_mov_b32 m0, -1
6333; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6334; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6336; GFX8-NEXT:  .LBB23_2:
6337; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6338; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6339; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6340; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6341; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6342; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6343; GFX8-NEXT:    s_mov_b32 s2, -1
6344; GFX8-NEXT:    s_nop 0
6345; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6346; GFX8-NEXT:    s_endpgm
6347;
6348; GFX9-LABEL: umin_i32_varying:
6349; GFX9:       ; %bb.0: ; %entry
6350; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6351; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6352; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6353; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6354; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6355; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6356; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6357; GFX9-NEXT:    s_not_b64 exec, exec
6358; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6359; GFX9-NEXT:    s_not_b64 exec, exec
6360; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6361; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6362; GFX9-NEXT:    s_nop 1
6363; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6364; GFX9-NEXT:    s_nop 1
6365; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6366; GFX9-NEXT:    s_nop 1
6367; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6368; GFX9-NEXT:    s_nop 1
6369; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6370; GFX9-NEXT:    s_nop 1
6371; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6372; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6373; GFX9-NEXT:    s_nop 0
6374; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6375; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6376; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6377; GFX9-NEXT:    ; implicit-def: $vgpr0
6378; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6379; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6380; GFX9-NEXT:  ; %bb.1:
6381; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6382; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6383; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6384; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6386; GFX9-NEXT:  .LBB23_2:
6387; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6389; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6390; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6391; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6392; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6393; GFX9-NEXT:    s_mov_b32 s2, -1
6394; GFX9-NEXT:    s_nop 0
6395; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6396; GFX9-NEXT:    s_endpgm
6397;
6398; GFX1064-LABEL: umin_i32_varying:
6399; GFX1064:       ; %bb.0: ; %entry
6400; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6401; GFX1064-NEXT:    s_not_b64 exec, exec
6402; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6403; GFX1064-NEXT:    s_not_b64 exec, exec
6404; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6405; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6406; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6407; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6408; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6409; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6410; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6411; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6412; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6413; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6414; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6415; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6416; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6417; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6418; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6419; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6420; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6421; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6422; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6423; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6424; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6425; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6426; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6427; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6428; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6429; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6430; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6431; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6432; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6433; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6434; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6435; GFX1064-NEXT:    s_mov_b32 s2, -1
6436; GFX1064-NEXT:    ; implicit-def: $vgpr0
6437; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6438; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6439; GFX1064-NEXT:  ; %bb.1:
6440; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6441; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6442; GFX1064-NEXT:    s_mov_b32 s3, s7
6443; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6444; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6445; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6446; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6447; GFX1064-NEXT:    buffer_gl0_inv
6448; GFX1064-NEXT:  .LBB23_2:
6449; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6450; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6451; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6452; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6453; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6454; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6455; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6456; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6457; GFX1064-NEXT:    s_endpgm
6458;
6459; GFX1032-LABEL: umin_i32_varying:
6460; GFX1032:       ; %bb.0: ; %entry
6461; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6462; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6463; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6464; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6465; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6466; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6467; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6468; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6469; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6470; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6471; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6472; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6473; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6474; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6475; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6476; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6477; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6478; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6479; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6480; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6481; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6482; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6483; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6484; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6485; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6486; GFX1032-NEXT:    s_mov_b32 s2, -1
6487; GFX1032-NEXT:    ; implicit-def: $vgpr0
6488; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6489; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6490; GFX1032-NEXT:  ; %bb.1:
6491; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6492; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6493; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6494; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6495; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6496; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6497; GFX1032-NEXT:    buffer_gl0_inv
6498; GFX1032-NEXT:  .LBB23_2:
6499; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6500; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6501; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6502; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6503; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6504; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6505; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6506; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6507; GFX1032-NEXT:    s_endpgm
6508;
6509; GFX1164-LABEL: umin_i32_varying:
6510; GFX1164:       ; %bb.0: ; %entry
6511; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6512; GFX1164-NEXT:    s_not_b64 exec, exec
6513; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6514; GFX1164-NEXT:    s_not_b64 exec, exec
6515; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6516; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6517; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6518; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6519; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6520; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6521; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6522; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6523; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6524; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6525; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6526; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6527; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6528; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6529; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6530; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6531; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6532; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6533; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6534; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6535; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6536; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6537; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6538; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6539; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6540; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6541; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6542; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6543; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6544; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6545; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6546; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6547; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6548; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
6549; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6550; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6551; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6552; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6553; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6554; GFX1164-NEXT:    s_mov_b32 s2, -1
6555; GFX1164-NEXT:    ; implicit-def: $vgpr0
6556; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6557; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6558; GFX1164-NEXT:  ; %bb.1:
6559; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6560; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6561; GFX1164-NEXT:    s_mov_b32 s3, s7
6562; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6563; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6564; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6565; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6566; GFX1164-NEXT:    buffer_gl0_inv
6567; GFX1164-NEXT:  .LBB23_2:
6568; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6569; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6570; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6571; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6572; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6573; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6574; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6575; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6576; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6577; GFX1164-NEXT:    s_endpgm
6578;
6579; GFX1132-LABEL: umin_i32_varying:
6580; GFX1132:       ; %bb.0: ; %entry
6581; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6582; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6583; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6584; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6585; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6586; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6587; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6588; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6589; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6590; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6591; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6592; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6593; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6594; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6595; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6596; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6597; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6598; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6599; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6600; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6601; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6602; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6603; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6604; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6605; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6606; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6607; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6608; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6609; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6610; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6611; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6612; GFX1132-NEXT:    s_mov_b32 s2, -1
6613; GFX1132-NEXT:    ; implicit-def: $vgpr0
6614; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6615; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6616; GFX1132-NEXT:  ; %bb.1:
6617; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6618; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6619; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6620; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6621; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6622; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6623; GFX1132-NEXT:    buffer_gl0_inv
6624; GFX1132-NEXT:  .LBB23_2:
6625; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6626; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6627; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6628; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6629; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6630; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6631; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6632; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6633; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6634; GFX1132-NEXT:    s_endpgm
6635entry:
6636  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6637  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6638  store i32 %old, i32 addrspace(1)* %out
6639  ret void
6640}
6641
6642define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6643; Atomic unsigned-min of the constant 5 into the 64-bit addrspace(3) global @local_var64 (acq_rel); the previous value is stored to %out.
6644; In the expected output below, only the lane whose mbcnt result is 0 issues ds_min_rtn_u64; the returned value is read with v_readfirstlane and then combined per lane by a 64-bit compare plus v_cndmask selects.
6645; GFX7LESS-LABEL: umin_i64_constant:
6646; GFX7LESS:       ; %bb.0: ; %entry
6647; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6648; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6649; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6650; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6651; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6652; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6653; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6654; GFX7LESS-NEXT:  ; %bb.1:
6655; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6656; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6657; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6658; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6659; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6660; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6661; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6662; GFX7LESS-NEXT:  .LBB24_2:
6663; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6664; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6665; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6666; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6667; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6668; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6669; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6670; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6671; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6672; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6673; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6674; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6675; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6676; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6677; GFX7LESS-NEXT:    s_endpgm
6678;
6679; GFX8-LABEL: umin_i64_constant:
6680; GFX8:       ; %bb.0: ; %entry
6681; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6682; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6683; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6684; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6685; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6686; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6687; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6688; GFX8-NEXT:  ; %bb.1:
6689; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6690; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6691; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6692; GFX8-NEXT:    s_mov_b32 m0, -1
6693; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6694; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6695; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6696; GFX8-NEXT:  .LBB24_2:
6697; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6698; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6699; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6700; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6701; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6702; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6703; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6704; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6705; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6706; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6707; GFX8-NEXT:    s_mov_b32 s2, -1
6708; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6709; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6710; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6711; GFX8-NEXT:    s_endpgm
6712;
6713; GFX9-LABEL: umin_i64_constant:
6714; GFX9:       ; %bb.0: ; %entry
6715; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6716; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6717; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6718; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6719; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6720; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6721; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6722; GFX9-NEXT:  ; %bb.1:
6723; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6724; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6725; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6726; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6727; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6728; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6729; GFX9-NEXT:  .LBB24_2:
6730; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6731; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6732; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6733; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6734; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6735; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6736; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6737; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6738; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6739; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6740; GFX9-NEXT:    s_mov_b32 s2, -1
6741; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6742; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6743; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6744; GFX9-NEXT:    s_endpgm
6745;
6746; GFX1064-LABEL: umin_i64_constant:
6747; GFX1064:       ; %bb.0: ; %entry
6748; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6749; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6750; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6751; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6752; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6753; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6754; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6755; GFX1064-NEXT:  ; %bb.1:
6756; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6757; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6758; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6759; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6760; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6761; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6762; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6763; GFX1064-NEXT:    buffer_gl0_inv
6764; GFX1064-NEXT:  .LBB24_2:
6765; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6766; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6767; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6768; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6769; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6770; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6771; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6772; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6773; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6774; GFX1064-NEXT:    s_mov_b32 s2, -1
6775; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6776; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6777; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6778; GFX1064-NEXT:    s_endpgm
6779;
6780; GFX1032-LABEL: umin_i64_constant:
6781; GFX1032:       ; %bb.0: ; %entry
6782; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6783; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6784; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6785; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6786; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6787; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6788; GFX1032-NEXT:  ; %bb.1:
6789; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6790; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6791; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6792; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6793; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6794; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6795; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6796; GFX1032-NEXT:    buffer_gl0_inv
6797; GFX1032-NEXT:  .LBB24_2:
6798; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6799; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6800; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6801; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6802; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6803; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6804; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6805; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6806; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6807; GFX1032-NEXT:    s_mov_b32 s2, -1
6808; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6809; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6810; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6811; GFX1032-NEXT:    s_endpgm
6812;
6813; GFX1164-LABEL: umin_i64_constant:
6814; GFX1164:       ; %bb.0: ; %entry
6815; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6816; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6817; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6818; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6819; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6820; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6821; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6822; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6823; GFX1164-NEXT:  ; %bb.1:
6824; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6825; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6826; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6827; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6828; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6829; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6830; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6831; GFX1164-NEXT:    buffer_gl0_inv
6832; GFX1164-NEXT:  .LBB24_2:
6833; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6834; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6835; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6836; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6837; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6838; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6839; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6840; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6841; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6842; GFX1164-NEXT:    s_mov_b32 s2, -1
6843; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6844; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6845; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6846; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6847; GFX1164-NEXT:    s_endpgm
6848;
6849; GFX1132-LABEL: umin_i64_constant:
6850; GFX1132:       ; %bb.0: ; %entry
6851; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6852; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6853; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6854; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6855; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6856; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6857; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6858; GFX1132-NEXT:  ; %bb.1:
6859; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6860; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
6861; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6862; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6863; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6864; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6865; GFX1132-NEXT:    buffer_gl0_inv
6866; GFX1132-NEXT:  .LBB24_2:
6867; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6868; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6869; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6870; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6871; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6872; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6873; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6874; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6875; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6876; GFX1132-NEXT:    s_mov_b32 s2, -1
6877; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6878; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6879; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6880; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6881; GFX1132-NEXT:    s_endpgm
6882entry:
6883  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel ; operand is the literal 5, identical for every lane
6884  store i64 %old, i64 addrspace(1)* %out ; write the value atomicrmw returned (the pre-operation contents) to %out
6885  ret void
6886}
6887