1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
; Work-item id intrinsic. It is not called by @add_i32_constant or
; @add_i32_uniform; presumably the *_varying kernels use it to get a per-lane
; operand -- confirm against their IR bodies.
10declare i32 @llvm.amdgcn.workitem.id.x()
11
; addrspace(3) globals that serve as the pointer operand of the atomicrmw
; instructions under test. Their initializers are undef.
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; add_i32_constant: each work-item performs `atomicrmw add` of the constant 5
; on @local_var32 with acq_rel ordering and stores the value it read to %out.
;
; The autogenerated assertions below show the expected per-wave form of that
; atomic on every tested target: the lane whose v_mbcnt result is 0 issues a
; single ds_add_rtn_u32 of 5 * popcount(exec) (s_bcnt1 followed by s_mul_i32),
; the returned value is broadcast with v_readfirstlane_b32, and each lane then
; adds 5 * its own v_mbcnt count with v_mad_u32_u24 before the store.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
177; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
178; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
179; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
180; GFX1164-NEXT:  ; %bb.1:
181; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
182; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
183; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
184; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
185; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
186; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
187; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
188; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
189; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX1164-NEXT:    buffer_gl0_inv
191; GFX1164-NEXT:  .LBB0_2:
192; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
193; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
194; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
195; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
196; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
197; GFX1164-NEXT:    s_mov_b32 s2, -1
198; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
200; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
201; GFX1164-NEXT:    s_endpgm
202;
203; GFX1132-LABEL: add_i32_constant:
204; GFX1132:       ; %bb.0: ; %entry
205; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
206; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
207; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
208; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
209; GFX1132-NEXT:    ; implicit-def: $vgpr1
210; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
211; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
212; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
213; GFX1132-NEXT:  ; %bb.1:
214; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
215; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
216; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
217; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
218; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
219; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
220; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
221; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
222; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX1132-NEXT:    buffer_gl0_inv
224; GFX1132-NEXT:  .LBB0_2:
225; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
226; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
227; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
228; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
229; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
230; GFX1132-NEXT:    s_mov_b32 s2, -1
231; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
233; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
234; GFX1132-NEXT:    s_endpgm
235entry:
  ; %old is the value @local_var32 held before the add; the store below uses
  ; it, so the returning (ds_add_rtn) form of the atomic is required.
236  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
237  store i32 %old, i32 addrspace(1)* %out
238  ret void
239}
240
; add_i32_uniform: each work-item performs `atomicrmw add` of the kernel
; argument %additive on @local_var32 with acq_rel ordering and stores the value
; it read to %out. In the expected code the argument is fetched with a scalar
; load (s_load_dword / s_load_b32).
;
; The autogenerated assertions below show the expected per-wave form: the lane
; whose v_mbcnt result is 0 issues a single ds_add_rtn_u32 of
; %additive * popcount(exec) (s_bcnt1 followed by s_mul_i32), the returned
; value is broadcast with v_readfirstlane_b32, and each lane then adds
; %additive * its own v_mbcnt count. That final step is v_mul_lo_u32 plus an
; add on the default, tonga and gfx900 runs, and a single v_mad_u64_u32 on the
; gfx1010 and gfx1100 runs.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
241define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
242;
243;
244; GFX7LESS-LABEL: add_i32_uniform:
245; GFX7LESS:       ; %bb.0: ; %entry
246; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
247; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
248; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
249; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
250; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
251; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
252; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
253; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
254; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
255; GFX7LESS-NEXT:  ; %bb.1:
256; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
257; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
259; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
260; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
261; GFX7LESS-NEXT:    s_mov_b32 m0, -1
262; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
264; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX7LESS-NEXT:  .LBB1_2:
266; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
267; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
269; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
270; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
271; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
272; GFX7LESS-NEXT:    s_mov_b32 s6, -1
273; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
274; GFX7LESS-NEXT:    s_endpgm
275;
276; GFX8-LABEL: add_i32_uniform:
277; GFX8:       ; %bb.0: ; %entry
278; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
279; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
280; GFX8-NEXT:    s_mov_b64 s[2:3], exec
281; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
282; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
283; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
284; GFX8-NEXT:    ; implicit-def: $vgpr1
285; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
286; GFX8-NEXT:    s_cbranch_execz .LBB1_2
287; GFX8-NEXT:  ; %bb.1:
288; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
289; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX8-NEXT:    s_mul_i32 s2, s6, s2
291; GFX8-NEXT:    v_mov_b32_e32 v1, 0
292; GFX8-NEXT:    v_mov_b32_e32 v2, s2
293; GFX8-NEXT:    s_mov_b32 m0, -1
294; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
296; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX8-NEXT:  .LBB1_2:
298; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
299; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
300; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
301; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
302; GFX8-NEXT:    s_mov_b32 s7, 0xf000
303; GFX8-NEXT:    s_mov_b32 s6, -1
304; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
305; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
306; GFX8-NEXT:    s_endpgm
307;
308; GFX9-LABEL: add_i32_uniform:
309; GFX9:       ; %bb.0: ; %entry
310; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
311; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
312; GFX9-NEXT:    s_mov_b64 s[2:3], exec
313; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
314; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
315; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
316; GFX9-NEXT:    ; implicit-def: $vgpr1
317; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
318; GFX9-NEXT:    s_cbranch_execz .LBB1_2
319; GFX9-NEXT:  ; %bb.1:
320; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
321; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX9-NEXT:    s_mul_i32 s2, s6, s2
323; GFX9-NEXT:    v_mov_b32_e32 v1, 0
324; GFX9-NEXT:    v_mov_b32_e32 v2, s2
325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
327; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX9-NEXT:  .LBB1_2:
329; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
332; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
333; GFX9-NEXT:    s_mov_b32 s7, 0xf000
334; GFX9-NEXT:    s_mov_b32 s6, -1
335; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
336; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
337; GFX9-NEXT:    s_endpgm
338;
339; GFX1064-LABEL: add_i32_uniform:
340; GFX1064:       ; %bb.0: ; %entry
341; GFX1064-NEXT:    s_clause 0x1
342; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
343; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
344; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
345; GFX1064-NEXT:    ; implicit-def: $vgpr1
346; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
347; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
348; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
349; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
350; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
351; GFX1064-NEXT:  ; %bb.1:
352; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
353; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
354; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
356; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
357; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
358; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
359; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
360; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX1064-NEXT:    buffer_gl0_inv
362; GFX1064-NEXT:  .LBB1_2:
363; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
364; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
365; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
366; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
367; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
369; GFX1064-NEXT:    s_mov_b32 s6, -1
370; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
371; GFX1064-NEXT:    s_endpgm
372;
373; GFX1032-LABEL: add_i32_uniform:
374; GFX1032:       ; %bb.0: ; %entry
375; GFX1032-NEXT:    s_clause 0x1
376; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
377; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
378; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
379; GFX1032-NEXT:    ; implicit-def: $vgpr1
380; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
381; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
382; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
383; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
384; GFX1032-NEXT:  ; %bb.1:
385; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
386; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
387; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
389; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
390; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
391; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
392; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
393; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX1032-NEXT:    buffer_gl0_inv
395; GFX1032-NEXT:  .LBB1_2:
396; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
397; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
398; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
399; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
400; GFX1032-NEXT:    s_mov_b32 s6, -1
401; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
403; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
404; GFX1032-NEXT:    s_endpgm
405;
406; GFX1164-LABEL: add_i32_uniform:
407; GFX1164:       ; %bb.0: ; %entry
408; GFX1164-NEXT:    s_clause 0x1
409; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
410; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
411; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
412; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
413; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
414; GFX1164-NEXT:    ; implicit-def: $vgpr1
415; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
416; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
417; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
418; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
419; GFX1164-NEXT:  ; %bb.1:
420; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
421; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
422; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
424; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
425; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
426; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
427; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
428; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
429; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX1164-NEXT:    buffer_gl0_inv
431; GFX1164-NEXT:  .LBB1_2:
432; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
433; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
434; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
435; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
436; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
437; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
438; GFX1164-NEXT:    s_mov_b32 s6, -1
439; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
440; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
441; GFX1164-NEXT:    s_endpgm
442;
443; GFX1132-LABEL: add_i32_uniform:
444; GFX1132:       ; %bb.0: ; %entry
445; GFX1132-NEXT:    s_clause 0x1
446; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
447; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
448; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
449; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
450; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
451; GFX1132-NEXT:    ; implicit-def: $vgpr1
452; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
453; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
454; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
455; GFX1132-NEXT:  ; %bb.1:
456; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
457; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
458; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
460; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
461; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
462; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
463; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
464; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
465; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
466; GFX1132-NEXT:    buffer_gl0_inv
467; GFX1132-NEXT:  .LBB1_2:
468; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
469; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
470; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
471; GFX1132-NEXT:    s_mov_b32 s6, -1
472; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
474; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
475; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
476; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
477; GFX1132-NEXT:    s_endpgm
478entry:
  ; %old is the value @local_var32 held before the add; the store below uses
  ; it, so the returning (ds_add_rtn) form of the atomic is required.
479  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
480  store i32 %old, i32 addrspace(1)* %out
481  ret void
482}
483
484define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
485;
486;
487; GFX7LESS-LABEL: add_i32_varying:
488; GFX7LESS:       ; %bb.0: ; %entry
489; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
490; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
491; GFX7LESS-NEXT:    s_mov_b32 m0, -1
492; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
494; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
495; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
496; GFX7LESS-NEXT:    s_mov_b32 s2, -1
497; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
498; GFX7LESS-NEXT:    s_endpgm
499;
500; GFX8-LABEL: add_i32_varying:
501; GFX8:       ; %bb.0: ; %entry
502; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
503; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
504; GFX8-NEXT:    v_mov_b32_e32 v1, 0
505; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
506; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
507; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
508; GFX8-NEXT:    v_mov_b32_e32 v2, v0
509; GFX8-NEXT:    s_not_b64 exec, exec
510; GFX8-NEXT:    v_mov_b32_e32 v2, 0
511; GFX8-NEXT:    s_not_b64 exec, exec
512; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
513; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
514; GFX8-NEXT:    s_nop 1
515; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
516; GFX8-NEXT:    s_nop 1
517; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
518; GFX8-NEXT:    s_nop 1
519; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
520; GFX8-NEXT:    s_nop 1
521; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
522; GFX8-NEXT:    s_nop 1
523; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
524; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
525; GFX8-NEXT:    s_nop 0
526; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
527; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
528; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
529; GFX8-NEXT:    ; implicit-def: $vgpr0
530; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
531; GFX8-NEXT:    s_cbranch_execz .LBB2_2
532; GFX8-NEXT:  ; %bb.1:
533; GFX8-NEXT:    v_mov_b32_e32 v0, 0
534; GFX8-NEXT:    v_mov_b32_e32 v3, s4
535; GFX8-NEXT:    s_mov_b32 m0, -1
536; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
538; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX8-NEXT:  .LBB2_2:
540; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
541; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
543; GFX8-NEXT:    v_mov_b32_e32 v0, v1
544; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
545; GFX8-NEXT:    s_mov_b32 s3, 0xf000
546; GFX8-NEXT:    s_mov_b32 s2, -1
547; GFX8-NEXT:    s_nop 0
548; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
549; GFX8-NEXT:    s_endpgm
550;
551; GFX9-LABEL: add_i32_varying:
552; GFX9:       ; %bb.0: ; %entry
553; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
554; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
555; GFX9-NEXT:    v_mov_b32_e32 v1, 0
556; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
557; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
558; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
559; GFX9-NEXT:    v_mov_b32_e32 v2, v0
560; GFX9-NEXT:    s_not_b64 exec, exec
561; GFX9-NEXT:    v_mov_b32_e32 v2, 0
562; GFX9-NEXT:    s_not_b64 exec, exec
563; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
564; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
565; GFX9-NEXT:    s_nop 1
566; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
567; GFX9-NEXT:    s_nop 1
568; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
569; GFX9-NEXT:    s_nop 1
570; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
571; GFX9-NEXT:    s_nop 1
572; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
573; GFX9-NEXT:    s_nop 1
574; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
575; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
576; GFX9-NEXT:    s_nop 0
577; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
578; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
579; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
580; GFX9-NEXT:    ; implicit-def: $vgpr0
581; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
582; GFX9-NEXT:    s_cbranch_execz .LBB2_2
583; GFX9-NEXT:  ; %bb.1:
584; GFX9-NEXT:    v_mov_b32_e32 v0, 0
585; GFX9-NEXT:    v_mov_b32_e32 v3, s4
586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
588; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
589; GFX9-NEXT:  .LBB2_2:
590; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
591; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
593; GFX9-NEXT:    v_mov_b32_e32 v0, v1
594; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
595; GFX9-NEXT:    s_mov_b32 s3, 0xf000
596; GFX9-NEXT:    s_mov_b32 s2, -1
597; GFX9-NEXT:    s_nop 0
598; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
599; GFX9-NEXT:    s_endpgm
600;
601; GFX1064-LABEL: add_i32_varying:
602; GFX1064:       ; %bb.0: ; %entry
603; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
604; GFX1064-NEXT:    s_not_b64 exec, exec
605; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
606; GFX1064-NEXT:    s_not_b64 exec, exec
607; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
608; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
609; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
610; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
611; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
612; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
613; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
614; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
615; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
616; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
617; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
618; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
619; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
620; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
621; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
622; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
623; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
624; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
625; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
626; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
627; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
628; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
629; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
630; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
631; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
632; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
633; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
634; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
635; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
636; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
637; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
638; GFX1064-NEXT:    s_mov_b32 s2, -1
639; GFX1064-NEXT:    ; implicit-def: $vgpr0
640; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
641; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
642; GFX1064-NEXT:  ; %bb.1:
643; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
644; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
645; GFX1064-NEXT:    s_mov_b32 s3, s7
646; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
647; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
648; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
649; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX1064-NEXT:    buffer_gl0_inv
651; GFX1064-NEXT:  .LBB2_2:
652; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
653; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
654; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
655; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
656; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
657; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
658; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
660; GFX1064-NEXT:    s_endpgm
661;
662; GFX1032-LABEL: add_i32_varying:
663; GFX1032:       ; %bb.0: ; %entry
664; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
665; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
666; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
667; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
668; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
669; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
671; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
672; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
673; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
674; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
675; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
676; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
677; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
678; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
679; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
680; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
681; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
682; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
683; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
684; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
685; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
686; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
687; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
688; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
689; GFX1032-NEXT:    s_mov_b32 s2, -1
690; GFX1032-NEXT:    ; implicit-def: $vgpr0
691; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
692; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
693; GFX1032-NEXT:  ; %bb.1:
694; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
695; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
696; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
697; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
698; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
699; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX1032-NEXT:    buffer_gl0_inv
701; GFX1032-NEXT:  .LBB2_2:
702; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
703; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
704; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
705; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
706; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
707; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
708; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
710; GFX1032-NEXT:    s_endpgm
711;
712; GFX1164-LABEL: add_i32_varying:
713; GFX1164:       ; %bb.0: ; %entry
714; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
715; GFX1164-NEXT:    s_not_b64 exec, exec
716; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
717; GFX1164-NEXT:    s_not_b64 exec, exec
718; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
719; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
720; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
721; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
722; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
723; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
724; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
725; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
726; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
727; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
728; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
729; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
730; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
731; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
732; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
733; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
734; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
735; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
736; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
737; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
738; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
739; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
740; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
741; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
742; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
743; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
744; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
745; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
746; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
747; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
748; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
749; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
750; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
751; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
752; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
753; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
754; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
755; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
756; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
757; GFX1164-NEXT:    s_mov_b32 s2, -1
758; GFX1164-NEXT:    ; implicit-def: $vgpr0
759; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
760; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
761; GFX1164-NEXT:  ; %bb.1:
762; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
763; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
764; GFX1164-NEXT:    s_mov_b32 s3, s7
765; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
766; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
767; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
768; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
769; GFX1164-NEXT:    buffer_gl0_inv
770; GFX1164-NEXT:  .LBB2_2:
771; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
772; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
773; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
774; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
775; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
776; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
777; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
778; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
779; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
780; GFX1164-NEXT:    s_endpgm
781;
782; GFX1132-LABEL: add_i32_varying:
783; GFX1132:       ; %bb.0: ; %entry
784; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
785; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
786; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
787; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
788; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
789; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
790; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
791; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
792; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
793; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
794; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
795; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
796; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
797; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
798; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
799; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
800; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
801; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
802; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
803; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
804; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
805; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
806; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
807; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
808; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
809; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
810; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
811; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
812; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
813; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
814; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
815; GFX1132-NEXT:    s_mov_b32 s2, -1
816; GFX1132-NEXT:    ; implicit-def: $vgpr0
817; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
818; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
819; GFX1132-NEXT:  ; %bb.1:
820; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
821; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
822; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
823; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
824; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
825; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX1132-NEXT:    buffer_gl0_inv
827; GFX1132-NEXT:  .LBB2_2:
828; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
829; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
830; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
831; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
832; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
833; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
834; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
836; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
837; GFX1132-NEXT:    s_endpgm
838entry:
839  %lane = call i32 @llvm.amdgcn.workitem.id.x()
840  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
841  store i32 %old, i32 addrspace(1)* %out
842  ret void
843}
844
; Test add_i32_varying_nouse
;
; Each lane adds its own workitem id (a per-lane value) to the LDS variable
; @local_var32 with an acq_rel atomicrmw add; the value returned by the atomic
; has no users. The autogenerated checks below record the expected code for
; every RUN line in the file header:
;   * the GFX7LESS run expects a plain per-lane ds_add_u32 with no cross-lane
;     reduction;
;   * the GFX8 and GFX9 runs expect a row_shr / row_bcast DPP add sequence, a
;     v_readlane of lane 63, and one ds_add_u32 issued only where the mbcnt
;     result equals 0;
;   * the GFX10 and GFX11 runs expect a row_xmask DPP add sequence combined
;     through v_permlanex16 (plus readlane + s_add_i32, or v_permlane64, in
;     the wave64 variants) before the same single guarded ds_add_u32.
; In every variant the non-returning ds_add_u32 form is expected.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
845define amdgpu_kernel void @add_i32_varying_nouse() {
846; GFX7LESS-LABEL: add_i32_varying_nouse:
847; GFX7LESS:       ; %bb.0: ; %entry
848; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
849; GFX7LESS-NEXT:    s_mov_b32 m0, -1
850; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX7LESS-NEXT:    ds_add_u32 v1, v0
852; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
853; GFX7LESS-NEXT:    s_endpgm
854;
855; GFX8-LABEL: add_i32_varying_nouse:
856; GFX8:       ; %bb.0: ; %entry
857; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
858; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
859; GFX8-NEXT:    v_mov_b32_e32 v1, v0
860; GFX8-NEXT:    s_not_b64 exec, exec
861; GFX8-NEXT:    v_mov_b32_e32 v1, 0
862; GFX8-NEXT:    s_not_b64 exec, exec
863; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
864; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
865; GFX8-NEXT:    s_nop 1
866; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX8-NEXT:    s_nop 1
870; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
871; GFX8-NEXT:    s_nop 1
872; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
873; GFX8-NEXT:    s_nop 1
874; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
875; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
876; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
877; GFX8-NEXT:    s_mov_b32 s0, s2
878; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
879; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
880; GFX8-NEXT:    s_cbranch_execz .LBB3_2
881; GFX8-NEXT:  ; %bb.1:
882; GFX8-NEXT:    v_mov_b32_e32 v0, 0
883; GFX8-NEXT:    v_mov_b32_e32 v2, s0
884; GFX8-NEXT:    s_mov_b32 m0, -1
885; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX8-NEXT:    ds_add_u32 v0, v2
887; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX8-NEXT:  .LBB3_2:
889; GFX8-NEXT:    s_endpgm
890;
891; GFX9-LABEL: add_i32_varying_nouse:
892; GFX9:       ; %bb.0: ; %entry
893; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
894; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
895; GFX9-NEXT:    v_mov_b32_e32 v1, v0
896; GFX9-NEXT:    s_not_b64 exec, exec
897; GFX9-NEXT:    v_mov_b32_e32 v1, 0
898; GFX9-NEXT:    s_not_b64 exec, exec
899; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
900; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX9-NEXT:    s_nop 1
902; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX9-NEXT:    s_nop 1
904; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
905; GFX9-NEXT:    s_nop 1
906; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
907; GFX9-NEXT:    s_nop 1
908; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
909; GFX9-NEXT:    s_nop 1
910; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
911; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
912; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
913; GFX9-NEXT:    s_mov_b32 s0, s2
914; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
915; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
916; GFX9-NEXT:    s_cbranch_execz .LBB3_2
917; GFX9-NEXT:  ; %bb.1:
918; GFX9-NEXT:    v_mov_b32_e32 v0, 0
919; GFX9-NEXT:    v_mov_b32_e32 v2, s0
920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NEXT:    ds_add_u32 v0, v2
922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX9-NEXT:  .LBB3_2:
924; GFX9-NEXT:    s_endpgm
925;
926; GFX1064-LABEL: add_i32_varying_nouse:
927; GFX1064:       ; %bb.0: ; %entry
928; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
929; GFX1064-NEXT:    s_not_b64 exec, exec
930; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
931; GFX1064-NEXT:    s_not_b64 exec, exec
932; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
933; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
934; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
935; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
936; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
937; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
938; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
939; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
940; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
941; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
942; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
943; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
944; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
945; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
946; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
947; GFX1064-NEXT:    s_add_i32 s0, s2, s3
948; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
949; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
950; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
951; GFX1064-NEXT:  ; %bb.1:
952; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
953; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
954; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
955; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
956; GFX1064-NEXT:    ds_add_u32 v0, v3
957; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX1064-NEXT:    buffer_gl0_inv
959; GFX1064-NEXT:  .LBB3_2:
960; GFX1064-NEXT:    s_endpgm
961;
962; GFX1032-LABEL: add_i32_varying_nouse:
963; GFX1032:       ; %bb.0: ; %entry
964; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
965; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
966; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
967; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
968; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
969; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
970; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
971; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
972; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
973; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
974; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
975; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
976; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
977; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
978; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
979; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
980; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
981; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
982; GFX1032-NEXT:  ; %bb.1:
983; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
984; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
985; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
986; GFX1032-NEXT:    ds_add_u32 v3, v0
987; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX1032-NEXT:    buffer_gl0_inv
989; GFX1032-NEXT:  .LBB3_2:
990; GFX1032-NEXT:    s_endpgm
991;
992; GFX1164-LABEL: add_i32_varying_nouse:
993; GFX1164:       ; %bb.0: ; %entry
994; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
995; GFX1164-NEXT:    s_not_b64 exec, exec
996; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
997; GFX1164-NEXT:    s_not_b64 exec, exec
998; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
999; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1000; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1001; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1002; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1003; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1004; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1005; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1006; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
1007; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1008; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1009; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1010; GFX1164-NEXT:    v_permlane64_b32 v2, v1
1011; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1012; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1013; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1014; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
1015; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1016; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1017; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1018; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
1019; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1020; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
1021; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
1022; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
1023; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
1024; GFX1164-NEXT:  ; %bb.1:
1025; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1026; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1027; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1028; GFX1164-NEXT:    ds_add_u32 v3, v0
1029; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX1164-NEXT:    buffer_gl0_inv
1031; GFX1164-NEXT:  .LBB3_2:
1032; GFX1164-NEXT:    s_endpgm
1033;
1034; GFX1132-LABEL: add_i32_varying_nouse:
1035; GFX1132:       ; %bb.0: ; %entry
1036; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
1037; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1038; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1039; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1040; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1041; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1042; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1043; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1044; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1045; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1046; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1047; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1048; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1049; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1050; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1051; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1052; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1053; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1054; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1055; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1056; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1057; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1058; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1059; GFX1132-NEXT:  ; %bb.1:
1060; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1061; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1062; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1063; GFX1132-NEXT:    ds_add_u32 v3, v0
1064; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX1132-NEXT:    buffer_gl0_inv
1066; GFX1132-NEXT:  .LBB3_2:
1067; GFX1132-NEXT:    s_endpgm
; IR under test. %lane is the x workitem id and is the per-lane addend; %old,
; the value returned by the atomic, has no users in this function.
1068entry:
1069  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1070  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1071  ret void
1072}
1073
; Test add_i64_constant
;
; Every lane performs an acq_rel atomicrmw add of the constant 5 to the 64-bit
; LDS variable @local_var64 and stores the returned old value to %out.
; The autogenerated checks below expect the same overall shape for every RUN
; line in the file header:
;   1. v_mbcnt over a saved copy of exec gives each lane its index among the
;      active lanes; only the lane whose index is 0 enters the atomic block.
;   2. That lane adds (s_bcnt1 of the saved exec mask) * 5 with a single
;      ds_add_rtn_u64.
;   3. After exec is restored, v_readfirstlane reads the returned value and each
;      lane adds 5 * its own mbcnt index to it (v_mad_u64_u32, or the
;      mul_u24 / add / addc sequence in the GFX7LESS run) before the store.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
1074define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1075;
1076;
1077; GFX7LESS-LABEL: add_i64_constant:
1078; GFX7LESS:       ; %bb.0: ; %entry
1079; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1080; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1081; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1082; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1083; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1084; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1085; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1086; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1087; GFX7LESS-NEXT:  ; %bb.1:
1088; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1089; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1090; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1091; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1092; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1093; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1095; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX7LESS-NEXT:  .LBB4_2:
1097; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1098; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1100; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1101; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1102; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1103; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1104; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1105; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1106; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1107; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1108; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1109; GFX7LESS-NEXT:    s_endpgm
1110;
1111; GFX8-LABEL: add_i64_constant:
1112; GFX8:       ; %bb.0: ; %entry
1113; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1114; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1115; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1116; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1117; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1118; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1119; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1120; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1121; GFX8-NEXT:  ; %bb.1:
1122; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1123; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1124; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1125; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1126; GFX8-NEXT:    s_mov_b32 m0, -1
1127; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX8-NEXT:  .LBB4_2:
1131; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1132; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1134; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1135; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1136; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1137; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1138; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1139; GFX8-NEXT:    s_mov_b32 s2, -1
1140; GFX8-NEXT:    s_nop 2
1141; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1142; GFX8-NEXT:    s_endpgm
1143;
1144; GFX9-LABEL: add_i64_constant:
1145; GFX9:       ; %bb.0: ; %entry
1146; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1147; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1148; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1149; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1150; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1151; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1152; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1153; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1154; GFX9-NEXT:  ; %bb.1:
1155; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1156; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1157; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1158; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1159; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1161; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX9-NEXT:  .LBB4_2:
1163; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1166; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1167; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1168; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1169; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1170; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1171; GFX9-NEXT:    s_mov_b32 s2, -1
1172; GFX9-NEXT:    s_nop 2
1173; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1174; GFX9-NEXT:    s_endpgm
1175;
1176; GFX1064-LABEL: add_i64_constant:
1177; GFX1064:       ; %bb.0: ; %entry
1178; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1179; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1180; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1181; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1182; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1183; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1184; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1185; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1186; GFX1064-NEXT:  ; %bb.1:
1187; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1188; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1189; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1190; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1191; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1192; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1193; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1194; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX1064-NEXT:    buffer_gl0_inv
1196; GFX1064-NEXT:  .LBB4_2:
1197; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1198; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1199; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1200; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1201; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1202; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1203; GFX1064-NEXT:    s_mov_b32 s2, -1
1204; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1205; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1206; GFX1064-NEXT:    s_endpgm
1207;
1208; GFX1032-LABEL: add_i64_constant:
1209; GFX1032:       ; %bb.0: ; %entry
1210; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1211; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1212; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1213; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1214; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1215; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1216; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1217; GFX1032-NEXT:  ; %bb.1:
1218; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1219; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1220; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1221; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1222; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1223; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1224; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1225; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1226; GFX1032-NEXT:    buffer_gl0_inv
1227; GFX1032-NEXT:  .LBB4_2:
1228; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1229; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1230; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1231; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1232; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1233; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1234; GFX1032-NEXT:    s_mov_b32 s2, -1
1235; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1237; GFX1032-NEXT:    s_endpgm
1238;
1239; GFX1164-LABEL: add_i64_constant:
1240; GFX1164:       ; %bb.0: ; %entry
1241; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1242; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1243; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1244; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1245; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1246; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1247; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1248; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1249; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1250; GFX1164-NEXT:  ; %bb.1:
1251; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1252; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1253; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1254; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1255; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1256; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1257; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1258; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1259; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1260; GFX1164-NEXT:    buffer_gl0_inv
1261; GFX1164-NEXT:  .LBB4_2:
1262; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1263; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1264; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1265; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1266; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1267; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1268; GFX1164-NEXT:    s_mov_b32 s2, -1
1269; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1271; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1272; GFX1164-NEXT:    s_endpgm
1273;
1274; GFX1132-LABEL: add_i64_constant:
1275; GFX1132:       ; %bb.0: ; %entry
1276; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1277; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1278; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1279; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1280; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1281; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1282; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1283; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1284; GFX1132-NEXT:  ; %bb.1:
1285; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1286; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1287; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1288; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1289; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1290; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1291; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1292; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1293; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX1132-NEXT:    buffer_gl0_inv
1295; GFX1132-NEXT:  .LBB4_2:
1296; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1297; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1298; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1299; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1300; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1301; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1302; GFX1132-NEXT:    s_mov_b32 s2, -1
1303; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1305; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1306; GFX1132-NEXT:    s_endpgm
; IR under test. The addend is the immediate 5, identical for every lane; %old,
; the value returned by the atomic, is written to the global pointer %out.
1307entry:
1308  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1309  store i64 %old, i64 addrspace(1)* %out
1310  ret void
1311}
1312
; add_i64_uniform: 64-bit `atomicrmw add` on the addrspace(3) global
; @local_var64 whose operand is the kernel argument %additive; the returned old
; value is stored to %out.
; The expected output below contains a single ds_add_rtn_u64 that is executed
; only by the lane whose mbcnt result is 0 and that adds
; %additive * popcount(exec). After the branch rejoins, every lane rebuilds its
; own result as readfirstlane(old) + %additive * mbcnt.
1313define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1314;
1315;
1316; GFX7LESS-LABEL: add_i64_uniform:
1317; GFX7LESS:       ; %bb.0: ; %entry
1318; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1319; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1320; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1321; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1322; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1323; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1324; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1325; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1326; GFX7LESS-NEXT:  ; %bb.1:
1327; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1328; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1329; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1331; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1332; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1333; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1334; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1335; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1336; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1337; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1339; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1340; GFX7LESS-NEXT:  .LBB5_2:
1341; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1342; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1343; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1344; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1345; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1346; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1347; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1348; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1349; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1350; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1351; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1352; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1353; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1354; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1355; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1356; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1357; GFX7LESS-NEXT:    s_endpgm
1358;
1359; GFX8-LABEL: add_i64_uniform:
1360; GFX8:       ; %bb.0: ; %entry
1361; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1362; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1363; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1364; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1365; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1366; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1367; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1368; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1369; GFX8-NEXT:  ; %bb.1:
1370; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1371; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1372; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1374; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1375; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1376; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1377; GFX8-NEXT:    s_mov_b32 m0, -1
1378; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1380; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX8-NEXT:  .LBB5_2:
1382; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1383; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1384; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1385; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1386; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1387; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1388; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1389; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1390; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1391; GFX8-NEXT:    s_mov_b32 s6, -1
1392; GFX8-NEXT:    s_mov_b32 s4, s0
1393; GFX8-NEXT:    s_mov_b32 s5, s1
1394; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1395; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1396; GFX8-NEXT:    s_endpgm
1397;
1398; GFX9-LABEL: add_i64_uniform:
1399; GFX9:       ; %bb.0: ; %entry
1400; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1401; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1402; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1403; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1404; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1405; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1406; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1407; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1408; GFX9-NEXT:  ; %bb.1:
1409; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1412; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1413; GFX9-NEXT:    s_add_i32 s8, s8, s7
1414; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1415; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1416; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1417; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1419; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1421; GFX9-NEXT:  .LBB5_2:
1422; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1424; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1425; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1426; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1427; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1428; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1429; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1430; GFX9-NEXT:    s_mov_b32 s6, -1
1431; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1432; GFX9-NEXT:    s_mov_b32 s4, s0
1433; GFX9-NEXT:    s_mov_b32 s5, s1
1434; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1435; GFX9-NEXT:    s_endpgm
1436;
1437; GFX1064-LABEL: add_i64_uniform:
1438; GFX1064:       ; %bb.0: ; %entry
1439; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1440; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1441; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1442; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1443; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1444; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1445; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1446; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1447; GFX1064-NEXT:  ; %bb.1:
1448; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1449; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1450; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1452; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1453; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1454; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1455; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1456; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1457; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1458; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1459; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1460; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1461; GFX1064-NEXT:    buffer_gl0_inv
1462; GFX1064-NEXT:  .LBB5_2:
1463; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1464; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1465; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1466; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1467; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1468; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1469; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1470; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1471; GFX1064-NEXT:    s_mov_b32 s2, -1
1472; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1473; GFX1064-NEXT:    s_endpgm
1474;
1475; GFX1032-LABEL: add_i64_uniform:
1476; GFX1032:       ; %bb.0: ; %entry
1477; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1478; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1479; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1480; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1481; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1482; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1483; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1484; GFX1032-NEXT:  ; %bb.1:
1485; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1486; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1487; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1489; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1490; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1491; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1492; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1493; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1494; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1495; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1496; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1497; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX1032-NEXT:    buffer_gl0_inv
1499; GFX1032-NEXT:  .LBB5_2:
1500; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1501; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1502; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1503; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1504; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1506; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1507; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1508; GFX1032-NEXT:    s_mov_b32 s2, -1
1509; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1510; GFX1032-NEXT:    s_endpgm
1511;
1512; GFX1164-LABEL: add_i64_uniform:
1513; GFX1164:       ; %bb.0: ; %entry
1514; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1515; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1516; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1517; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1518; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1519; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1520; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1521; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1522; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1523; GFX1164-NEXT:  ; %bb.1:
1524; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1525; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1526; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1528; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1529; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1530; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1531; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1532; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1533; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1534; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1535; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1536; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1537; GFX1164-NEXT:    buffer_gl0_inv
1538; GFX1164-NEXT:  .LBB5_2:
1539; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1540; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1541; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1542; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1544; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1545; GFX1164-NEXT:    s_mov_b32 s2, -1
1546; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1547; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1548; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1549; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1550; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1551; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552; GFX1164-NEXT:    s_endpgm
1553;
1554; GFX1132-LABEL: add_i64_uniform:
1555; GFX1132:       ; %bb.0: ; %entry
1556; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1557; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1558; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1559; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1560; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1561; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1562; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1563; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1564; GFX1132-NEXT:  ; %bb.1:
1565; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1566; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1567; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1569; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1570; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1571; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1572; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
1573; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
1574; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1575; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1576; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1577; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1578; GFX1132-NEXT:    buffer_gl0_inv
1579; GFX1132-NEXT:  .LBB5_2:
1580; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1581; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1582; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1583; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1585; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1586; GFX1132-NEXT:    s_mov_b32 s2, -1
1587; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1588; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1589; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1590; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1591; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1592; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1593; GFX1132-NEXT:    s_endpgm
1594entry:
1595  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1596  store i64 %old, i64 addrspace(1)* %out
1597  ret void
1598}
1599
; add_i64_varying: 64-bit `atomicrmw add` on the addrspace(3) global
; @local_var64 whose operand is the result of @llvm.amdgcn.workitem.id.x
; zero-extended to i64; the returned old value is stored to %out.
; The expected output below is a plain ds_add_rtn_u64 with no mbcnt / exec
; masking sequence around it, i.e. the atomic is emitted as-is on every target
; covered by this test.
1600define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1601;
1602;
1603; GFX7LESS-LABEL: add_i64_varying:
1604; GFX7LESS:       ; %bb.0: ; %entry
1605; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1606; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1607; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1608; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1610; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1611; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1612; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1613; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1614; GFX7LESS-NEXT:    s_endpgm
1615;
1616; GFX8-LABEL: add_i64_varying:
1617; GFX8:       ; %bb.0: ; %entry
1618; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1619; GFX8-NEXT:    s_mov_b32 m0, -1
1620; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1621; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1623; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1625; GFX8-NEXT:    s_mov_b32 s2, -1
1626; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1627; GFX8-NEXT:    s_endpgm
1628;
1629; GFX9-LABEL: add_i64_varying:
1630; GFX9:       ; %bb.0: ; %entry
1631; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1632; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1633; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1637; GFX9-NEXT:    s_mov_b32 s2, -1
1638; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1639; GFX9-NEXT:    s_endpgm
1640;
1641; GFX10-LABEL: add_i64_varying:
1642; GFX10:       ; %bb.0: ; %entry
1643; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1644; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1645; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1646; GFX10-NEXT:    s_mov_b32 s2, -1
1647; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1648; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1649; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1650; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX10-NEXT:    buffer_gl0_inv
1652; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1653; GFX10-NEXT:    s_endpgm
1654;
1655; GFX11-LABEL: add_i64_varying:
1656; GFX11:       ; %bb.0: ; %entry
1657; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1658; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1659; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1660; GFX11-NEXT:    s_mov_b32 s2, -1
1661; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1662; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1663; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1664; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX11-NEXT:    buffer_gl0_inv
1666; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1667; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1668; GFX11-NEXT:    s_endpgm
1669entry:
1670  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1671  %zext = zext i32 %lane to i64
1672  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1673  store i64 %old, i64 addrspace(1)* %out
1674  ret void
1675}
1676
; sub_i32_constant: 32-bit `atomicrmw sub` of the constant 5 on the
; addrspace(3) global @local_var32; the returned old value is stored to %out.
; The expected output below contains one ds_sub_rtn_u32 that is executed only
; by the lane whose mbcnt result is 0 and that subtracts 5 * popcount(exec).
; After the branch rejoins, every lane computes readfirstlane(old) - 5 * mbcnt.
1677define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1678;
1679;
1680; GFX7LESS-LABEL: sub_i32_constant:
1681; GFX7LESS:       ; %bb.0: ; %entry
1682; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1683; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1684; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1685; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1686; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1687; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1688; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1689; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1690; GFX7LESS-NEXT:  ; %bb.1:
1691; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1692; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1693; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1694; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1695; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1696; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1698; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX7LESS-NEXT:  .LBB7_2:
1700; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1701; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1702; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1703; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1704; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1705; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1706; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1707; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1708; GFX7LESS-NEXT:    s_endpgm
1709;
1710; GFX8-LABEL: sub_i32_constant:
1711; GFX8:       ; %bb.0: ; %entry
1712; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1713; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1714; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1715; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1716; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1717; GFX8-NEXT:    ; implicit-def: $vgpr1
1718; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1719; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1720; GFX8-NEXT:  ; %bb.1:
1721; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1722; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1723; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1724; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1725; GFX8-NEXT:    s_mov_b32 m0, -1
1726; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1728; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1729; GFX8-NEXT:  .LBB7_2:
1730; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1731; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1732; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1733; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1734; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1735; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1736; GFX8-NEXT:    s_mov_b32 s2, -1
1737; GFX8-NEXT:    s_nop 0
1738; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1739; GFX8-NEXT:    s_endpgm
1740;
1741; GFX9-LABEL: sub_i32_constant:
1742; GFX9:       ; %bb.0: ; %entry
1743; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1744; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1745; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1746; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1747; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1748; GFX9-NEXT:    ; implicit-def: $vgpr1
1749; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1750; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1751; GFX9-NEXT:  ; %bb.1:
1752; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1753; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1754; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1755; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1756; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1757; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1758; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX9-NEXT:  .LBB7_2:
1760; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1762; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1763; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1764; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1765; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1766; GFX9-NEXT:    s_mov_b32 s2, -1
1767; GFX9-NEXT:    s_nop 0
1768; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1769; GFX9-NEXT:    s_endpgm
1770;
1771; GFX1064-LABEL: sub_i32_constant:
1772; GFX1064:       ; %bb.0: ; %entry
1773; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1774; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1775; GFX1064-NEXT:    ; implicit-def: $vgpr1
1776; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1777; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1778; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1779; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1780; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1781; GFX1064-NEXT:  ; %bb.1:
1782; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1783; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1784; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1785; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1786; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1787; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1788; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1789; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1790; GFX1064-NEXT:    buffer_gl0_inv
1791; GFX1064-NEXT:  .LBB7_2:
1792; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1793; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1794; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1795; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1796; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1797; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1798; GFX1064-NEXT:    s_mov_b32 s2, -1
1799; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1800; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1801; GFX1064-NEXT:    s_endpgm
1802;
1803; GFX1032-LABEL: sub_i32_constant:
1804; GFX1032:       ; %bb.0: ; %entry
1805; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1806; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1807; GFX1032-NEXT:    ; implicit-def: $vgpr1
1808; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1809; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1810; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1811; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1812; GFX1032-NEXT:  ; %bb.1:
1813; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1814; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1815; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1816; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1817; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1818; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1819; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1820; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX1032-NEXT:    buffer_gl0_inv
1822; GFX1032-NEXT:  .LBB7_2:
1823; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1824; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1825; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1826; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1827; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1828; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1829; GFX1032-NEXT:    s_mov_b32 s2, -1
1830; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1831; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1832; GFX1032-NEXT:    s_endpgm
1833;
1834; GFX1164-LABEL: sub_i32_constant:
1835; GFX1164:       ; %bb.0: ; %entry
1836; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1837; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1838; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1839; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1840; GFX1164-NEXT:    ; implicit-def: $vgpr1
1841; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1842; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1843; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1844; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1845; GFX1164-NEXT:  ; %bb.1:
1846; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1847; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1848; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1849; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1850; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1851; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1852; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1853; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1854; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX1164-NEXT:    buffer_gl0_inv
1856; GFX1164-NEXT:  .LBB7_2:
1857; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1858; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1859; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1860; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1861; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1862; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1863; GFX1164-NEXT:    s_mov_b32 s2, -1
1864; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1866; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1867; GFX1164-NEXT:    s_endpgm
1868;
1869; GFX1132-LABEL: sub_i32_constant:
1870; GFX1132:       ; %bb.0: ; %entry
1871; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1872; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1873; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1874; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1875; GFX1132-NEXT:    ; implicit-def: $vgpr1
1876; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1877; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1878; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1879; GFX1132-NEXT:  ; %bb.1:
1880; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1881; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1882; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1883; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1884; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
1885; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1886; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1887; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1888; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX1132-NEXT:    buffer_gl0_inv
1890; GFX1132-NEXT:  .LBB7_2:
1891; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1892; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1893; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1894; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1895; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1896; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1897; GFX1132-NEXT:    s_mov_b32 s2, -1
1898; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1899; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1900; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1901; GFX1132-NEXT:    s_endpgm
1902entry:
1903  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1904  store i32 %old, i32 addrspace(1)* %out
1905  ret void
1906}
1907
1908define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1909;
1910;
1911; GFX7LESS-LABEL: sub_i32_uniform:
1912; GFX7LESS:       ; %bb.0: ; %entry
1913; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1914; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1915; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1916; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1917; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1918; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1919; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1920; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1921; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1922; GFX7LESS-NEXT:  ; %bb.1:
1923; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1924; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1926; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1927; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1928; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1929; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1930; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1931; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX7LESS-NEXT:  .LBB8_2:
1933; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1934; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1935; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1936; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1937; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1938; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1939; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1940; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1941; GFX7LESS-NEXT:    s_endpgm
1942;
1943; GFX8-LABEL: sub_i32_uniform:
1944; GFX8:       ; %bb.0: ; %entry
1945; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1946; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1947; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1948; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1949; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1950; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1951; GFX8-NEXT:    ; implicit-def: $vgpr1
1952; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1953; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1954; GFX8-NEXT:  ; %bb.1:
1955; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1958; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1959; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1960; GFX8-NEXT:    s_mov_b32 m0, -1
1961; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1963; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1964; GFX8-NEXT:  .LBB8_2:
1965; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1966; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1967; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1968; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1969; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1970; GFX8-NEXT:    s_mov_b32 s6, -1
1971; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1972; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1973; GFX8-NEXT:    s_endpgm
1974;
1975; GFX9-LABEL: sub_i32_uniform:
1976; GFX9:       ; %bb.0: ; %entry
1977; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1978; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1979; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1980; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1981; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1982; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1983; GFX9-NEXT:    ; implicit-def: $vgpr1
1984; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1985; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1986; GFX9-NEXT:  ; %bb.1:
1987; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1988; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1990; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1991; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1995; GFX9-NEXT:  .LBB8_2:
1996; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1999; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2000; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2001; GFX9-NEXT:    s_mov_b32 s6, -1
2002; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
2003; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2004; GFX9-NEXT:    s_endpgm
2005;
2006; GFX1064-LABEL: sub_i32_uniform:
2007; GFX1064:       ; %bb.0: ; %entry
2008; GFX1064-NEXT:    s_clause 0x1
2009; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2010; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
2011; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2012; GFX1064-NEXT:    ; implicit-def: $vgpr1
2013; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2014; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2015; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2016; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2017; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
2018; GFX1064-NEXT:  ; %bb.1:
2019; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2020; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2021; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2022; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
2023; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
2024; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2025; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2026; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2027; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX1064-NEXT:    buffer_gl0_inv
2029; GFX1064-NEXT:  .LBB8_2:
2030; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2031; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
2032; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2033; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
2034; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
2035; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2036; GFX1064-NEXT:    s_mov_b32 s6, -1
2037; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2038; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2039; GFX1064-NEXT:    s_endpgm
2040;
2041; GFX1032-LABEL: sub_i32_uniform:
2042; GFX1032:       ; %bb.0: ; %entry
2043; GFX1032-NEXT:    s_clause 0x1
2044; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2045; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
2046; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2047; GFX1032-NEXT:    ; implicit-def: $vgpr1
2048; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
2049; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2050; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2051; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
2052; GFX1032-NEXT:  ; %bb.1:
2053; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
2054; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2055; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2056; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
2057; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
2058; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2059; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2060; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2061; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2062; GFX1032-NEXT:    buffer_gl0_inv
2063; GFX1032-NEXT:  .LBB8_2:
2064; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2065; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2066; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2067; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2068; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
2069; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2070; GFX1032-NEXT:    s_mov_b32 s6, -1
2071; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2072; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2073; GFX1032-NEXT:    s_endpgm
2074;
2075; GFX1164-LABEL: sub_i32_uniform:
2076; GFX1164:       ; %bb.0: ; %entry
2077; GFX1164-NEXT:    s_clause 0x1
2078; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2079; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2080; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2081; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2082; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2083; GFX1164-NEXT:    ; implicit-def: $vgpr1
2084; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2085; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2086; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2087; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2088; GFX1164-NEXT:  ; %bb.1:
2089; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2090; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2091; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2092; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2093; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2094; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2095; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2096; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2097; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2098; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2099; GFX1164-NEXT:    buffer_gl0_inv
2100; GFX1164-NEXT:  .LBB8_2:
2101; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2102; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2104; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2105; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2106; GFX1164-NEXT:    s_mov_b32 s6, -1
2107; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2108; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2109; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2110; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2111; GFX1164-NEXT:    s_endpgm
2112;
2113; GFX1132-LABEL: sub_i32_uniform:
2114; GFX1132:       ; %bb.0: ; %entry
2115; GFX1132-NEXT:    s_clause 0x1
2116; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2117; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2118; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2119; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2120; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2121; GFX1132-NEXT:    ; implicit-def: $vgpr1
2122; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2123; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2124; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2125; GFX1132-NEXT:  ; %bb.1:
2126; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2127; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2128; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2129; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2130; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2131; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
2132; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2133; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2134; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2135; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX1132-NEXT:    buffer_gl0_inv
2137; GFX1132-NEXT:  .LBB8_2:
2138; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2139; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2140; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2141; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2142; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2143; GFX1132-NEXT:    s_mov_b32 s6, -1
2144; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2145; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2146; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2147; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2148; GFX1132-NEXT:    s_endpgm
2149entry:
2150  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
2151  store i32 %old, i32 addrspace(1)* %out
2152  ret void
2153}
2154
; sub_i32_varying: `atomicrmw sub` on the addrspace(3) global @local_var32 whose
; operand is the workitem id, with the returned old value stored to %out.
; In the assertions below, the run without -mcpu issues the DS atomic directly on
; the per-lane value. Every other run first combines the lane values with DPP
; adds, issues a single ds_sub_rtn_u32 from the lane whose mbcnt result is 0, and
; then rebuilds each lane's result with v_readfirstlane_b32 followed by a
; subtract.
2155define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2156;
2157;
2158; GFX7LESS-LABEL: sub_i32_varying:
2159; GFX7LESS:       ; %bb.0: ; %entry
2160; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2161; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2162; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2163; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2164; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2165; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2166; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2167; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2168; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2169; GFX7LESS-NEXT:    s_endpgm
2170;
2171; GFX8-LABEL: sub_i32_varying:
2172; GFX8:       ; %bb.0: ; %entry
2173; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2174; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2175; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2176; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2177; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2178; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2179; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2180; GFX8-NEXT:    s_not_b64 exec, exec
2181; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2182; GFX8-NEXT:    s_not_b64 exec, exec
2183; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2184; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2185; GFX8-NEXT:    s_nop 1
2186; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2187; GFX8-NEXT:    s_nop 1
2188; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2189; GFX8-NEXT:    s_nop 1
2190; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2191; GFX8-NEXT:    s_nop 1
2192; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2193; GFX8-NEXT:    s_nop 1
2194; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2195; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2196; GFX8-NEXT:    s_nop 0
2197; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2198; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2199; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2200; GFX8-NEXT:    ; implicit-def: $vgpr0
2201; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2202; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2203; GFX8-NEXT:  ; %bb.1:
2204; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2205; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2206; GFX8-NEXT:    s_mov_b32 m0, -1
2207; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX8-NEXT:  .LBB9_2:
2211; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2214; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2215; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2216; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2217; GFX8-NEXT:    s_mov_b32 s2, -1
2218; GFX8-NEXT:    s_nop 0
2219; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2220; GFX8-NEXT:    s_endpgm
2221;
2222; GFX9-LABEL: sub_i32_varying:
2223; GFX9:       ; %bb.0: ; %entry
2224; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2225; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2226; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2227; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2228; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2229; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2230; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2231; GFX9-NEXT:    s_not_b64 exec, exec
2232; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2233; GFX9-NEXT:    s_not_b64 exec, exec
2234; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2235; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2236; GFX9-NEXT:    s_nop 1
2237; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2238; GFX9-NEXT:    s_nop 1
2239; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2240; GFX9-NEXT:    s_nop 1
2241; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2242; GFX9-NEXT:    s_nop 1
2243; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2244; GFX9-NEXT:    s_nop 1
2245; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2246; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2247; GFX9-NEXT:    s_nop 0
2248; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2249; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2250; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2251; GFX9-NEXT:    ; implicit-def: $vgpr0
2252; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2253; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2254; GFX9-NEXT:  ; %bb.1:
2255; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2256; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2257; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2258; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2259; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2260; GFX9-NEXT:  .LBB9_2:
2261; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2262; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2263; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2264; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2265; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2266; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2267; GFX9-NEXT:    s_mov_b32 s2, -1
2268; GFX9-NEXT:    s_nop 0
2269; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2270; GFX9-NEXT:    s_endpgm
2271;
2272; GFX1064-LABEL: sub_i32_varying:
2273; GFX1064:       ; %bb.0: ; %entry
2274; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2275; GFX1064-NEXT:    s_not_b64 exec, exec
2276; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2277; GFX1064-NEXT:    s_not_b64 exec, exec
2278; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2279; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2280; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2281; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2282; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2283; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2284; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2285; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2286; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2287; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2288; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2289; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2290; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2291; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2292; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2293; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2294; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2295; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2296; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2297; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2298; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2299; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2300; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2301; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2302; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2303; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2304; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2305; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2306; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2307; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2308; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2309; GFX1064-NEXT:    s_mov_b32 s2, -1
2310; GFX1064-NEXT:    ; implicit-def: $vgpr0
2311; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2312; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2313; GFX1064-NEXT:  ; %bb.1:
2314; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2315; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2316; GFX1064-NEXT:    s_mov_b32 s3, s7
2317; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2318; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2319; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2320; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2321; GFX1064-NEXT:    buffer_gl0_inv
2322; GFX1064-NEXT:  .LBB9_2:
2323; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2324; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2325; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2326; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2327; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2328; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2329; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2330; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2331; GFX1064-NEXT:    s_endpgm
2332;
2333; GFX1032-LABEL: sub_i32_varying:
2334; GFX1032:       ; %bb.0: ; %entry
2335; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2336; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2337; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2338; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2339; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2340; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2341; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2342; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2343; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2344; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2345; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2346; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2347; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2348; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2349; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2350; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2351; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2352; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2353; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2354; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2355; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2356; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2357; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2358; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2359; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2360; GFX1032-NEXT:    s_mov_b32 s2, -1
2361; GFX1032-NEXT:    ; implicit-def: $vgpr0
2362; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2363; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2364; GFX1032-NEXT:  ; %bb.1:
2365; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2366; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2367; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2368; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2369; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2370; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX1032-NEXT:    buffer_gl0_inv
2372; GFX1032-NEXT:  .LBB9_2:
2373; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2374; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2375; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2376; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2377; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2378; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2379; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2380; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2381; GFX1032-NEXT:    s_endpgm
2382;
2383; GFX1164-LABEL: sub_i32_varying:
2384; GFX1164:       ; %bb.0: ; %entry
2385; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2386; GFX1164-NEXT:    s_not_b64 exec, exec
2387; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2388; GFX1164-NEXT:    s_not_b64 exec, exec
2389; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2390; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2391; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2392; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2393; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2394; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2395; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2396; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2397; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2398; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2399; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2400; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2401; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2402; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2403; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2404; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2405; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2406; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2407; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2408; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2409; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2410; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2411; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2412; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2413; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2414; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2415; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2416; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2417; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2418; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2419; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2420; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2421; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2422; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2423; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2424; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2425; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2426; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2427; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2428; GFX1164-NEXT:    s_mov_b32 s2, -1
2429; GFX1164-NEXT:    ; implicit-def: $vgpr0
2430; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2431; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2432; GFX1164-NEXT:  ; %bb.1:
2433; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2434; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2435; GFX1164-NEXT:    s_mov_b32 s3, s7
2436; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2437; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2438; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2439; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2440; GFX1164-NEXT:    buffer_gl0_inv
2441; GFX1164-NEXT:  .LBB9_2:
2442; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2443; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2444; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2445; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2446; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2447; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2448; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2449; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2450; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2451; GFX1164-NEXT:    s_endpgm
2452;
2453; GFX1132-LABEL: sub_i32_varying:
2454; GFX1132:       ; %bb.0: ; %entry
2455; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2456; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2457; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2458; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2459; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2460; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2461; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2462; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2463; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2464; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2465; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2466; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2467; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2468; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2469; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2470; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2471; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2472; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2473; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2474; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2475; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2476; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2477; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2478; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2479; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2480; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2481; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2482; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2483; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2484; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2485; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2486; GFX1132-NEXT:    s_mov_b32 s2, -1
2487; GFX1132-NEXT:    ; implicit-def: $vgpr0
2488; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2489; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2490; GFX1132-NEXT:  ; %bb.1:
2491; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2492; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2493; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2494; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2495; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2496; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX1132-NEXT:    buffer_gl0_inv
2498; GFX1132-NEXT:  .LBB9_2:
2499; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2500; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2501; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2502; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2503; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2504; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2505; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2507; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2508; GFX1132-NEXT:    s_endpgm
2509entry:
  ; The subtrahend is the workitem id, so each lane supplies its own value.
2510  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2511  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing %old keeps the atomic's returned value live.
2512  store i32 %old, i32 addrspace(1)* %out
2513  ret void
2514}
2515
; sub_i32_varying_nouse: the same per-lane `atomicrmw sub` on @local_var32 as the
; kernel above, but the kernel takes no arguments and the returned value is never
; used.
; In the assertions below every run uses the non-returning ds_sub_u32 form. The
; run without -mcpu applies it directly to the per-lane value. The other runs sum
; the lane values (DPP adds, plus permlane instructions on the gfx1010 and gfx1100
; runs) and issue the atomic only from the lane whose mbcnt result is 0, with no
; per-lane result rebuilt afterwards.
2516define amdgpu_kernel void @sub_i32_varying_nouse() {
2517; GFX7LESS-LABEL: sub_i32_varying_nouse:
2518; GFX7LESS:       ; %bb.0: ; %entry
2519; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2520; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2521; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2522; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
2523; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX7LESS-NEXT:    s_endpgm
2525;
2526; GFX8-LABEL: sub_i32_varying_nouse:
2527; GFX8:       ; %bb.0: ; %entry
2528; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2529; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2530; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2531; GFX8-NEXT:    s_not_b64 exec, exec
2532; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2533; GFX8-NEXT:    s_not_b64 exec, exec
2534; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
2535; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2536; GFX8-NEXT:    s_nop 1
2537; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2538; GFX8-NEXT:    s_nop 1
2539; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2540; GFX8-NEXT:    s_nop 1
2541; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2542; GFX8-NEXT:    s_nop 1
2543; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2544; GFX8-NEXT:    s_nop 1
2545; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2546; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
2547; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
2548; GFX8-NEXT:    s_mov_b32 s0, s2
2549; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2550; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2551; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2552; GFX8-NEXT:  ; %bb.1:
2553; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2554; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2555; GFX8-NEXT:    s_mov_b32 m0, -1
2556; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2557; GFX8-NEXT:    ds_sub_u32 v0, v2
2558; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2559; GFX8-NEXT:  .LBB10_2:
2560; GFX8-NEXT:    s_endpgm
2561;
2562; GFX9-LABEL: sub_i32_varying_nouse:
2563; GFX9:       ; %bb.0: ; %entry
2564; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2565; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2566; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2567; GFX9-NEXT:    s_not_b64 exec, exec
2568; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2569; GFX9-NEXT:    s_not_b64 exec, exec
2570; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
2571; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2572; GFX9-NEXT:    s_nop 1
2573; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2574; GFX9-NEXT:    s_nop 1
2575; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2576; GFX9-NEXT:    s_nop 1
2577; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2578; GFX9-NEXT:    s_nop 1
2579; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2580; GFX9-NEXT:    s_nop 1
2581; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2582; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
2583; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
2584; GFX9-NEXT:    s_mov_b32 s0, s2
2585; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2586; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2587; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2588; GFX9-NEXT:  ; %bb.1:
2589; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2590; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2591; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2592; GFX9-NEXT:    ds_sub_u32 v0, v2
2593; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2594; GFX9-NEXT:  .LBB10_2:
2595; GFX9-NEXT:    s_endpgm
2596;
2597; GFX1064-LABEL: sub_i32_varying_nouse:
2598; GFX1064:       ; %bb.0: ; %entry
2599; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2600; GFX1064-NEXT:    s_not_b64 exec, exec
2601; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2602; GFX1064-NEXT:    s_not_b64 exec, exec
2603; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2604; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2605; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2606; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2607; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2608; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2609; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2610; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2611; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2612; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2613; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2614; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
2615; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
2616; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2617; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2618; GFX1064-NEXT:    s_add_i32 s0, s2, s3
2619; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2620; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2621; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2622; GFX1064-NEXT:  ; %bb.1:
2623; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2624; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
2625; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2626; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2627; GFX1064-NEXT:    ds_sub_u32 v0, v3
2628; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2629; GFX1064-NEXT:    buffer_gl0_inv
2630; GFX1064-NEXT:  .LBB10_2:
2631; GFX1064-NEXT:    s_endpgm
2632;
2633; GFX1032-LABEL: sub_i32_varying_nouse:
2634; GFX1032:       ; %bb.0: ; %entry
2635; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2636; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2637; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2638; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2639; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
2640; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2641; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2642; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2643; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2644; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2645; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2646; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2647; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
2648; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2649; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2650; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2651; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2652; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2653; GFX1032-NEXT:  ; %bb.1:
2654; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2655; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2656; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2657; GFX1032-NEXT:    ds_sub_u32 v3, v0
2658; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX1032-NEXT:    buffer_gl0_inv
2660; GFX1032-NEXT:  .LBB10_2:
2661; GFX1032-NEXT:    s_endpgm
2662;
2663; GFX1164-LABEL: sub_i32_varying_nouse:
2664; GFX1164:       ; %bb.0: ; %entry
2665; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2666; GFX1164-NEXT:    s_not_b64 exec, exec
2667; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2668; GFX1164-NEXT:    s_not_b64 exec, exec
2669; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2670; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2671; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2672; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2673; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2674; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2675; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2676; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2677; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2678; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2679; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2680; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2681; GFX1164-NEXT:    v_permlane64_b32 v2, v1
2682; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2683; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2684; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2685; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2686; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2687; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2688; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
2689; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
2690; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2691; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
2692; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2693; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
2694; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2695; GFX1164-NEXT:  ; %bb.1:
2696; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2697; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2698; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2699; GFX1164-NEXT:    ds_sub_u32 v3, v0
2700; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2701; GFX1164-NEXT:    buffer_gl0_inv
2702; GFX1164-NEXT:  .LBB10_2:
2703; GFX1164-NEXT:    s_endpgm
2704;
2705; GFX1132-LABEL: sub_i32_varying_nouse:
2706; GFX1132:       ; %bb.0: ; %entry
2707; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2708; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2709; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2710; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2711; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
2712; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2713; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2714; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2715; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2716; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2717; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2718; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2719; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2720; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2721; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2722; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2723; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
2724; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2725; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2726; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
2727; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
2728; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
2729; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
2730; GFX1132-NEXT:  ; %bb.1:
2731; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2732; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2733; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2734; GFX1132-NEXT:    ds_sub_u32 v3, v0
2735; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2736; GFX1132-NEXT:    buffer_gl0_inv
2737; GFX1132-NEXT:  .LBB10_2:
2738; GFX1132-NEXT:    s_endpgm
2739entry:
  ; The subtrahend is the workitem id, so each lane supplies its own value.
2740  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is deliberately left without a use; that is what this kernel tests.
2741  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2742  ret void
2743}
2744
; sub_i64_constant
; A 64-bit atomicrmw sub of the constant 5 on @local_var64 (addrspace 3), with
; the returned old value stored to %out.
; The expected output below has the same shape for every run line. The rank of
; each lane among the active lanes is computed with v_mbcnt, only the lane
; whose rank is 0 issues one ds_sub_rtn_u64 of 5 times the active-lane count
; (s_bcnt1 followed by s_mul_i32), and afterwards every lane subtracts 5 times
; its own rank from the value broadcast by v_readfirstlane.
; The wave32 runs use only v_mbcnt_lo and the 32-bit form of s_bcnt1.
2745define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2746;
2747;
2748; GFX7LESS-LABEL: sub_i64_constant:
2749; GFX7LESS:       ; %bb.0: ; %entry
2750; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2751; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2752; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2753; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
2754; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2755; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2756; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2757; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
2758; GFX7LESS-NEXT:  ; %bb.1:
2759; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2760; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
2761; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2762; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
2763; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2764; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2765; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2766; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2767; GFX7LESS-NEXT:  .LBB11_2:
2768; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2769; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2770; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2771; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
2772; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2773; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2774; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2775; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2776; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2777; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2778; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2779; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2780; GFX7LESS-NEXT:    s_endpgm
2781;
2782; GFX8-LABEL: sub_i64_constant:
2783; GFX8:       ; %bb.0: ; %entry
2784; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2785; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2786; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2787; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2788; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2789; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2790; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2791; GFX8-NEXT:    s_cbranch_execz .LBB11_2
2792; GFX8-NEXT:  ; %bb.1:
2793; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2794; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2795; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2796; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2797; GFX8-NEXT:    s_mov_b32 m0, -1
2798; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2799; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2800; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2801; GFX8-NEXT:  .LBB11_2:
2802; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2803; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2804; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2805; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2806; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2807; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2808; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2809; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2810; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2811; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2812; GFX8-NEXT:    s_mov_b32 s2, -1
2813; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2814; GFX8-NEXT:    s_endpgm
2815;
2816; GFX9-LABEL: sub_i64_constant:
2817; GFX9:       ; %bb.0: ; %entry
2818; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2819; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2820; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2821; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2822; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2823; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2824; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2825; GFX9-NEXT:    s_cbranch_execz .LBB11_2
2826; GFX9-NEXT:  ; %bb.1:
2827; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2828; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2829; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2830; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2831; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2832; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2833; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2834; GFX9-NEXT:  .LBB11_2:
2835; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2836; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2838; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2839; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2840; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2841; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2842; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2843; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2844; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2845; GFX9-NEXT:    s_mov_b32 s2, -1
2846; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2847; GFX9-NEXT:    s_endpgm
2848;
2849; GFX1064-LABEL: sub_i64_constant:
2850; GFX1064:       ; %bb.0: ; %entry
2851; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2852; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2853; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2854; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2855; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2856; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2857; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2858; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
2859; GFX1064-NEXT:  ; %bb.1:
2860; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2861; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2862; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2863; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2864; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2865; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2866; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2867; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2868; GFX1064-NEXT:    buffer_gl0_inv
2869; GFX1064-NEXT:  .LBB11_2:
2870; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2871; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2872; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2873; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2874; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2875; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2876; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2877; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2878; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2879; GFX1064-NEXT:    s_mov_b32 s2, -1
2880; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2881; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2882; GFX1064-NEXT:    s_endpgm
2883;
2884; GFX1032-LABEL: sub_i64_constant:
2885; GFX1032:       ; %bb.0: ; %entry
2886; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2887; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2888; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2889; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2890; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2891; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2892; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2893; GFX1032-NEXT:  ; %bb.1:
2894; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2895; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2896; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2897; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2898; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2899; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2900; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2901; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2902; GFX1032-NEXT:    buffer_gl0_inv
2903; GFX1032-NEXT:  .LBB11_2:
2904; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2905; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2906; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2907; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2908; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2909; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2910; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2911; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2912; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2913; GFX1032-NEXT:    s_mov_b32 s2, -1
2914; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2915; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2916; GFX1032-NEXT:    s_endpgm
2917;
2918; GFX1164-LABEL: sub_i64_constant:
2919; GFX1164:       ; %bb.0: ; %entry
2920; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2921; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2922; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2923; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2924; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2925; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2926; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2927; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2928; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
2929; GFX1164-NEXT:  ; %bb.1:
2930; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2931; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2932; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
2933; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2934; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2935; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2936; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2937; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2938; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2939; GFX1164-NEXT:    buffer_gl0_inv
2940; GFX1164-NEXT:  .LBB11_2:
2941; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
2942; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2943; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2944; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2945; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2946; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2947; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2948; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2949; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2950; GFX1164-NEXT:    s_mov_b32 s2, -1
2951; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2952; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2953; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2954; GFX1164-NEXT:    s_endpgm
2955;
2956; GFX1132-LABEL: sub_i64_constant:
2957; GFX1132:       ; %bb.0: ; %entry
2958; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2959; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
2960; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2961; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2962; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2963; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2964; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2965; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
2966; GFX1132-NEXT:  ; %bb.1:
2967; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
2968; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2969; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
2970; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2971; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
2972; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2973; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2974; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2975; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2976; GFX1132-NEXT:    buffer_gl0_inv
2977; GFX1132-NEXT:  .LBB11_2:
2978; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2979; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2980; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2981; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2982; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2983; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2984; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2985; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2986; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2987; GFX1132-NEXT:    s_mov_b32 s2, -1
2988; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2989; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2990; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2991; GFX1132-NEXT:    s_endpgm
; IR under test. %old is stored to %out, so the result of the atomic is live
; and the expected output above uses the returning ds_sub_rtn_u64 form.
2992entry:
2993  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2994  store i64 %old, i64 addrspace(1)* %out
2995  ret void
2996}
2997
; sub_i64_uniform
; A 64-bit atomicrmw sub on @local_var64 (addrspace 3) whose operand is the
; kernel argument %subitive, with the returned old value stored to %out.
; The expected output below has the same shape for every run line. The rank of
; each lane among the active lanes is computed with v_mbcnt and only the lane
; whose rank is 0 enters the branch. Inside the branch the 64-bit operand is
; multiplied by the active-lane count (s_bcnt1) and a single ds_sub_rtn_u64 is
; issued. After the branch each lane multiplies the operand by its own rank
; and subtracts that product from the value broadcast by v_readfirstlane.
; The multiply is expanded differently per target (v_mul_hi_u32, v_mad_u64_u32
; or scalar s_mul_i32 plus s_mul_hi_u32), as the check lines show.
2998define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2999;
3000;
3001; GFX7LESS-LABEL: sub_i64_uniform:
3002; GFX7LESS:       ; %bb.0: ; %entry
3003; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
3004; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3005; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
3006; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
3007; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3008; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3009; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3010; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
3011; GFX7LESS-NEXT:  ; %bb.1:
3012; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3013; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
3014; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3015; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
3016; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
3017; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
3018; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
3019; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
3020; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
3021; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3022; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3023; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3024; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3025; GFX7LESS-NEXT:  .LBB12_2:
3026; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
3027; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
3028; GFX7LESS-NEXT:    s_mov_b32 s6, -1
3029; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3030; GFX7LESS-NEXT:    s_mov_b32 s4, s0
3031; GFX7LESS-NEXT:    s_mov_b32 s5, s1
3032; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
3033; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
3034; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
3035; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
3036; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
3037; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
3038; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
3039; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
3040; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
3041; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3042; GFX7LESS-NEXT:    s_endpgm
3043;
3044; GFX8-LABEL: sub_i64_uniform:
3045; GFX8:       ; %bb.0: ; %entry
3046; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3047; GFX8-NEXT:    s_mov_b64 s[6:7], exec
3048; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3049; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3050; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3051; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3052; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3053; GFX8-NEXT:    s_cbranch_execz .LBB12_2
3054; GFX8-NEXT:  ; %bb.1:
3055; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
3056; GFX8-NEXT:    v_mov_b32_e32 v0, s8
3057; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3058; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
3059; GFX8-NEXT:    s_mul_i32 s6, s3, s8
3060; GFX8-NEXT:    v_mov_b32_e32 v3, 0
3061; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
3062; GFX8-NEXT:    s_mov_b32 m0, -1
3063; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3064; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3065; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3066; GFX8-NEXT:  .LBB12_2:
3067; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3068; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3069; GFX8-NEXT:    s_mov_b32 s4, s0
3070; GFX8-NEXT:    s_mov_b32 s5, s1
3071; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
3072; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
3073; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
3074; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
3075; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
3076; GFX8-NEXT:    v_mov_b32_e32 v3, s1
3077; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
3078; GFX8-NEXT:    s_mov_b32 s7, 0xf000
3079; GFX8-NEXT:    s_mov_b32 s6, -1
3080; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
3081; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3082; GFX8-NEXT:    s_endpgm
3083;
3084; GFX9-LABEL: sub_i64_uniform:
3085; GFX9:       ; %bb.0: ; %entry
3086; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3087; GFX9-NEXT:    s_mov_b64 s[6:7], exec
3088; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3089; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3090; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3091; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3092; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3093; GFX9-NEXT:    s_cbranch_execz .LBB12_2
3094; GFX9-NEXT:  ; %bb.1:
3095; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3096; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3097; GFX9-NEXT:    s_mul_i32 s7, s3, s6
3098; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
3099; GFX9-NEXT:    s_add_i32 s8, s8, s7
3100; GFX9-NEXT:    s_mul_i32 s6, s2, s6
3101; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3102; GFX9-NEXT:    v_mov_b32_e32 v1, s8
3103; GFX9-NEXT:    v_mov_b32_e32 v3, 0
3104; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3105; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3106; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3107; GFX9-NEXT:  .LBB12_2:
3108; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3109; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3111; GFX9-NEXT:    s_mov_b32 s4, s0
3112; GFX9-NEXT:    s_mov_b32 s5, s1
3113; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3114; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3115; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3116; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3117; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3118; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
3119; GFX9-NEXT:    s_mov_b32 s7, 0xf000
3120; GFX9-NEXT:    s_mov_b32 s6, -1
3121; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
3122; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3123; GFX9-NEXT:    s_endpgm
3124;
3125; GFX1064-LABEL: sub_i64_uniform:
3126; GFX1064:       ; %bb.0: ; %entry
3127; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3128; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3129; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3130; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3131; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3132; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3133; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3134; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
3135; GFX1064-NEXT:  ; %bb.1:
3136; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3137; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3138; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3139; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
3140; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
3141; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
3142; GFX1064-NEXT:    s_add_i32 s8, s8, s7
3143; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
3144; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
3145; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3146; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3147; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3148; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3149; GFX1064-NEXT:    buffer_gl0_inv
3150; GFX1064-NEXT:  .LBB12_2:
3151; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3152; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3153; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3155; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
3156; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3157; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3158; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3159; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3160; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
3161; GFX1064-NEXT:    s_mov_b32 s2, -1
3162; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3163; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3164; GFX1064-NEXT:    s_endpgm
3165;
3166; GFX1032-LABEL: sub_i64_uniform:
3167; GFX1032:       ; %bb.0: ; %entry
3168; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3169; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
3170; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3171; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3172; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
3173; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3174; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
3175; GFX1032-NEXT:  ; %bb.1:
3176; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
3177; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3178; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3179; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
3180; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
3181; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
3182; GFX1032-NEXT:    s_add_i32 s7, s7, s6
3183; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
3184; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
3185; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3186; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3187; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3188; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3189; GFX1032-NEXT:    buffer_gl0_inv
3190; GFX1032-NEXT:  .LBB12_2:
3191; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3192; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3193; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3194; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
3195; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
3196; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
3197; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3198; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3199; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3200; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
3201; GFX1032-NEXT:    s_mov_b32 s2, -1
3202; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3203; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3204; GFX1032-NEXT:    s_endpgm
3205;
3206; GFX1164-LABEL: sub_i64_uniform:
3207; GFX1164:       ; %bb.0: ; %entry
3208; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3209; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3210; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3211; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3212; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3213; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3214; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
3215; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
3216; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
3217; GFX1164-NEXT:  ; %bb.1:
3218; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3219; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3220; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3221; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
3222; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
3223; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
3224; GFX1164-NEXT:    s_add_i32 s8, s8, s7
3225; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
3226; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
3227; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3228; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3229; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3230; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3231; GFX1164-NEXT:    buffer_gl0_inv
3232; GFX1164-NEXT:  .LBB12_2:
3233; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3234; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3235; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3236; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
3237; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
3238; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
3239; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3240; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3241; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3242; GFX1164-NEXT:    s_mov_b32 s2, -1
3243; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3244; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3245; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3246; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3247; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3248; GFX1164-NEXT:    s_endpgm
3249;
3250; GFX1132-LABEL: sub_i64_uniform:
3251; GFX1132:       ; %bb.0: ; %entry
3252; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3253; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
3254; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3255; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3256; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3257; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3258; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3259; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
3260; GFX1132-NEXT:  ; %bb.1:
3261; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
3262; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3263; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3264; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
3265; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
3266; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
3267; GFX1132-NEXT:    s_add_i32 s7, s7, s6
3268; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
3269; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
3270; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3271; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3272; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3273; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3274; GFX1132-NEXT:    buffer_gl0_inv
3275; GFX1132-NEXT:  .LBB12_2:
3276; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3277; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3278; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3279; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
3280; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
3281; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3282; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3283; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3284; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3285; GFX1132-NEXT:    s_mov_b32 s2, -1
3286; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3287; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3288; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3289; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3290; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3291; GFX1132-NEXT:    s_endpgm
; IR under test. %subitive is a kernel argument and %old is stored to %out,
; so the expected output above uses the returning ds_sub_rtn_u64 form.
3292entry:
3293  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
3294  store i64 %old, i64 addrspace(1)* %out
3295  ret void
3296}
3297
; sub_i64_varying
; A 64-bit atomicrmw sub on @local_var64 (addrspace 3) whose operand is the
; zero-extended workitem id, with the returned old value stored to %out.
; The expected output for every run line is a single plain ds_sub_rtn_u64 with
; no v_mbcnt rank computation, no exec-mask branch and no v_readfirstlane, so
; in these checks the operation appears to be left unoptimized.
; The GFX10 and GFX11 runs are checked through their common prefixes only,
; presumably because the wave32 and wave64 outputs are identical here.
3298define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
3299;
3300;
3301; GFX7LESS-LABEL: sub_i64_varying:
3302; GFX7LESS:       ; %bb.0: ; %entry
3303; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3304; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3305; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3306; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3307; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3308; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3310; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3311; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3312; GFX7LESS-NEXT:    s_endpgm
3313;
3314; GFX8-LABEL: sub_i64_varying:
3315; GFX8:       ; %bb.0: ; %entry
3316; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3317; GFX8-NEXT:    s_mov_b32 m0, -1
3318; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3319; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3320; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3322; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3323; GFX8-NEXT:    s_mov_b32 s2, -1
3324; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3325; GFX8-NEXT:    s_endpgm
3326;
3327; GFX9-LABEL: sub_i64_varying:
3328; GFX9:       ; %bb.0: ; %entry
3329; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3330; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3331; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3332; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3334; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3335; GFX9-NEXT:    s_mov_b32 s2, -1
3336; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3337; GFX9-NEXT:    s_endpgm
3338;
3339; GFX10-LABEL: sub_i64_varying:
3340; GFX10:       ; %bb.0: ; %entry
3341; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3342; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3343; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
3344; GFX10-NEXT:    s_mov_b32 s2, -1
3345; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3346; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3347; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3348; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3349; GFX10-NEXT:    buffer_gl0_inv
3350; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3351; GFX10-NEXT:    s_endpgm
3352;
3353; GFX11-LABEL: sub_i64_varying:
3354; GFX11:       ; %bb.0: ; %entry
3355; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3356; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3357; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
3358; GFX11-NEXT:    s_mov_b32 s2, -1
3359; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3360; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3361; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3362; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3363; GFX11-NEXT:    buffer_gl0_inv
3364; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3365; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3366; GFX11-NEXT:    s_endpgm
; IR under test. The 32-bit workitem id is zero-extended to i64 and used as
; the atomic operand, and %old is stored to %out.
3367entry:
3368  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3369  %zext = zext i32 %lane to i64
3370  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
3371  store i64 %old, i64 addrspace(1)* %out
3372  ret void
3373}
3374
3375define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3376;
3377;
3378; GFX7LESS-LABEL: and_i32_varying:
3379; GFX7LESS:       ; %bb.0: ; %entry
3380; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3381; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3382; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3383; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3384; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3385; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3386; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3387; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3388; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3389; GFX7LESS-NEXT:    s_endpgm
3390;
3391; GFX8-LABEL: and_i32_varying:
3392; GFX8:       ; %bb.0: ; %entry
3393; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3394; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3395; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3396; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3397; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3398; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3399; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3400; GFX8-NEXT:    s_not_b64 exec, exec
3401; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3402; GFX8-NEXT:    s_not_b64 exec, exec
3403; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3404; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3405; GFX8-NEXT:    s_nop 1
3406; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3407; GFX8-NEXT:    s_nop 1
3408; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3409; GFX8-NEXT:    s_nop 1
3410; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3411; GFX8-NEXT:    s_nop 1
3412; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3413; GFX8-NEXT:    s_nop 1
3414; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3415; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3416; GFX8-NEXT:    s_nop 0
3417; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3418; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3419; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3420; GFX8-NEXT:    ; implicit-def: $vgpr0
3421; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3422; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3423; GFX8-NEXT:  ; %bb.1:
3424; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3425; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3426; GFX8-NEXT:    s_mov_b32 m0, -1
3427; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3428; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3429; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3430; GFX8-NEXT:  .LBB14_2:
3431; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3432; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3433; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3434; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3435; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3436; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3437; GFX8-NEXT:    s_mov_b32 s2, -1
3438; GFX8-NEXT:    s_nop 0
3439; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3440; GFX8-NEXT:    s_endpgm
3441;
3442; GFX9-LABEL: and_i32_varying:
3443; GFX9:       ; %bb.0: ; %entry
3444; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3445; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3446; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3447; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3448; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3449; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3450; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3451; GFX9-NEXT:    s_not_b64 exec, exec
3452; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3453; GFX9-NEXT:    s_not_b64 exec, exec
3454; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3455; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3456; GFX9-NEXT:    s_nop 1
3457; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3458; GFX9-NEXT:    s_nop 1
3459; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3460; GFX9-NEXT:    s_nop 1
3461; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3462; GFX9-NEXT:    s_nop 1
3463; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3464; GFX9-NEXT:    s_nop 1
3465; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3466; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3467; GFX9-NEXT:    s_nop 0
3468; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3469; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3470; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3471; GFX9-NEXT:    ; implicit-def: $vgpr0
3472; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3473; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3474; GFX9-NEXT:  ; %bb.1:
3475; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3476; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3478; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3479; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3480; GFX9-NEXT:  .LBB14_2:
3481; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3483; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3484; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3485; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3486; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3487; GFX9-NEXT:    s_mov_b32 s2, -1
3488; GFX9-NEXT:    s_nop 0
3489; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3490; GFX9-NEXT:    s_endpgm
3491;
3492; GFX1064-LABEL: and_i32_varying:
3493; GFX1064:       ; %bb.0: ; %entry
3494; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3495; GFX1064-NEXT:    s_not_b64 exec, exec
3496; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3497; GFX1064-NEXT:    s_not_b64 exec, exec
3498; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3499; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3500; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3501; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3502; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3503; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3504; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3505; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3506; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3507; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3508; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3509; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3510; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3511; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3512; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3513; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3514; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3515; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3516; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3517; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3518; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3519; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3520; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3521; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3522; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3523; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3524; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3525; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3526; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3527; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3528; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3529; GFX1064-NEXT:    s_mov_b32 s2, -1
3530; GFX1064-NEXT:    ; implicit-def: $vgpr0
3531; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3532; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3533; GFX1064-NEXT:  ; %bb.1:
3534; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3535; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3536; GFX1064-NEXT:    s_mov_b32 s3, s7
3537; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3538; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3539; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3540; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3541; GFX1064-NEXT:    buffer_gl0_inv
3542; GFX1064-NEXT:  .LBB14_2:
3543; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3544; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3545; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3546; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3547; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3548; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3549; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3550; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3551; GFX1064-NEXT:    s_endpgm
3552;
3553; GFX1032-LABEL: and_i32_varying:
3554; GFX1032:       ; %bb.0: ; %entry
3555; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3556; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3557; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3558; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3559; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3560; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3561; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3562; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3563; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3564; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3565; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3566; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3567; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3568; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3569; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3570; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3571; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3572; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3573; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3574; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3575; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3576; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3577; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3578; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3579; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3580; GFX1032-NEXT:    s_mov_b32 s2, -1
3581; GFX1032-NEXT:    ; implicit-def: $vgpr0
3582; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3583; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3584; GFX1032-NEXT:  ; %bb.1:
3585; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3586; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3587; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3588; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3589; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3590; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3591; GFX1032-NEXT:    buffer_gl0_inv
3592; GFX1032-NEXT:  .LBB14_2:
3593; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3594; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3595; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3596; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3597; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3598; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3599; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3600; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3601; GFX1032-NEXT:    s_endpgm
3602;
3603; GFX1164-LABEL: and_i32_varying:
3604; GFX1164:       ; %bb.0: ; %entry
3605; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3606; GFX1164-NEXT:    s_not_b64 exec, exec
3607; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3608; GFX1164-NEXT:    s_not_b64 exec, exec
3609; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3610; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3611; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3612; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3613; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3614; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3615; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3616; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3617; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3618; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3619; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3620; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3621; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3622; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3623; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3624; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3625; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3626; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3627; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3628; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3629; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3630; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3631; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3632; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3633; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3634; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3635; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3636; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3637; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3638; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3639; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3640; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3641; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3642; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
3643; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3644; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3645; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3646; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3647; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3648; GFX1164-NEXT:    s_mov_b32 s2, -1
3649; GFX1164-NEXT:    ; implicit-def: $vgpr0
3650; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3651; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3652; GFX1164-NEXT:  ; %bb.1:
3653; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3654; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3655; GFX1164-NEXT:    s_mov_b32 s3, s7
3656; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3657; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3658; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3659; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3660; GFX1164-NEXT:    buffer_gl0_inv
3661; GFX1164-NEXT:  .LBB14_2:
3662; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3663; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3664; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3665; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3666; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3667; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3668; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3669; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3670; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3671; GFX1164-NEXT:    s_endpgm
3672;
3673; GFX1132-LABEL: and_i32_varying:
3674; GFX1132:       ; %bb.0: ; %entry
3675; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3676; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3677; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3678; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3679; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3680; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3681; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3682; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3683; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3684; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3685; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3686; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3687; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3688; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3689; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3690; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3691; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3692; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3693; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3694; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3695; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3696; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3697; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3698; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3699; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3700; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3701; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3702; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3703; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3704; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3705; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3706; GFX1132-NEXT:    s_mov_b32 s2, -1
3707; GFX1132-NEXT:    ; implicit-def: $vgpr0
3708; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3709; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3710; GFX1132-NEXT:  ; %bb.1:
3711; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3712; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3713; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3714; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3715; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3716; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3717; GFX1132-NEXT:    buffer_gl0_inv
3718; GFX1132-NEXT:  .LBB14_2:
3719; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3720; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3721; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3722; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3723; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3724; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3725; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3726; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3727; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3728; GFX1132-NEXT:    s_endpgm
3729entry:
3730  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3731  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3732  store i32 %old, i32 addrspace(1)* %out
3733  ret void
3734}
3735
3736define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3737;
3738;
3739; GFX7LESS-LABEL: or_i32_varying:
3740; GFX7LESS:       ; %bb.0: ; %entry
3741; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3742; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3743; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3744; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3745; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3746; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3747; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3748; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3749; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3750; GFX7LESS-NEXT:    s_endpgm
3751;
3752; GFX8-LABEL: or_i32_varying:
3753; GFX8:       ; %bb.0: ; %entry
3754; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3755; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3756; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3757; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3758; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3759; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3760; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3761; GFX8-NEXT:    s_not_b64 exec, exec
3762; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3763; GFX8-NEXT:    s_not_b64 exec, exec
3764; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3765; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3766; GFX8-NEXT:    s_nop 1
3767; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3768; GFX8-NEXT:    s_nop 1
3769; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3770; GFX8-NEXT:    s_nop 1
3771; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3772; GFX8-NEXT:    s_nop 1
3773; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3774; GFX8-NEXT:    s_nop 1
3775; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3776; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3777; GFX8-NEXT:    s_nop 0
3778; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3779; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3780; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3781; GFX8-NEXT:    ; implicit-def: $vgpr0
3782; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3783; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3784; GFX8-NEXT:  ; %bb.1:
3785; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3786; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3787; GFX8-NEXT:    s_mov_b32 m0, -1
3788; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3789; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3790; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3791; GFX8-NEXT:  .LBB15_2:
3792; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3793; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3794; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3795; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3796; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3797; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3798; GFX8-NEXT:    s_mov_b32 s2, -1
3799; GFX8-NEXT:    s_nop 0
3800; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3801; GFX8-NEXT:    s_endpgm
3802;
3803; GFX9-LABEL: or_i32_varying:
3804; GFX9:       ; %bb.0: ; %entry
3805; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3806; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3807; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3808; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3809; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3810; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3811; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3812; GFX9-NEXT:    s_not_b64 exec, exec
3813; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3814; GFX9-NEXT:    s_not_b64 exec, exec
3815; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3816; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3817; GFX9-NEXT:    s_nop 1
3818; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3819; GFX9-NEXT:    s_nop 1
3820; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3821; GFX9-NEXT:    s_nop 1
3822; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3823; GFX9-NEXT:    s_nop 1
3824; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3825; GFX9-NEXT:    s_nop 1
3826; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3827; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3828; GFX9-NEXT:    s_nop 0
3829; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3830; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3831; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3832; GFX9-NEXT:    ; implicit-def: $vgpr0
3833; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3834; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3835; GFX9-NEXT:  ; %bb.1:
3836; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3837; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3838; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3839; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3840; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3841; GFX9-NEXT:  .LBB15_2:
3842; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3844; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3845; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3846; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3847; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3848; GFX9-NEXT:    s_mov_b32 s2, -1
3849; GFX9-NEXT:    s_nop 0
3850; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3851; GFX9-NEXT:    s_endpgm
3852;
3853; GFX1064-LABEL: or_i32_varying:
3854; GFX1064:       ; %bb.0: ; %entry
3855; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3856; GFX1064-NEXT:    s_not_b64 exec, exec
3857; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3858; GFX1064-NEXT:    s_not_b64 exec, exec
3859; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3860; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3861; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3862; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3863; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3864; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3865; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3866; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3867; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3868; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3869; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3870; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3871; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3872; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3873; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3874; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3875; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3876; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3877; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3878; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3879; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3880; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3881; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3882; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3883; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3884; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3885; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3886; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3887; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3888; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3889; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3890; GFX1064-NEXT:    s_mov_b32 s2, -1
3891; GFX1064-NEXT:    ; implicit-def: $vgpr0
3892; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3893; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3894; GFX1064-NEXT:  ; %bb.1:
3895; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3896; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3897; GFX1064-NEXT:    s_mov_b32 s3, s7
3898; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3899; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3900; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3901; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3902; GFX1064-NEXT:    buffer_gl0_inv
3903; GFX1064-NEXT:  .LBB15_2:
3904; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3905; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3906; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3907; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3908; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3909; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3910; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3911; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3912; GFX1064-NEXT:    s_endpgm
3913;
3914; GFX1032-LABEL: or_i32_varying:
3915; GFX1032:       ; %bb.0: ; %entry
3916; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3917; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3918; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3919; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3920; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3921; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3922; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3923; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3924; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3925; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3926; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3927; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3928; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3929; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3930; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3931; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3932; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3933; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3934; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3935; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3936; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3937; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3938; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3939; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3940; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3941; GFX1032-NEXT:    s_mov_b32 s2, -1
3942; GFX1032-NEXT:    ; implicit-def: $vgpr0
3943; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3944; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3945; GFX1032-NEXT:  ; %bb.1:
3946; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3947; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3948; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3949; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3950; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3951; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3952; GFX1032-NEXT:    buffer_gl0_inv
3953; GFX1032-NEXT:  .LBB15_2:
3954; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3955; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3956; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3957; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3958; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3959; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3960; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3961; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3962; GFX1032-NEXT:    s_endpgm
3963;
3964; GFX1164-LABEL: or_i32_varying:
3965; GFX1164:       ; %bb.0: ; %entry
3966; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3967; GFX1164-NEXT:    s_not_b64 exec, exec
3968; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3969; GFX1164-NEXT:    s_not_b64 exec, exec
3970; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3971; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3972; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3973; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3974; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3975; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3976; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3977; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3978; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3979; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3980; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3981; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3982; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3983; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3984; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3985; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3986; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3987; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3988; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3989; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3990; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3991; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3992; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3993; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3994; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3995; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3996; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3997; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3998; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3999; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4000; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4001; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4002; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4003; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4004; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4005; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4006; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4007; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4008; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4009; GFX1164-NEXT:    s_mov_b32 s2, -1
4010; GFX1164-NEXT:    ; implicit-def: $vgpr0
4011; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4012; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
4013; GFX1164-NEXT:  ; %bb.1:
4014; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4015; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4016; GFX1164-NEXT:    s_mov_b32 s3, s7
4017; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4018; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4019; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
4020; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4021; GFX1164-NEXT:    buffer_gl0_inv
4022; GFX1164-NEXT:  .LBB15_2:
4023; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4024; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4025; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4026; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4027; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
4028; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4029; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4030; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4031; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4032; GFX1164-NEXT:    s_endpgm
4033;
4034; GFX1132-LABEL: or_i32_varying:
4035; GFX1132:       ; %bb.0: ; %entry
4036; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4037; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4038; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4039; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4040; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4041; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4042; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4043; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4044; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4045; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4046; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4047; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4048; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4049; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4050; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4051; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4052; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4053; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4054; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4055; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4056; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4057; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4058; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4059; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4060; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4061; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4062; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4063; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4064; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4065; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4066; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4067; GFX1132-NEXT:    s_mov_b32 s2, -1
4068; GFX1132-NEXT:    ; implicit-def: $vgpr0
4069; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4070; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
4071; GFX1132-NEXT:  ; %bb.1:
4072; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4073; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4074; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4075; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4076; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
4077; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4078; GFX1132-NEXT:    buffer_gl0_inv
4079; GFX1132-NEXT:  .LBB15_2:
4080; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4081; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4082; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4083; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4084; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
4085; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4086; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4087; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4088; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4089; GFX1132-NEXT:    s_endpgm
4090entry:
4091  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4092  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4093  store i32 %old, i32 addrspace(1)* %out
4094  ret void
4095}
4096
4097define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
4098; xor_i32_varying: atomicrmw xor on the addrspace(3) global @local_var32 with the x workitem id as operand; the returned old value is stored to %out.
4099; In the expectations below, the first target issues ds_xor_rtn_b32 directly on v0; the remaining targets first combine lanes with v_xor_b32_dpp and issue a single ds_xor_rtn_b32 under s_and_saveexec.
4100; GFX7LESS-LABEL: xor_i32_varying:
4101; GFX7LESS:       ; %bb.0: ; %entry
4102; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4103; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4104; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4105; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4106; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
4107; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4108; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4109; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4110; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4111; GFX7LESS-NEXT:    s_endpgm
4112;
4113; GFX8-LABEL: xor_i32_varying:
4114; GFX8:       ; %bb.0: ; %entry
4115; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4116; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4117; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4118; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4119; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4120; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4121; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4122; GFX8-NEXT:    s_not_b64 exec, exec
4123; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4124; GFX8-NEXT:    s_not_b64 exec, exec
4125; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4126; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4127; GFX8-NEXT:    s_nop 1
4128; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4129; GFX8-NEXT:    s_nop 1
4130; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4131; GFX8-NEXT:    s_nop 1
4132; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4133; GFX8-NEXT:    s_nop 1
4134; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4135; GFX8-NEXT:    s_nop 1
4136; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4137; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4138; GFX8-NEXT:    s_nop 0
4139; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4140; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4141; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4142; GFX8-NEXT:    ; implicit-def: $vgpr0
4143; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4144; GFX8-NEXT:    s_cbranch_execz .LBB16_2
4145; GFX8-NEXT:  ; %bb.1:
4146; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4147; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4148; GFX8-NEXT:    s_mov_b32 m0, -1
4149; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4150; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4151; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4152; GFX8-NEXT:  .LBB16_2:
4153; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4154; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4155; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4156; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4157; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
4158; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4159; GFX8-NEXT:    s_mov_b32 s2, -1
4160; GFX8-NEXT:    s_nop 0
4161; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4162; GFX8-NEXT:    s_endpgm
4163;
4164; GFX9-LABEL: xor_i32_varying:
4165; GFX9:       ; %bb.0: ; %entry
4166; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4167; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4168; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4169; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4170; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4171; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4172; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4173; GFX9-NEXT:    s_not_b64 exec, exec
4174; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4175; GFX9-NEXT:    s_not_b64 exec, exec
4176; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4177; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4178; GFX9-NEXT:    s_nop 1
4179; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4180; GFX9-NEXT:    s_nop 1
4181; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4182; GFX9-NEXT:    s_nop 1
4183; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4184; GFX9-NEXT:    s_nop 1
4185; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4186; GFX9-NEXT:    s_nop 1
4187; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4188; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4189; GFX9-NEXT:    s_nop 0
4190; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4191; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4192; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4193; GFX9-NEXT:    ; implicit-def: $vgpr0
4194; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4195; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4196; GFX9-NEXT:  ; %bb.1:
4197; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4198; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4199; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4200; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4201; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4202; GFX9-NEXT:  .LBB16_2:
4203; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4204; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4205; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4206; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4207; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4208; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4209; GFX9-NEXT:    s_mov_b32 s2, -1
4210; GFX9-NEXT:    s_nop 0
4211; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4212; GFX9-NEXT:    s_endpgm
4213;
4214; GFX1064-LABEL: xor_i32_varying:
4215; GFX1064:       ; %bb.0: ; %entry
4216; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4217; GFX1064-NEXT:    s_not_b64 exec, exec
4218; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4219; GFX1064-NEXT:    s_not_b64 exec, exec
4220; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4221; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4222; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4223; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4224; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4225; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4226; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4227; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4228; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4229; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4230; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4231; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4232; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4233; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4234; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4235; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4236; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4237; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4238; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4239; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4240; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4241; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4242; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4243; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4244; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4245; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4246; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4247; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4248; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4249; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4250; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4251; GFX1064-NEXT:    s_mov_b32 s2, -1
4252; GFX1064-NEXT:    ; implicit-def: $vgpr0
4253; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4254; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4255; GFX1064-NEXT:  ; %bb.1:
4256; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4257; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4258; GFX1064-NEXT:    s_mov_b32 s3, s7
4259; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4260; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4261; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4262; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4263; GFX1064-NEXT:    buffer_gl0_inv
4264; GFX1064-NEXT:  .LBB16_2:
4265; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4266; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4267; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4268; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4269; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4270; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4271; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4272; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4273; GFX1064-NEXT:    s_endpgm
4274;
4275; GFX1032-LABEL: xor_i32_varying:
4276; GFX1032:       ; %bb.0: ; %entry
4277; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4278; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4279; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4280; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4281; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4282; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4283; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4284; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4285; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4286; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4287; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4288; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4289; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4290; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4291; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4292; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4293; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4294; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4295; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4296; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4297; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4298; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4299; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4300; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4301; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4302; GFX1032-NEXT:    s_mov_b32 s2, -1
4303; GFX1032-NEXT:    ; implicit-def: $vgpr0
4304; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4305; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4306; GFX1032-NEXT:  ; %bb.1:
4307; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4308; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4309; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4310; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4311; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4312; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4313; GFX1032-NEXT:    buffer_gl0_inv
4314; GFX1032-NEXT:  .LBB16_2:
4315; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4316; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4317; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4318; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4319; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4320; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4321; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4322; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4323; GFX1032-NEXT:    s_endpgm
4324;
4325; GFX1164-LABEL: xor_i32_varying:
4326; GFX1164:       ; %bb.0: ; %entry
4327; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4328; GFX1164-NEXT:    s_not_b64 exec, exec
4329; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4330; GFX1164-NEXT:    s_not_b64 exec, exec
4331; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4332; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4333; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4334; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4335; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4336; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4337; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4338; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4339; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4340; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4341; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4342; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4343; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4344; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4345; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4346; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4347; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4348; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4349; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4350; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4351; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4352; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4353; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4354; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4355; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4356; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4357; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4358; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4359; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4360; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4361; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4362; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4363; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4364; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4365; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4366; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4367; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4368; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4369; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4370; GFX1164-NEXT:    s_mov_b32 s2, -1
4371; GFX1164-NEXT:    ; implicit-def: $vgpr0
4372; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4373; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4374; GFX1164-NEXT:  ; %bb.1:
4375; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4376; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4377; GFX1164-NEXT:    s_mov_b32 s3, s7
4378; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4379; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4380; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4381; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4382; GFX1164-NEXT:    buffer_gl0_inv
4383; GFX1164-NEXT:  .LBB16_2:
4384; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4385; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4386; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4387; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4388; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4389; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4390; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4391; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4392; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4393; GFX1164-NEXT:    s_endpgm
4394;
4395; GFX1132-LABEL: xor_i32_varying:
4396; GFX1132:       ; %bb.0: ; %entry
4397; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4398; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4399; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4400; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4401; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4402; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4403; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4404; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4405; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4406; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4407; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4408; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4409; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4410; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4411; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4412; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4413; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4414; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4415; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4416; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4417; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4418; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4419; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4420; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4421; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4422; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4423; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4424; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4425; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4426; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4427; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4428; GFX1132-NEXT:    s_mov_b32 s2, -1
4429; GFX1132-NEXT:    ; implicit-def: $vgpr0
4430; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4431; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4432; GFX1132-NEXT:  ; %bb.1:
4433; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4434; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4435; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4436; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4437; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4438; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4439; GFX1132-NEXT:    buffer_gl0_inv
4440; GFX1132-NEXT:  .LBB16_2:
4441; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4442; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4443; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4444; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4445; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4446; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4447; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4448; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4449; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4450; GFX1132-NEXT:    s_endpgm
4451entry:
  ; The atomic's operand is the x workitem id returned by the intrinsic (presumably distinct per lane, hence "varying").
4452  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel xor into the addrspace(3) global; %old receives the value the atomicrmw returns.
4453  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4454  store i32 %old, i32 addrspace(1)* %out
4455  ret void
4456}
4457
4458define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
4459;
4460;
4461; GFX7LESS-LABEL: max_i32_varying:
4462; GFX7LESS:       ; %bb.0: ; %entry
4463; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4464; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4465; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4466; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
4468; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4469; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4470; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4471; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4472; GFX7LESS-NEXT:    s_endpgm
4473;
4474; GFX8-LABEL: max_i32_varying:
4475; GFX8:       ; %bb.0: ; %entry
4476; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4477; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4478; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4479; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4480; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4481; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4482; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4483; GFX8-NEXT:    s_not_b64 exec, exec
4484; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
4485; GFX8-NEXT:    s_not_b64 exec, exec
4486; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4487; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4488; GFX8-NEXT:    s_nop 1
4489; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4490; GFX8-NEXT:    s_nop 1
4491; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4492; GFX8-NEXT:    s_nop 1
4493; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4494; GFX8-NEXT:    s_nop 1
4495; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4496; GFX8-NEXT:    s_nop 1
4497; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4498; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4499; GFX8-NEXT:    s_nop 0
4500; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4501; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4502; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4503; GFX8-NEXT:    ; implicit-def: $vgpr0
4504; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4505; GFX8-NEXT:    s_cbranch_execz .LBB17_2
4506; GFX8-NEXT:  ; %bb.1:
4507; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4508; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4509; GFX8-NEXT:    s_mov_b32 m0, -1
4510; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4511; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
4512; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4513; GFX8-NEXT:  .LBB17_2:
4514; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4515; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4516; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4517; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4518; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
4519; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4520; GFX8-NEXT:    s_mov_b32 s2, -1
4521; GFX8-NEXT:    s_nop 0
4522; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4523; GFX8-NEXT:    s_endpgm
4524;
4525; GFX9-LABEL: max_i32_varying:
4526; GFX9:       ; %bb.0: ; %entry
4527; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4528; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4529; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4530; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4531; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4532; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4533; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4534; GFX9-NEXT:    s_not_b64 exec, exec
4535; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
4536; GFX9-NEXT:    s_not_b64 exec, exec
4537; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4538; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4539; GFX9-NEXT:    s_nop 1
4540; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4541; GFX9-NEXT:    s_nop 1
4542; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4543; GFX9-NEXT:    s_nop 1
4544; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4545; GFX9-NEXT:    s_nop 1
4546; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4547; GFX9-NEXT:    s_nop 1
4548; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4549; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4550; GFX9-NEXT:    s_nop 0
4551; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4552; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4553; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4554; GFX9-NEXT:    ; implicit-def: $vgpr0
4555; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4556; GFX9-NEXT:    s_cbranch_execz .LBB17_2
4557; GFX9-NEXT:  ; %bb.1:
4558; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4559; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4560; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4561; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
4562; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4563; GFX9-NEXT:  .LBB17_2:
4564; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4565; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4566; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4567; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4568; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
4569; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4570; GFX9-NEXT:    s_mov_b32 s2, -1
4571; GFX9-NEXT:    s_nop 0
4572; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4573; GFX9-NEXT:    s_endpgm
4574;
4575; GFX1064-LABEL: max_i32_varying:
4576; GFX1064:       ; %bb.0: ; %entry
4577; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4578; GFX1064-NEXT:    s_not_b64 exec, exec
4579; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
4580; GFX1064-NEXT:    s_not_b64 exec, exec
4581; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4582; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4583; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
4584; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4585; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4586; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4587; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4588; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4589; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4590; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4591; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4592; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4593; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4594; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4595; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4596; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4597; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4598; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4599; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4600; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4601; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4602; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4603; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4604; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4605; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4606; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4607; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4608; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4609; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4610; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4611; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4612; GFX1064-NEXT:    s_mov_b32 s2, -1
4613; GFX1064-NEXT:    ; implicit-def: $vgpr0
4614; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4615; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
4616; GFX1064-NEXT:  ; %bb.1:
4617; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4618; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4619; GFX1064-NEXT:    s_mov_b32 s3, s7
4620; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4621; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4622; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
4623; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4624; GFX1064-NEXT:    buffer_gl0_inv
4625; GFX1064-NEXT:  .LBB17_2:
4626; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4627; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4628; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4629; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4630; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
4631; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4632; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4633; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4634; GFX1064-NEXT:    s_endpgm
4635;
4636; GFX1032-LABEL: max_i32_varying:
4637; GFX1032:       ; %bb.0: ; %entry
4638; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4639; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4640; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
4641; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4642; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4643; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4644; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4645; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4646; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4647; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4648; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4649; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4650; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4651; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4652; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4653; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
4654; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4655; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4656; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4657; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4658; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4659; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4660; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4661; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4662; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4663; GFX1032-NEXT:    s_mov_b32 s2, -1
4664; GFX1032-NEXT:    ; implicit-def: $vgpr0
4665; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4666; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
4667; GFX1032-NEXT:  ; %bb.1:
4668; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4669; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4670; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4671; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4672; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
4673; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4674; GFX1032-NEXT:    buffer_gl0_inv
4675; GFX1032-NEXT:  .LBB17_2:
4676; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4677; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4678; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4679; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4680; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
4681; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4682; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4683; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4684; GFX1032-NEXT:    s_endpgm
4685;
4686; GFX1164-LABEL: max_i32_varying:
4687; GFX1164:       ; %bb.0: ; %entry
4688; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4689; GFX1164-NEXT:    s_not_b64 exec, exec
4690; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
4691; GFX1164-NEXT:    s_not_b64 exec, exec
4692; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4693; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4694; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4695; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
4696; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4697; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4698; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4699; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4700; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4701; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4702; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4703; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4704; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4705; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4706; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4707; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4708; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4709; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4710; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4711; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4712; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4713; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4714; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4715; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4716; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4717; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4718; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4719; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4720; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4721; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4722; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4723; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4724; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4725; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4726; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4727; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4728; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4729; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4730; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4731; GFX1164-NEXT:    s_mov_b32 s2, -1
4732; GFX1164-NEXT:    ; implicit-def: $vgpr0
4733; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4734; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
4735; GFX1164-NEXT:  ; %bb.1:
4736; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4737; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4738; GFX1164-NEXT:    s_mov_b32 s3, s7
4739; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4740; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4741; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
4742; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4743; GFX1164-NEXT:    buffer_gl0_inv
4744; GFX1164-NEXT:  .LBB17_2:
4745; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4746; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4747; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4748; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4749; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
4750; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4751; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4752; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4753; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4754; GFX1164-NEXT:    s_endpgm
4755;
4756; GFX1132-LABEL: max_i32_varying:
4757; GFX1132:       ; %bb.0: ; %entry
4758; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4759; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4760; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
4761; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4762; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4763; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4764; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4765; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4766; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4767; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4768; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4769; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4770; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4771; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4772; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4773; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4774; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4775; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4776; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4777; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
4778; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4779; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4780; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4781; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4782; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4783; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4784; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4785; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4786; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4787; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4788; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4789; GFX1132-NEXT:    s_mov_b32 s2, -1
4790; GFX1132-NEXT:    ; implicit-def: $vgpr0
4791; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4792; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
4793; GFX1132-NEXT:  ; %bb.1:
4794; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4795; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4796; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4797; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4798; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
4799; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4800; GFX1132-NEXT:    buffer_gl0_inv
4801; GFX1132-NEXT:  .LBB17_2:
4802; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4803; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4804; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4805; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4806; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
4807; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4808; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4809; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4810; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4811; GFX1132-NEXT:    s_endpgm
4812entry:
4813  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4814  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4815  store i32 %old, i32 addrspace(1)* %out
4816  ret void
4817}
4818
4819define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4820; Signed i64 atomicrmw max (acq_rel) of the constant 5, identical for every lane, on @local_var64 in addrspace(3); the returned old value is stored to %out.
4821; Expected code below: only the lane whose mbcnt result is 0 issues ds_max_rtn_i64; its result is read with v_readfirstlane and merged per lane via compare/select.
4822; GFX7LESS-LABEL: max_i64_constant:
4823; GFX7LESS:       ; %bb.0: ; %entry
4824; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4825; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4826; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4827; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4828; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4829; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4830; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4831; GFX7LESS-NEXT:  ; %bb.1:
4832; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4833; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4834; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4835; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4836; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4837; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4838; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4839; GFX7LESS-NEXT:  .LBB18_2:
4840; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4841; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4842; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4843; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4844; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4845; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4846; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4847; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4848; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4849; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4850; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4851; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4852; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4853; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4854; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4855; GFX7LESS-NEXT:    s_endpgm
4856;
4857; GFX8-LABEL: max_i64_constant:
4858; GFX8:       ; %bb.0: ; %entry
4859; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4860; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4861; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4862; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4863; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4864; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4865; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4866; GFX8-NEXT:  ; %bb.1:
4867; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4868; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4869; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4870; GFX8-NEXT:    s_mov_b32 m0, -1
4871; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4872; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4873; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4874; GFX8-NEXT:  .LBB18_2:
4875; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4876; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4877; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4878; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4879; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4880; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4881; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4882; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4883; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4884; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4885; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4886; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4887; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4888; GFX8-NEXT:    s_mov_b32 s2, -1
4889; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4890; GFX8-NEXT:    s_endpgm
4891;
4892; GFX9-LABEL: max_i64_constant:
4893; GFX9:       ; %bb.0: ; %entry
4894; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4895; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4896; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4897; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4898; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4899; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4900; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4901; GFX9-NEXT:  ; %bb.1:
4902; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4903; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4904; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4905; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4906; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4908; GFX9-NEXT:  .LBB18_2:
4909; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4910; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4911; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4912; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4913; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4914; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4915; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4916; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4917; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4918; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4919; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4920; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4921; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4922; GFX9-NEXT:    s_mov_b32 s2, -1
4923; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4924; GFX9-NEXT:    s_endpgm
4925;
4926; GFX1064-LABEL: max_i64_constant:
4927; GFX1064:       ; %bb.0: ; %entry
4928; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4929; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4930; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4931; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4932; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4933; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4934; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4935; GFX1064-NEXT:  ; %bb.1:
4936; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4937; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4938; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4939; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4940; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4941; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4942; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4943; GFX1064-NEXT:    buffer_gl0_inv
4944; GFX1064-NEXT:  .LBB18_2:
4945; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4946; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4947; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4948; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4949; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4950; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4951; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4952; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4953; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4954; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4955; GFX1064-NEXT:    s_mov_b32 s2, -1
4956; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4957; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4958; GFX1064-NEXT:    s_endpgm
4959;
4960; GFX1032-LABEL: max_i64_constant:
4961; GFX1032:       ; %bb.0: ; %entry
4962; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4963; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4964; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4965; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4966; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4967; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4968; GFX1032-NEXT:  ; %bb.1:
4969; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4970; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4971; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4972; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4973; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4974; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4975; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4976; GFX1032-NEXT:    buffer_gl0_inv
4977; GFX1032-NEXT:  .LBB18_2:
4978; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4979; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4980; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4981; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4982; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4983; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4984; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4985; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4986; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4987; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4988; GFX1032-NEXT:    s_mov_b32 s2, -1
4989; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4990; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4991; GFX1032-NEXT:    s_endpgm
4992;
4993; GFX1164-LABEL: max_i64_constant:
4994; GFX1164:       ; %bb.0: ; %entry
4995; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4996; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4997; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4998; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4999; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5000; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5001; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5002; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
5003; GFX1164-NEXT:  ; %bb.1:
5004; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5005; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5006; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5007; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5008; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5009; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
5010; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5011; GFX1164-NEXT:    buffer_gl0_inv
5012; GFX1164-NEXT:  .LBB18_2:
5013; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5014; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5015; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5016; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
5017; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5018; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5019; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
5020; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5021; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5022; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5023; GFX1164-NEXT:    s_mov_b32 s2, -1
5024; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5025; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5026; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5027; GFX1164-NEXT:    s_endpgm
5028;
5029; GFX1132-LABEL: max_i64_constant:
5030; GFX1132:       ; %bb.0: ; %entry
5031; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5032; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5033; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5034; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5035; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5036; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5037; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
5038; GFX1132-NEXT:  ; %bb.1:
5039; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5040; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5041; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
5042; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5043; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5044; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
5045; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5046; GFX1132-NEXT:    buffer_gl0_inv
5047; GFX1132-NEXT:  .LBB18_2:
5048; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5049; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5050; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5051; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
5052; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5053; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5054; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
5055; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5056; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5057; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5058; GFX1132-NEXT:    s_mov_b32 s2, -1
5059; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5060; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5061; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5062; GFX1132-NEXT:    s_endpgm
5063entry:
5064  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
5065  store i64 %old, i64 addrspace(1)* %out
5066  ret void
5067}
5068
5069define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
5070;
5071;
5072; GFX7LESS-LABEL: min_i32_varying:
5073; GFX7LESS:       ; %bb.0: ; %entry
5074; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5075; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5076; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5077; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5078; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
5079; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5080; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5081; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5082; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5083; GFX7LESS-NEXT:    s_endpgm
5084;
5085; GFX8-LABEL: min_i32_varying:
5086; GFX8:       ; %bb.0: ; %entry
5087; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5088; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5089; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5090; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5091; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
5092; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5093; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5094; GFX8-NEXT:    s_not_b64 exec, exec
5095; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
5096; GFX8-NEXT:    s_not_b64 exec, exec
5097; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5098; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5099; GFX8-NEXT:    s_nop 1
5100; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5101; GFX8-NEXT:    s_nop 1
5102; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5103; GFX8-NEXT:    s_nop 1
5104; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5105; GFX8-NEXT:    s_nop 1
5106; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5107; GFX8-NEXT:    s_nop 1
5108; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5109; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5110; GFX8-NEXT:    s_nop 0
5111; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5112; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5113; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5114; GFX8-NEXT:    ; implicit-def: $vgpr0
5115; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5116; GFX8-NEXT:    s_cbranch_execz .LBB19_2
5117; GFX8-NEXT:  ; %bb.1:
5118; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5119; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5120; GFX8-NEXT:    s_mov_b32 m0, -1
5121; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5122; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
5123; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5124; GFX8-NEXT:  .LBB19_2:
5125; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5126; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5127; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5128; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5129; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
5130; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5131; GFX8-NEXT:    s_mov_b32 s2, -1
5132; GFX8-NEXT:    s_nop 0
5133; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5134; GFX8-NEXT:    s_endpgm
5135;
5136; GFX9-LABEL: min_i32_varying:
5137; GFX9:       ; %bb.0: ; %entry
5138; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5139; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5140; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5141; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5142; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
5143; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5144; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5145; GFX9-NEXT:    s_not_b64 exec, exec
5146; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
5147; GFX9-NEXT:    s_not_b64 exec, exec
5148; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5149; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5150; GFX9-NEXT:    s_nop 1
5151; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5152; GFX9-NEXT:    s_nop 1
5153; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5154; GFX9-NEXT:    s_nop 1
5155; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5156; GFX9-NEXT:    s_nop 1
5157; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5158; GFX9-NEXT:    s_nop 1
5159; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5160; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5161; GFX9-NEXT:    s_nop 0
5162; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5163; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5164; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5165; GFX9-NEXT:    ; implicit-def: $vgpr0
5166; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5167; GFX9-NEXT:    s_cbranch_execz .LBB19_2
5168; GFX9-NEXT:  ; %bb.1:
5169; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5170; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5171; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5172; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
5173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5174; GFX9-NEXT:  .LBB19_2:
5175; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5176; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5177; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5178; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5179; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
5180; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5181; GFX9-NEXT:    s_mov_b32 s2, -1
5182; GFX9-NEXT:    s_nop 0
5183; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5184; GFX9-NEXT:    s_endpgm
5185;
5186; GFX1064-LABEL: min_i32_varying:
5187; GFX1064:       ; %bb.0: ; %entry
5188; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5189; GFX1064-NEXT:    s_not_b64 exec, exec
5190; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
5191; GFX1064-NEXT:    s_not_b64 exec, exec
5192; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5193; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5194; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
5195; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5196; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5197; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5198; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5199; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5200; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5201; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5202; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5203; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5204; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5205; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5206; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5207; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5208; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5209; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5210; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5211; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5212; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5213; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5214; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5215; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5216; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5217; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5218; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5219; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5220; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5221; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5222; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5223; GFX1064-NEXT:    s_mov_b32 s2, -1
5224; GFX1064-NEXT:    ; implicit-def: $vgpr0
5225; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5226; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
5227; GFX1064-NEXT:  ; %bb.1:
5228; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5229; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5230; GFX1064-NEXT:    s_mov_b32 s3, s7
5231; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5232; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5233; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
5234; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5235; GFX1064-NEXT:    buffer_gl0_inv
5236; GFX1064-NEXT:  .LBB19_2:
5237; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5238; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5239; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5240; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5241; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
5242; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5243; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5244; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5245; GFX1064-NEXT:    s_endpgm
5246;
5247; GFX1032-LABEL: min_i32_varying:
5248; GFX1032:       ; %bb.0: ; %entry
5249; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5250; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5251; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
5252; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5253; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5254; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5255; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5256; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5257; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5258; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5259; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5260; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5261; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5262; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5263; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5264; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
5265; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5266; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5267; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5268; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5269; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5270; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5271; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5272; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5273; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5274; GFX1032-NEXT:    s_mov_b32 s2, -1
5275; GFX1032-NEXT:    ; implicit-def: $vgpr0
5276; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5277; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
5278; GFX1032-NEXT:  ; %bb.1:
5279; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5280; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5281; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5282; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5283; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
5284; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5285; GFX1032-NEXT:    buffer_gl0_inv
5286; GFX1032-NEXT:  .LBB19_2:
5287; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5288; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5289; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5290; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5291; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
5292; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5293; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5294; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5295; GFX1032-NEXT:    s_endpgm
5296;
5297; GFX1164-LABEL: min_i32_varying:
5298; GFX1164:       ; %bb.0: ; %entry
5299; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5300; GFX1164-NEXT:    s_not_b64 exec, exec
5301; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
5302; GFX1164-NEXT:    s_not_b64 exec, exec
5303; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5304; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5305; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5306; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
5307; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5308; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5309; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5310; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5311; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5312; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5313; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5314; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5315; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5316; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5317; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5318; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5319; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5320; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5321; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5322; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5323; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5324; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5325; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5326; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5327; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5328; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5329; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5330; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5331; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5332; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5333; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5334; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5335; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5336; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5337; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5338; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5339; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5340; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5341; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5342; GFX1164-NEXT:    s_mov_b32 s2, -1
5343; GFX1164-NEXT:    ; implicit-def: $vgpr0
5344; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5345; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
5346; GFX1164-NEXT:  ; %bb.1:
5347; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5348; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5349; GFX1164-NEXT:    s_mov_b32 s3, s7
5350; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5351; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5352; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
5353; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5354; GFX1164-NEXT:    buffer_gl0_inv
5355; GFX1164-NEXT:  .LBB19_2:
5356; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5357; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5358; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5359; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5360; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
5361; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5362; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5363; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5364; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5365; GFX1164-NEXT:    s_endpgm
5366;
5367; GFX1132-LABEL: min_i32_varying:
5368; GFX1132:       ; %bb.0: ; %entry
5369; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5370; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5371; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
5372; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5373; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5374; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5375; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5376; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5377; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5378; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5379; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5380; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5381; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5382; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5383; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5384; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5385; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5386; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5387; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5388; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
5389; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5390; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5391; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5392; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5393; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5394; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5395; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5396; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5397; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5398; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5399; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5400; GFX1132-NEXT:    s_mov_b32 s2, -1
5401; GFX1132-NEXT:    ; implicit-def: $vgpr0
5402; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5403; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
5404; GFX1132-NEXT:  ; %bb.1:
5405; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5406; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5407; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5408; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5409; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
5410; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5411; GFX1132-NEXT:    buffer_gl0_inv
5412; GFX1132-NEXT:  .LBB19_2:
5413; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5414; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5415; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5416; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5417; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
5418; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5419; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5420; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5421; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5422; GFX1132-NEXT:    s_endpgm
5423entry:
5424  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5425  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5426  store i32 %old, i32 addrspace(1)* %out
5427  ret void
5428}
5429
; min_i64_constant: signed 64-bit `atomicrmw min` of the constant 5 on
; @local_var64 with acq_rel ordering; the value returned by the atomic is
; stored to %out.
;
; What the expected code below shows for each target: a per-lane index is
; computed with v_mbcnt, only the lanes where that index equals 0 execute a
; single ds_min_rtn_i64 with operand 5, and the returned value is read back
; with v_readfirstlane. Each lane then selects 0x7fffffffffffffff when its
; index is 0 and 5 otherwise, and takes the signed minimum of that with the
; read-back value (v_cmp_lt_i64 + v_cndmask) before the buffer store.
;
; The check lines are autogenerated (see the file header); regenerate them
; with the update script instead of editing them by hand.
5430define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
5431;
5432;
5433; GFX7LESS-LABEL: min_i64_constant:
5434; GFX7LESS:       ; %bb.0: ; %entry
5435; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5436; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5437; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5438; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5439; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5440; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5441; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
5442; GFX7LESS-NEXT:  ; %bb.1:
5443; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5444; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5445; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5446; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5447; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5448; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5449; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5450; GFX7LESS-NEXT:  .LBB20_2:
5451; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5452; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5453; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5454; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5455; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
5456; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5457; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5458; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5459; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
5460; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
5461; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5462; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5463; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5464; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5465; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5466; GFX7LESS-NEXT:    s_endpgm
5467;
5468; GFX8-LABEL: min_i64_constant:
5469; GFX8:       ; %bb.0: ; %entry
5470; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5471; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5472; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5473; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5474; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5475; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5476; GFX8-NEXT:    s_cbranch_execz .LBB20_2
5477; GFX8-NEXT:  ; %bb.1:
5478; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5479; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5480; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5481; GFX8-NEXT:    s_mov_b32 m0, -1
5482; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5483; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5484; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5485; GFX8-NEXT:  .LBB20_2:
5486; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5487; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5488; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5489; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
5490; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5491; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5492; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5493; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5494; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5495; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5496; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5497; GFX8-NEXT:    s_mov_b32 s2, -1
5498; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5499; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5500; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5501; GFX8-NEXT:    s_endpgm
5502;
5503; GFX9-LABEL: min_i64_constant:
5504; GFX9:       ; %bb.0: ; %entry
5505; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5506; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5507; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5508; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5509; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5510; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5511; GFX9-NEXT:    s_cbranch_execz .LBB20_2
5512; GFX9-NEXT:  ; %bb.1:
5513; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5514; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5515; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5517; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5518; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5519; GFX9-NEXT:  .LBB20_2:
5520; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5521; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5522; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5523; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
5524; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5525; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5526; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5527; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5528; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5529; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5530; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5531; GFX9-NEXT:    s_mov_b32 s2, -1
5532; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5533; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5534; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5535; GFX9-NEXT:    s_endpgm
5536;
5537; GFX1064-LABEL: min_i64_constant:
5538; GFX1064:       ; %bb.0: ; %entry
5539; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5540; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5541; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5542; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5543; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5544; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5545; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
5546; GFX1064-NEXT:  ; %bb.1:
5547; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5548; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5549; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5550; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5551; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5552; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5553; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5554; GFX1064-NEXT:    buffer_gl0_inv
5555; GFX1064-NEXT:  .LBB20_2:
5556; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5557; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5558; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5559; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5560; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5561; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5562; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5563; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5564; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5565; GFX1064-NEXT:    s_mov_b32 s2, -1
5566; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5567; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5568; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5569; GFX1064-NEXT:    s_endpgm
5570;
5571; GFX1032-LABEL: min_i64_constant:
5572; GFX1032:       ; %bb.0: ; %entry
5573; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5574; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5575; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5576; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5577; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5578; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
5579; GFX1032-NEXT:  ; %bb.1:
5580; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5581; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5582; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5583; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5584; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5585; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5586; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5587; GFX1032-NEXT:    buffer_gl0_inv
5588; GFX1032-NEXT:  .LBB20_2:
5589; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5590; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5591; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5592; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5593; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5594; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5595; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5596; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5597; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5598; GFX1032-NEXT:    s_mov_b32 s2, -1
5599; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5600; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5601; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5602; GFX1032-NEXT:    s_endpgm
5603;
5604; GFX1164-LABEL: min_i64_constant:
5605; GFX1164:       ; %bb.0: ; %entry
5606; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5607; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5608; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5609; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5610; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5611; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5612; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5613; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
5614; GFX1164-NEXT:  ; %bb.1:
5615; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5616; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5617; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5618; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5619; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5620; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5621; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5622; GFX1164-NEXT:    buffer_gl0_inv
5623; GFX1164-NEXT:  .LBB20_2:
5624; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5625; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5626; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5627; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5628; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5629; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5630; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5631; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5632; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5633; GFX1164-NEXT:    s_mov_b32 s2, -1
5634; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5635; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5636; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5637; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5638; GFX1164-NEXT:    s_endpgm
5639;
5640; GFX1132-LABEL: min_i64_constant:
5641; GFX1132:       ; %bb.0: ; %entry
5642; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5643; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5644; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5645; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5646; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5647; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5648; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
5649; GFX1132-NEXT:  ; %bb.1:
5650; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5651; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5652; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
5653; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5654; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5655; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5656; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5657; GFX1132-NEXT:    buffer_gl0_inv
5658; GFX1132-NEXT:  .LBB20_2:
5659; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5660; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5661; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5662; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5663; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5664; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5665; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5666; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5667; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5668; GFX1132-NEXT:    s_mov_b32 s2, -1
5669; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5670; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5671; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5672; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5673; GFX1132-NEXT:    s_endpgm
5674entry:
  ; The operand is the literal 5, so it does not depend on the work-item id.
5675  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
5676  store i64 %old, i64 addrspace(1)* %out
5677  ret void
5678}
5679
5680define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5681;
5682;
5683; GFX7LESS-LABEL: umax_i32_varying:
5684; GFX7LESS:       ; %bb.0: ; %entry
5685; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5686; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5687; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5688; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5689; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5690; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5691; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5692; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5693; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5694; GFX7LESS-NEXT:    s_endpgm
5695;
5696; GFX8-LABEL: umax_i32_varying:
5697; GFX8:       ; %bb.0: ; %entry
5698; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5699; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5700; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5701; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5702; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5703; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5704; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5705; GFX8-NEXT:    s_not_b64 exec, exec
5706; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5707; GFX8-NEXT:    s_not_b64 exec, exec
5708; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5709; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5710; GFX8-NEXT:    s_nop 1
5711; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5712; GFX8-NEXT:    s_nop 1
5713; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5714; GFX8-NEXT:    s_nop 1
5715; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5716; GFX8-NEXT:    s_nop 1
5717; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5718; GFX8-NEXT:    s_nop 1
5719; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5720; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5721; GFX8-NEXT:    s_nop 0
5722; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5723; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5724; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5725; GFX8-NEXT:    ; implicit-def: $vgpr0
5726; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5727; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5728; GFX8-NEXT:  ; %bb.1:
5729; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5730; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5731; GFX8-NEXT:    s_mov_b32 m0, -1
5732; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5733; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5734; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5735; GFX8-NEXT:  .LBB21_2:
5736; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5737; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5738; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5739; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5740; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5741; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5742; GFX8-NEXT:    s_mov_b32 s2, -1
5743; GFX8-NEXT:    s_nop 0
5744; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5745; GFX8-NEXT:    s_endpgm
5746;
5747; GFX9-LABEL: umax_i32_varying:
5748; GFX9:       ; %bb.0: ; %entry
5749; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5750; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5751; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5752; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5753; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5754; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5755; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5756; GFX9-NEXT:    s_not_b64 exec, exec
5757; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5758; GFX9-NEXT:    s_not_b64 exec, exec
5759; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5760; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5761; GFX9-NEXT:    s_nop 1
5762; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5763; GFX9-NEXT:    s_nop 1
5764; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5765; GFX9-NEXT:    s_nop 1
5766; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5767; GFX9-NEXT:    s_nop 1
5768; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5769; GFX9-NEXT:    s_nop 1
5770; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5771; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5772; GFX9-NEXT:    s_nop 0
5773; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5774; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5775; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5776; GFX9-NEXT:    ; implicit-def: $vgpr0
5777; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5778; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5779; GFX9-NEXT:  ; %bb.1:
5780; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5781; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5783; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5784; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5785; GFX9-NEXT:  .LBB21_2:
5786; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5787; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5788; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5789; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5790; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5791; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5792; GFX9-NEXT:    s_mov_b32 s2, -1
5793; GFX9-NEXT:    s_nop 0
5794; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5795; GFX9-NEXT:    s_endpgm
5796;
5797; GFX1064-LABEL: umax_i32_varying:
5798; GFX1064:       ; %bb.0: ; %entry
5799; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5800; GFX1064-NEXT:    s_not_b64 exec, exec
5801; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5802; GFX1064-NEXT:    s_not_b64 exec, exec
5803; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5804; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5805; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5806; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5807; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5808; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5809; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5810; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5811; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5812; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5813; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5814; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5815; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5816; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5817; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5818; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5819; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5820; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5821; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5822; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5823; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5824; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5825; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5826; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5827; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5828; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5829; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5830; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5831; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5832; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5833; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5834; GFX1064-NEXT:    s_mov_b32 s2, -1
5835; GFX1064-NEXT:    ; implicit-def: $vgpr0
5836; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5837; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5838; GFX1064-NEXT:  ; %bb.1:
5839; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5840; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5841; GFX1064-NEXT:    s_mov_b32 s3, s7
5842; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5843; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5844; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5845; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5846; GFX1064-NEXT:    buffer_gl0_inv
5847; GFX1064-NEXT:  .LBB21_2:
5848; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5849; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5850; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5851; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5852; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5853; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5854; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5855; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5856; GFX1064-NEXT:    s_endpgm
5857;
5858; GFX1032-LABEL: umax_i32_varying:
5859; GFX1032:       ; %bb.0: ; %entry
5860; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5861; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5862; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5863; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5864; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5865; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5866; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5867; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5868; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5869; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5870; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5871; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5872; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5873; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5874; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5875; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5876; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5877; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5878; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5879; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5880; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5881; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5882; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5883; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5884; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5885; GFX1032-NEXT:    s_mov_b32 s2, -1
5886; GFX1032-NEXT:    ; implicit-def: $vgpr0
5887; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5888; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5889; GFX1032-NEXT:  ; %bb.1:
5890; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5891; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5892; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5893; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5894; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5895; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5896; GFX1032-NEXT:    buffer_gl0_inv
5897; GFX1032-NEXT:  .LBB21_2:
5898; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5899; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5900; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5901; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5902; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5903; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5904; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5905; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5906; GFX1032-NEXT:    s_endpgm
5907;
5908; GFX1164-LABEL: umax_i32_varying:
5909; GFX1164:       ; %bb.0: ; %entry
5910; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5911; GFX1164-NEXT:    s_not_b64 exec, exec
5912; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5913; GFX1164-NEXT:    s_not_b64 exec, exec
5914; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5915; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5916; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5917; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5918; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5919; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5920; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5921; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5922; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5923; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5924; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5925; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5926; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5927; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5928; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5929; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5930; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5931; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5932; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5933; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5934; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5935; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5936; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5937; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5938; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5939; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5940; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5941; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5942; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5943; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5944; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5945; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5946; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5947; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5948; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5949; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5950; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5951; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5952; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5953; GFX1164-NEXT:    s_mov_b32 s2, -1
5954; GFX1164-NEXT:    ; implicit-def: $vgpr0
5955; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5956; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5957; GFX1164-NEXT:  ; %bb.1:
5958; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5959; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5960; GFX1164-NEXT:    s_mov_b32 s3, s7
5961; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5962; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5963; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5964; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5965; GFX1164-NEXT:    buffer_gl0_inv
5966; GFX1164-NEXT:  .LBB21_2:
5967; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5968; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5969; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5970; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5971; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5972; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5973; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5974; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5975; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5976; GFX1164-NEXT:    s_endpgm
5977;
5978; GFX1132-LABEL: umax_i32_varying:
5979; GFX1132:       ; %bb.0: ; %entry
5980; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5981; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5982; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5983; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5984; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5985; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5986; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5987; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5988; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5989; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5990; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5991; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5992; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5993; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5994; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5995; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5996; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5997; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5998; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5999; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
6000; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6001; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6002; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6003; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6004; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6005; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6006; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6007; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6008; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6009; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6010; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6011; GFX1132-NEXT:    s_mov_b32 s2, -1
6012; GFX1132-NEXT:    ; implicit-def: $vgpr0
6013; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6014; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
6015; GFX1132-NEXT:  ; %bb.1:
6016; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6017; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6018; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6019; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6020; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
6021; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6022; GFX1132-NEXT:    buffer_gl0_inv
6023; GFX1132-NEXT:  .LBB21_2:
6024; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6025; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6026; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6027; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6028; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
6029; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6030; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6031; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6032; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6033; GFX1132-NEXT:    s_endpgm
6034entry:
6035  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6036  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6037  store i32 %old, i32 addrspace(1)* %out
6038  ret void
6039}
6040
; umax_i64_constant: atomicrmw umax (acq_rel) of the uniform constant 5 on the
; addrspace(3) global @local_var64; the value returned by the atomic is stored
; to %out.
; For every check prefix below the expected code counts lanes with v_mbcnt,
; lets only the lane whose count is 0 issue a single ds_max_rtn_u64, reads the
; result back with v_readfirstlane, and then does a v_cmp_gt_u64 / v_cndmask
; select per lane before the store.
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
6041define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
6042;
6043;
6044; GFX7LESS-LABEL: umax_i64_constant:
6045; GFX7LESS:       ; %bb.0: ; %entry
6046; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6047; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6048; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6049; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6050; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6051; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6052; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
6053; GFX7LESS-NEXT:  ; %bb.1:
6054; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6055; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6056; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6057; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6058; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6059; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6060; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6061; GFX7LESS-NEXT:  .LBB22_2:
6062; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6063; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6064; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6065; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6066; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6067; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6068; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6069; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6070; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
6071; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6072; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
6073; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6074; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6075; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6076; GFX7LESS-NEXT:    s_endpgm
6077;
6078; GFX8-LABEL: umax_i64_constant:
6079; GFX8:       ; %bb.0: ; %entry
6080; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6081; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6082; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6083; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6084; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6085; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6086; GFX8-NEXT:    s_cbranch_execz .LBB22_2
6087; GFX8-NEXT:  ; %bb.1:
6088; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6089; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6090; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6091; GFX8-NEXT:    s_mov_b32 m0, -1
6092; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6093; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6094; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6095; GFX8-NEXT:  .LBB22_2:
6096; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6097; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6098; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6099; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
6100; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6101; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6102; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6103; GFX8-NEXT:    v_mov_b32_e32 v2, s2
6104; GFX8-NEXT:    v_mov_b32_e32 v1, s3
6105; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6106; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6107; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6108; GFX8-NEXT:    s_mov_b32 s2, -1
6109; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6110; GFX8-NEXT:    s_endpgm
6111;
6112; GFX9-LABEL: umax_i64_constant:
6113; GFX9:       ; %bb.0: ; %entry
6114; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6115; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6116; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6117; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6118; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6119; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6120; GFX9-NEXT:    s_cbranch_execz .LBB22_2
6121; GFX9-NEXT:  ; %bb.1:
6122; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6123; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6124; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6125; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6126; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6128; GFX9-NEXT:  .LBB22_2:
6129; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6130; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6131; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6132; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
6133; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6134; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6135; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6136; GFX9-NEXT:    v_mov_b32_e32 v2, s2
6137; GFX9-NEXT:    v_mov_b32_e32 v1, s3
6138; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6139; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6140; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6141; GFX9-NEXT:    s_mov_b32 s2, -1
6142; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6143; GFX9-NEXT:    s_endpgm
6144;
6145; GFX1064-LABEL: umax_i64_constant:
6146; GFX1064:       ; %bb.0: ; %entry
6147; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6148; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6149; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6150; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6151; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6152; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6153; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
6154; GFX1064-NEXT:  ; %bb.1:
6155; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6156; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6157; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6158; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6159; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6160; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6161; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6162; GFX1064-NEXT:    buffer_gl0_inv
6163; GFX1064-NEXT:  .LBB22_2:
6164; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6165; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6166; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6167; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6168; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6169; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6170; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6171; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6172; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6173; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6174; GFX1064-NEXT:    s_mov_b32 s2, -1
6175; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6176; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6177; GFX1064-NEXT:    s_endpgm
6178;
6179; GFX1032-LABEL: umax_i64_constant:
6180; GFX1032:       ; %bb.0: ; %entry
6181; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6182; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6183; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6184; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6185; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6186; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
6187; GFX1032-NEXT:  ; %bb.1:
6188; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6189; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6190; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6191; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6192; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6193; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6194; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6195; GFX1032-NEXT:    buffer_gl0_inv
6196; GFX1032-NEXT:  .LBB22_2:
6197; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6198; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6199; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6200; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6201; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6202; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6203; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6204; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6205; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6206; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6207; GFX1032-NEXT:    s_mov_b32 s2, -1
6208; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6209; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6210; GFX1032-NEXT:    s_endpgm
6211;
6212; GFX1164-LABEL: umax_i64_constant:
6213; GFX1164:       ; %bb.0: ; %entry
6214; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6215; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6216; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6217; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6218; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6219; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6220; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6221; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
6222; GFX1164-NEXT:  ; %bb.1:
6223; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6224; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6225; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6226; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6227; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6228; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6229; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6230; GFX1164-NEXT:    buffer_gl0_inv
6231; GFX1164-NEXT:  .LBB22_2:
6232; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6233; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6234; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6235; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6236; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6237; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6238; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6239; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6240; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6241; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6242; GFX1164-NEXT:    s_mov_b32 s2, -1
6243; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6244; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6245; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6246; GFX1164-NEXT:    s_endpgm
6247;
6248; GFX1132-LABEL: umax_i64_constant:
6249; GFX1132:       ; %bb.0: ; %entry
6250; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6251; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6252; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6253; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6254; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6255; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6256; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6257; GFX1132-NEXT:  ; %bb.1:
6258; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6259; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6260; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6261; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6262; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6263; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6264; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6265; GFX1132-NEXT:    buffer_gl0_inv
6266; GFX1132-NEXT:  .LBB22_2:
6267; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6268; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6269; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6270; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6271; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6272; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6273; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6274; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6275; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6276; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6277; GFX1132-NEXT:    s_mov_b32 s2, -1
6278; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6279; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6280; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6281; GFX1132-NEXT:    s_endpgm
6282entry:
6283  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
6284  store i64 %old, i64 addrspace(1)* %out
6285  ret void
6286}
6287
; umin_i32_varying: atomicrmw umin (acq_rel) of the per-lane value returned by
; llvm.amdgcn.workitem.id.x on the addrspace(3) global @local_var32; the value
; returned by the atomic is stored to %out.
; The GFX7LESS checks expect a plain ds_min_rtn_u32 on v0 with no cross-lane
; reduction. Every other prefix expects: lanes outside the original exec mask
; set to -1, a v_min_u32_dpp combine across lanes, one ds_min_rtn_u32 issued by
; the lane whose v_mbcnt count is 0, and a final v_min_u32 of the
; v_readfirstlane result with the DPP-shifted per-lane value.
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
6288define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6289;
6290;
6291; GFX7LESS-LABEL: umin_i32_varying:
6292; GFX7LESS:       ; %bb.0: ; %entry
6293; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6294; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6295; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6296; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6297; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6298; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6299; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6300; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6301; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6302; GFX7LESS-NEXT:    s_endpgm
6303;
6304; GFX8-LABEL: umin_i32_varying:
6305; GFX8:       ; %bb.0: ; %entry
6306; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6307; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6308; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6309; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6310; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6311; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6312; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6313; GFX8-NEXT:    s_not_b64 exec, exec
6314; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6315; GFX8-NEXT:    s_not_b64 exec, exec
6316; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6317; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6318; GFX8-NEXT:    s_nop 1
6319; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6320; GFX8-NEXT:    s_nop 1
6321; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6322; GFX8-NEXT:    s_nop 1
6323; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6324; GFX8-NEXT:    s_nop 1
6325; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6326; GFX8-NEXT:    s_nop 1
6327; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6328; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6329; GFX8-NEXT:    s_nop 0
6330; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6331; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6332; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6333; GFX8-NEXT:    ; implicit-def: $vgpr0
6334; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6335; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6336; GFX8-NEXT:  ; %bb.1:
6337; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6338; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6339; GFX8-NEXT:    s_mov_b32 m0, -1
6340; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6341; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6342; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6343; GFX8-NEXT:  .LBB23_2:
6344; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6345; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6346; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6347; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6348; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6349; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6350; GFX8-NEXT:    s_mov_b32 s2, -1
6351; GFX8-NEXT:    s_nop 0
6352; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6353; GFX8-NEXT:    s_endpgm
6354;
6355; GFX9-LABEL: umin_i32_varying:
6356; GFX9:       ; %bb.0: ; %entry
6357; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6358; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6359; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6360; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6361; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6362; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6363; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6364; GFX9-NEXT:    s_not_b64 exec, exec
6365; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6366; GFX9-NEXT:    s_not_b64 exec, exec
6367; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6368; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6369; GFX9-NEXT:    s_nop 1
6370; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6371; GFX9-NEXT:    s_nop 1
6372; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6373; GFX9-NEXT:    s_nop 1
6374; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6375; GFX9-NEXT:    s_nop 1
6376; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6377; GFX9-NEXT:    s_nop 1
6378; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6379; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6380; GFX9-NEXT:    s_nop 0
6381; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6382; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6383; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6384; GFX9-NEXT:    ; implicit-def: $vgpr0
6385; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6386; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6387; GFX9-NEXT:  ; %bb.1:
6388; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6389; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6390; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6391; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6393; GFX9-NEXT:  .LBB23_2:
6394; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6395; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6396; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6397; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6398; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6399; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6400; GFX9-NEXT:    s_mov_b32 s2, -1
6401; GFX9-NEXT:    s_nop 0
6402; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6403; GFX9-NEXT:    s_endpgm
6404;
6405; GFX1064-LABEL: umin_i32_varying:
6406; GFX1064:       ; %bb.0: ; %entry
6407; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6408; GFX1064-NEXT:    s_not_b64 exec, exec
6409; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6410; GFX1064-NEXT:    s_not_b64 exec, exec
6411; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6412; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6413; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6414; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6415; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6416; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6417; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6418; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6419; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6420; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6421; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6422; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6423; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6424; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6425; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6426; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6427; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6428; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6429; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6430; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6431; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6432; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6433; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6434; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6435; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6436; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6437; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6438; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6439; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6440; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6441; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6442; GFX1064-NEXT:    s_mov_b32 s2, -1
6443; GFX1064-NEXT:    ; implicit-def: $vgpr0
6444; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6445; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6446; GFX1064-NEXT:  ; %bb.1:
6447; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6448; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6449; GFX1064-NEXT:    s_mov_b32 s3, s7
6450; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6451; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6452; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6453; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6454; GFX1064-NEXT:    buffer_gl0_inv
6455; GFX1064-NEXT:  .LBB23_2:
6456; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6457; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6458; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6459; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6460; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6461; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6462; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6463; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6464; GFX1064-NEXT:    s_endpgm
6465;
6466; GFX1032-LABEL: umin_i32_varying:
6467; GFX1032:       ; %bb.0: ; %entry
6468; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6469; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6470; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6471; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6472; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6473; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6474; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6475; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6476; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6477; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6478; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6479; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6480; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6481; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6482; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6483; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6484; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6485; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6486; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6487; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6488; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6489; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6490; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6491; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6492; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6493; GFX1032-NEXT:    s_mov_b32 s2, -1
6494; GFX1032-NEXT:    ; implicit-def: $vgpr0
6495; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6496; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6497; GFX1032-NEXT:  ; %bb.1:
6498; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6499; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6500; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6501; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6502; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6503; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6504; GFX1032-NEXT:    buffer_gl0_inv
6505; GFX1032-NEXT:  .LBB23_2:
6506; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6507; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6508; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6509; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6510; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6511; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6512; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6513; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6514; GFX1032-NEXT:    s_endpgm
6515;
6516; GFX1164-LABEL: umin_i32_varying:
6517; GFX1164:       ; %bb.0: ; %entry
6518; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6519; GFX1164-NEXT:    s_not_b64 exec, exec
6520; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6521; GFX1164-NEXT:    s_not_b64 exec, exec
6522; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6523; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6524; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6525; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6526; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6527; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6528; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6529; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6530; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6531; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6532; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6533; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6534; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6535; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6536; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6537; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6538; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6539; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6540; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6541; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6542; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6543; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6544; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6545; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6546; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6547; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6548; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6549; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6550; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6551; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6552; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6553; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6554; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6555; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
6556; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6557; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6558; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6559; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6560; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6561; GFX1164-NEXT:    s_mov_b32 s2, -1
6562; GFX1164-NEXT:    ; implicit-def: $vgpr0
6563; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6564; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6565; GFX1164-NEXT:  ; %bb.1:
6566; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6567; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6568; GFX1164-NEXT:    s_mov_b32 s3, s7
6569; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6570; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6571; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6572; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6573; GFX1164-NEXT:    buffer_gl0_inv
6574; GFX1164-NEXT:  .LBB23_2:
6575; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6576; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6577; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6578; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6579; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6580; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6581; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6582; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6583; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6584; GFX1164-NEXT:    s_endpgm
6585;
6586; GFX1132-LABEL: umin_i32_varying:
6587; GFX1132:       ; %bb.0: ; %entry
6588; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6589; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6590; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6591; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6592; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6593; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6594; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6595; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6596; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6597; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6598; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6599; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6600; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6601; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6602; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6603; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6604; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6605; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6606; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6607; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6608; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6609; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6610; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6611; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6612; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6613; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6614; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6615; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6616; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6617; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6618; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6619; GFX1132-NEXT:    s_mov_b32 s2, -1
6620; GFX1132-NEXT:    ; implicit-def: $vgpr0
6621; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6622; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6623; GFX1132-NEXT:  ; %bb.1:
6624; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6625; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6626; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6627; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6628; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6629; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6630; GFX1132-NEXT:    buffer_gl0_inv
6631; GFX1132-NEXT:  .LBB23_2:
6632; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6633; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6634; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6635; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6636; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6637; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6638; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6639; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6640; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6641; GFX1132-NEXT:    s_endpgm
6642entry:
6643  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6644  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6645  store i32 %old, i32 addrspace(1)* %out
6646  ret void
6647}
6648
; umin_i64_constant: atomicrmw umin of the constant 5 on the 64-bit local
; (addrspace(3)) global @local_var64 with acq_rel ordering; the value returned
; by the atomic is stored to %out.
;
; What the checked output shows for every target prefix below:
;   * v_mbcnt_* numbers the active lanes and only the lane with count 0 enters
;     %bb.1, so a single ds_min_rtn_u64 with operand 5 is issued.
;   * After exec is restored, v_readfirstlane_b32 copies the returned 64-bit
;     value into SGPRs.
;   * Each lane then selects -1 (the lane that issued the atomic) or 5 (all
;     other lanes) via v_cndmask on vcc, and takes the unsigned 64-bit minimum
;     of that and the returned value (v_cmp_lt_u64 + v_cndmask) before the
;     buffer store.
;
; The CHECK lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
6649define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6650;
6651;
6652; GFX7LESS-LABEL: umin_i64_constant:
6653; GFX7LESS:       ; %bb.0: ; %entry
6654; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6655; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6656; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6657; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6658; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6659; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6660; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6661; GFX7LESS-NEXT:  ; %bb.1:
6662; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6663; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6664; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6665; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6666; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6667; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6668; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6669; GFX7LESS-NEXT:  .LBB24_2:
6670; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6671; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6672; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6673; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6674; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6675; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6676; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6677; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6678; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6679; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6680; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6681; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6682; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6683; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6684; GFX7LESS-NEXT:    s_endpgm
6685;
6686; GFX8-LABEL: umin_i64_constant:
6687; GFX8:       ; %bb.0: ; %entry
6688; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6689; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6690; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6691; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6692; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6693; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6694; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6695; GFX8-NEXT:  ; %bb.1:
6696; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6697; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6698; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6699; GFX8-NEXT:    s_mov_b32 m0, -1
6700; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6701; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6702; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6703; GFX8-NEXT:  .LBB24_2:
6704; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6705; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6706; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6707; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6708; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6709; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6710; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6711; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6712; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6713; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6714; GFX8-NEXT:    s_mov_b32 s2, -1
6715; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6716; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6717; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6718; GFX8-NEXT:    s_endpgm
6719;
6720; GFX9-LABEL: umin_i64_constant:
6721; GFX9:       ; %bb.0: ; %entry
6722; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6723; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6724; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6725; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6726; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6727; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6728; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6729; GFX9-NEXT:  ; %bb.1:
6730; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6731; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6732; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6733; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6734; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6735; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6736; GFX9-NEXT:  .LBB24_2:
6737; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6738; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6739; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6740; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6741; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6742; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6743; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6744; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6745; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6746; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6747; GFX9-NEXT:    s_mov_b32 s2, -1
6748; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6749; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6750; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6751; GFX9-NEXT:    s_endpgm
6752;
6753; GFX1064-LABEL: umin_i64_constant:
6754; GFX1064:       ; %bb.0: ; %entry
6755; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6756; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6757; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6758; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6759; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6760; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6761; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6762; GFX1064-NEXT:  ; %bb.1:
6763; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6764; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6765; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6766; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6767; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6768; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6769; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6770; GFX1064-NEXT:    buffer_gl0_inv
6771; GFX1064-NEXT:  .LBB24_2:
6772; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6773; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6774; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6775; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6776; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6777; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6778; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6779; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6780; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6781; GFX1064-NEXT:    s_mov_b32 s2, -1
6782; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6783; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6784; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6785; GFX1064-NEXT:    s_endpgm
6786;
6787; GFX1032-LABEL: umin_i64_constant:
6788; GFX1032:       ; %bb.0: ; %entry
6789; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6790; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6791; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6792; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6793; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6794; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6795; GFX1032-NEXT:  ; %bb.1:
6796; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6797; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6798; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6799; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6800; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6801; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6802; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6803; GFX1032-NEXT:    buffer_gl0_inv
6804; GFX1032-NEXT:  .LBB24_2:
6805; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6806; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6807; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6808; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6809; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6810; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6811; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6812; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6813; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6814; GFX1032-NEXT:    s_mov_b32 s2, -1
6815; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6816; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6817; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6818; GFX1032-NEXT:    s_endpgm
6819;
6820; GFX1164-LABEL: umin_i64_constant:
6821; GFX1164:       ; %bb.0: ; %entry
6822; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6823; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6824; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6825; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6826; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6827; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6828; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6829; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6830; GFX1164-NEXT:  ; %bb.1:
6831; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6832; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6833; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6834; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6835; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6836; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6837; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6838; GFX1164-NEXT:    buffer_gl0_inv
6839; GFX1164-NEXT:  .LBB24_2:
6840; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6841; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6842; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6843; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6844; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6845; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6846; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6847; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6848; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6849; GFX1164-NEXT:    s_mov_b32 s2, -1
6850; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6851; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6852; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6853; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6854; GFX1164-NEXT:    s_endpgm
6855;
6856; GFX1132-LABEL: umin_i64_constant:
6857; GFX1132:       ; %bb.0: ; %entry
6858; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6859; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6860; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6861; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6862; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6863; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6864; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6865; GFX1132-NEXT:  ; %bb.1:
6866; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6867; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6868; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6869; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6870; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6871; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6872; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6873; GFX1132-NEXT:    buffer_gl0_inv
6874; GFX1132-NEXT:  .LBB24_2:
6875; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6876; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6877; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6878; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6879; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6880; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6881; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6882; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6883; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6884; GFX1132-NEXT:    s_mov_b32 s2, -1
6885; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6886; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6887; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6888; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6889; GFX1132-NEXT:    s_endpgm
6890entry:
  ; The operand is the immediate 5, i.e. the same value for every lane; the
  ; result of the atomicrmw (%old) is the value that was in memory beforehand.
6891  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing %old keeps the returned value live, which is why the checks above
  ; use the returning ds_min_rtn_u64 form.
6892  store i64 %old, i64 addrspace(1)* %out
6893  ret void
6894}
6895