1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
10declare i32 @llvm.amdgcn.workitem.id.x()
11
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; add_i32_constant: atomicrmw add of the constant 5 to the addrspace(3) global
; @local_var32 with acq_rel ordering; the value returned by the atomic is
; stored to %out.
;
; The autogenerated assertions below expect, for every check prefix, a single
; ds_add_rtn_u32 guarded by a v_mbcnt / compare-with-0 sequence (s_and_saveexec,
; or v_cmpx on the gfx1100 runs). The value added is s_bcnt1 of the saved exec
; mask multiplied by 5. After the guard, v_readfirstlane of the atomic result is
; combined with the lane's mbcnt value times 5 by a v_mad_u32_u24.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
177; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
178; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
179; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
180; GFX1164-NEXT:  ; %bb.1:
181; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
182; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
183; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
184; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
185; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
186; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
187; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
188; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
189; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX1164-NEXT:    buffer_gl0_inv
191; GFX1164-NEXT:  .LBB0_2:
192; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
193; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
194; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
195; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
196; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
197; GFX1164-NEXT:    s_mov_b32 s2, -1
198; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
200; GFX1164-NEXT:    s_endpgm
201;
202; GFX1132-LABEL: add_i32_constant:
203; GFX1132:       ; %bb.0: ; %entry
204; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
205; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
206; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
207; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
208; GFX1132-NEXT:    ; implicit-def: $vgpr1
209; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
210; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
211; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
212; GFX1132-NEXT:  ; %bb.1:
213; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
214; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
215; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
216; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
217; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
218; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
219; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
220; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
221; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX1132-NEXT:    buffer_gl0_inv
223; GFX1132-NEXT:  .LBB0_2:
224; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
225; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
226; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
227; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
228; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
229; GFX1132-NEXT:    s_mov_b32 s2, -1
230; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
232; GFX1132-NEXT:    s_endpgm
233entry:
  ; The operand is the literal 5, which is why every expected sequence above
  ; multiplies by 5 both before and after the DS atomic.
234  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Using the returned value keeps the "rtn" form of the atomic in the output.
235  store i32 %old, i32 addrspace(1)* %out
236  ret void
237}
238
; add_i32_uniform: atomicrmw add of the kernel argument %additive to the
; addrspace(3) global @local_var32 with acq_rel ordering; the value returned by
; the atomic is stored to %out.
;
; The autogenerated assertions below expect, for every check prefix, a single
; ds_add_rtn_u32 guarded by a v_mbcnt / compare-with-0 sequence (s_and_saveexec,
; or v_cmpx on the gfx1100 runs). The value added is an s_mul_i32 of a loaded
; scalar (presumably %additive; the second s_load in each sequence) by s_bcnt1
; of the saved exec mask. After the guard, v_readfirstlane of the atomic result
; is combined with that scalar times the lane's mbcnt value: v_mul_lo_u32 plus
; an add for the GFX7LESS, GFX8 and GFX9 prefixes, and a single v_mad_u64_u32
; for the gfx1010 and gfx1100 runs.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
239define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
240;
241;
242; GFX7LESS-LABEL: add_i32_uniform:
243; GFX7LESS:       ; %bb.0: ; %entry
244; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
245; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
246; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
247; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
248; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
249; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
250; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
251; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
252; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
253; GFX7LESS-NEXT:  ; %bb.1:
254; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
255; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
257; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
258; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
259; GFX7LESS-NEXT:    s_mov_b32 m0, -1
260; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
262; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX7LESS-NEXT:  .LBB1_2:
264; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
265; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
267; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
268; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
269; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
270; GFX7LESS-NEXT:    s_mov_b32 s6, -1
271; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
272; GFX7LESS-NEXT:    s_endpgm
273;
274; GFX8-LABEL: add_i32_uniform:
275; GFX8:       ; %bb.0: ; %entry
276; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
277; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
278; GFX8-NEXT:    s_mov_b64 s[2:3], exec
279; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
280; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
281; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
282; GFX8-NEXT:    ; implicit-def: $vgpr1
283; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
284; GFX8-NEXT:    s_cbranch_execz .LBB1_2
285; GFX8-NEXT:  ; %bb.1:
286; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
287; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX8-NEXT:    s_mul_i32 s2, s6, s2
289; GFX8-NEXT:    v_mov_b32_e32 v1, 0
290; GFX8-NEXT:    v_mov_b32_e32 v2, s2
291; GFX8-NEXT:    s_mov_b32 m0, -1
292; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
294; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX8-NEXT:  .LBB1_2:
296; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
297; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
299; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
300; GFX8-NEXT:    s_mov_b32 s7, 0xf000
301; GFX8-NEXT:    s_mov_b32 s6, -1
302; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
303; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
304; GFX8-NEXT:    s_endpgm
305;
306; GFX9-LABEL: add_i32_uniform:
307; GFX9:       ; %bb.0: ; %entry
308; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
309; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
310; GFX9-NEXT:    s_mov_b64 s[2:3], exec
311; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
312; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
313; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
314; GFX9-NEXT:    ; implicit-def: $vgpr1
315; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
316; GFX9-NEXT:    s_cbranch_execz .LBB1_2
317; GFX9-NEXT:  ; %bb.1:
318; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-NEXT:    s_mul_i32 s2, s6, s2
321; GFX9-NEXT:    v_mov_b32_e32 v1, 0
322; GFX9-NEXT:    v_mov_b32_e32 v2, s2
323; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX9-NEXT:  .LBB1_2:
327; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
330; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
331; GFX9-NEXT:    s_mov_b32 s7, 0xf000
332; GFX9-NEXT:    s_mov_b32 s6, -1
333; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
334; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
335; GFX9-NEXT:    s_endpgm
336;
337; GFX1064-LABEL: add_i32_uniform:
338; GFX1064:       ; %bb.0: ; %entry
339; GFX1064-NEXT:    s_clause 0x1
340; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
341; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
342; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
343; GFX1064-NEXT:    ; implicit-def: $vgpr1
344; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
345; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
346; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
347; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
348; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
349; GFX1064-NEXT:  ; %bb.1:
350; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
351; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
352; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
354; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
355; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
356; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
357; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
358; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX1064-NEXT:    buffer_gl0_inv
360; GFX1064-NEXT:  .LBB1_2:
361; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
362; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
363; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
364; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
365; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
367; GFX1064-NEXT:    s_mov_b32 s6, -1
368; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
369; GFX1064-NEXT:    s_endpgm
370;
371; GFX1032-LABEL: add_i32_uniform:
372; GFX1032:       ; %bb.0: ; %entry
373; GFX1032-NEXT:    s_clause 0x1
374; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
375; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
376; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
377; GFX1032-NEXT:    ; implicit-def: $vgpr1
378; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
379; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
380; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
381; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
382; GFX1032-NEXT:  ; %bb.1:
383; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
384; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
385; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
387; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
388; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
389; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
390; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
391; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX1032-NEXT:    buffer_gl0_inv
393; GFX1032-NEXT:  .LBB1_2:
394; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
395; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
396; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
397; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
398; GFX1032-NEXT:    s_mov_b32 s6, -1
399; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
401; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
402; GFX1032-NEXT:    s_endpgm
403;
404; GFX1164-LABEL: add_i32_uniform:
405; GFX1164:       ; %bb.0: ; %entry
406; GFX1164-NEXT:    s_clause 0x1
407; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
408; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
409; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
410; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
411; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
412; GFX1164-NEXT:    ; implicit-def: $vgpr1
413; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
414; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
415; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
416; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
417; GFX1164-NEXT:  ; %bb.1:
418; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
419; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
420; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
422; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
423; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
424; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
425; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
426; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
427; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX1164-NEXT:    buffer_gl0_inv
429; GFX1164-NEXT:  .LBB1_2:
430; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
431; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
432; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
433; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
435; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
436; GFX1164-NEXT:    s_mov_b32 s6, -1
437; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
438; GFX1164-NEXT:    s_endpgm
439;
440; GFX1132-LABEL: add_i32_uniform:
441; GFX1132:       ; %bb.0: ; %entry
442; GFX1132-NEXT:    s_clause 0x1
443; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
444; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
445; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
446; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
447; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
448; GFX1132-NEXT:    ; implicit-def: $vgpr1
449; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
450; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
451; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
452; GFX1132-NEXT:  ; %bb.1:
453; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
454; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
455; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
457; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
458; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
459; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
460; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
461; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
462; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX1132-NEXT:    buffer_gl0_inv
464; GFX1132-NEXT:  .LBB1_2:
465; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
466; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
467; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
468; GFX1132-NEXT:    s_mov_b32 s6, -1
469; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
471; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
472; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
473; GFX1132-NEXT:    s_endpgm
474entry:
  ; Unlike add_i32_constant, the operand is the i32 kernel argument %additive,
  ; so the expected code multiplies by a loaded scalar instead of a literal.
475  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  ; Using the returned value keeps the "rtn" form of the atomic in the output.
476  store i32 %old, i32 addrspace(1)* %out
477  ret void
478}
479
480define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
481;
482;
483; GFX7LESS-LABEL: add_i32_varying:
484; GFX7LESS:       ; %bb.0: ; %entry
485; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
486; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
487; GFX7LESS-NEXT:    s_mov_b32 m0, -1
488; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
490; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
492; GFX7LESS-NEXT:    s_mov_b32 s2, -1
493; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
494; GFX7LESS-NEXT:    s_endpgm
495;
496; GFX8-LABEL: add_i32_varying:
497; GFX8:       ; %bb.0: ; %entry
498; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
499; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
500; GFX8-NEXT:    v_mov_b32_e32 v1, 0
501; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
502; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
503; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
504; GFX8-NEXT:    v_mov_b32_e32 v2, v0
505; GFX8-NEXT:    s_not_b64 exec, exec
506; GFX8-NEXT:    v_mov_b32_e32 v2, 0
507; GFX8-NEXT:    s_not_b64 exec, exec
508; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
509; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
510; GFX8-NEXT:    s_nop 1
511; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
512; GFX8-NEXT:    s_nop 1
513; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
514; GFX8-NEXT:    s_nop 1
515; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
516; GFX8-NEXT:    s_nop 1
517; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
518; GFX8-NEXT:    s_nop 1
519; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
520; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
521; GFX8-NEXT:    s_nop 0
522; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
523; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
524; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
525; GFX8-NEXT:    ; implicit-def: $vgpr0
526; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
527; GFX8-NEXT:    s_cbranch_execz .LBB2_2
528; GFX8-NEXT:  ; %bb.1:
529; GFX8-NEXT:    v_mov_b32_e32 v0, 0
530; GFX8-NEXT:    v_mov_b32_e32 v3, s4
531; GFX8-NEXT:    s_mov_b32 m0, -1
532; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
534; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX8-NEXT:  .LBB2_2:
536; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
537; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
539; GFX8-NEXT:    v_mov_b32_e32 v0, v1
540; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
541; GFX8-NEXT:    s_mov_b32 s3, 0xf000
542; GFX8-NEXT:    s_mov_b32 s2, -1
543; GFX8-NEXT:    s_nop 0
544; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
545; GFX8-NEXT:    s_endpgm
546;
547; GFX9-LABEL: add_i32_varying:
548; GFX9:       ; %bb.0: ; %entry
549; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
550; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
551; GFX9-NEXT:    v_mov_b32_e32 v1, 0
552; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
553; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
554; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
555; GFX9-NEXT:    v_mov_b32_e32 v2, v0
556; GFX9-NEXT:    s_not_b64 exec, exec
557; GFX9-NEXT:    v_mov_b32_e32 v2, 0
558; GFX9-NEXT:    s_not_b64 exec, exec
559; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
560; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
561; GFX9-NEXT:    s_nop 1
562; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
563; GFX9-NEXT:    s_nop 1
564; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
565; GFX9-NEXT:    s_nop 1
566; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
567; GFX9-NEXT:    s_nop 1
568; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
569; GFX9-NEXT:    s_nop 1
570; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
571; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
572; GFX9-NEXT:    s_nop 0
573; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
574; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
575; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
576; GFX9-NEXT:    ; implicit-def: $vgpr0
577; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
578; GFX9-NEXT:    s_cbranch_execz .LBB2_2
579; GFX9-NEXT:  ; %bb.1:
580; GFX9-NEXT:    v_mov_b32_e32 v0, 0
581; GFX9-NEXT:    v_mov_b32_e32 v3, s4
582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
584; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX9-NEXT:  .LBB2_2:
586; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
587; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
589; GFX9-NEXT:    v_mov_b32_e32 v0, v1
590; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
591; GFX9-NEXT:    s_mov_b32 s3, 0xf000
592; GFX9-NEXT:    s_mov_b32 s2, -1
593; GFX9-NEXT:    s_nop 0
594; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
595; GFX9-NEXT:    s_endpgm
596;
597; GFX1064-LABEL: add_i32_varying:
598; GFX1064:       ; %bb.0: ; %entry
599; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
600; GFX1064-NEXT:    s_not_b64 exec, exec
601; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
602; GFX1064-NEXT:    s_not_b64 exec, exec
603; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
604; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
605; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
606; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
607; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
608; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
609; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
610; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
611; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
612; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
613; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
614; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
615; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
616; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
617; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
618; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
619; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
620; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
621; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
622; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
623; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
624; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
625; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
626; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
627; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
628; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
629; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
630; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
631; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
632; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
633; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
634; GFX1064-NEXT:    s_mov_b32 s2, -1
635; GFX1064-NEXT:    ; implicit-def: $vgpr0
636; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
637; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
638; GFX1064-NEXT:  ; %bb.1:
639; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
640; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
641; GFX1064-NEXT:    s_mov_b32 s3, s7
642; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
643; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
644; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
645; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX1064-NEXT:    buffer_gl0_inv
647; GFX1064-NEXT:  .LBB2_2:
648; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
649; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
650; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
651; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
652; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
653; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
654; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
656; GFX1064-NEXT:    s_endpgm
657;
658; GFX1032-LABEL: add_i32_varying:
659; GFX1032:       ; %bb.0: ; %entry
660; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
661; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
662; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
663; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
664; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
665; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
666; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
670; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
671; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
672; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
673; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
674; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
675; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
676; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
677; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
678; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
679; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
680; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
681; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
682; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
683; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
684; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
685; GFX1032-NEXT:    s_mov_b32 s2, -1
686; GFX1032-NEXT:    ; implicit-def: $vgpr0
687; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
688; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
689; GFX1032-NEXT:  ; %bb.1:
690; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
691; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
692; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
693; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
694; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
695; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX1032-NEXT:    buffer_gl0_inv
697; GFX1032-NEXT:  .LBB2_2:
698; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
699; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
700; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
701; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
702; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
703; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
704; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
706; GFX1032-NEXT:    s_endpgm
707;
708; GFX1164-LABEL: add_i32_varying:
709; GFX1164:       ; %bb.0: ; %entry
710; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
711; GFX1164-NEXT:    s_not_b64 exec, exec
712; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
713; GFX1164-NEXT:    s_not_b64 exec, exec
714; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
715; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
716; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
717; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
718; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
719; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
720; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
721; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
722; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
723; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
724; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
725; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
726; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
727; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
728; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
729; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
730; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
731; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
732; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
733; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
734; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
735; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
736; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
737; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
738; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
739; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
740; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
741; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
742; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
743; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
744; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
745; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
746; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
747; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
748; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
749; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
750; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
751; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
752; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
753; GFX1164-NEXT:    s_mov_b32 s2, -1
754; GFX1164-NEXT:    ; implicit-def: $vgpr0
755; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
756; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
757; GFX1164-NEXT:  ; %bb.1:
758; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
759; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
760; GFX1164-NEXT:    s_mov_b32 s3, s7
761; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
762; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
763; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
764; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX1164-NEXT:    buffer_gl0_inv
766; GFX1164-NEXT:  .LBB2_2:
767; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
768; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
769; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
770; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
771; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
772; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
773; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
775; GFX1164-NEXT:    s_endpgm
776;
777; GFX1132-LABEL: add_i32_varying:
778; GFX1132:       ; %bb.0: ; %entry
779; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
780; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
781; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
782; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
783; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
784; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
785; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
786; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
787; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
788; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
789; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
790; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
791; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
792; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
793; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
794; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
795; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
796; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
797; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
798; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
799; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
800; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
801; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
802; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
803; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
804; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
805; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
806; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
807; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
808; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
809; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
810; GFX1132-NEXT:    s_mov_b32 s2, -1
811; GFX1132-NEXT:    ; implicit-def: $vgpr0
812; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
813; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
814; GFX1132-NEXT:  ; %bb.1:
815; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
816; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
817; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
818; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
819; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
820; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX1132-NEXT:    buffer_gl0_inv
822; GFX1132-NEXT:  .LBB2_2:
823; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
824; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
825; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
826; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
827; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
828; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
829; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
830; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
831; GFX1132-NEXT:    s_endpgm
832entry:
833  %lane = call i32 @llvm.amdgcn.workitem.id.x()
834  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
835  store i32 %old, i32 addrspace(1)* %out
836  ret void
837}
838
; atomicrmw add of a per-lane value (workitem.id.x) to the LDS global
; @local_var32 where the returned old value has no users. Every check block
; below expects the non-returning ds_add_u32 form. The GFX7LESS checks feed v0
; straight into the atomic; all other check blocks expect a DPP-based
; cross-lane add followed by a single ds_add_u32 that is guarded by comparing
; a v_mbcnt result against 0.
839define amdgpu_kernel void @add_i32_varying_nouse() {
840; GFX7LESS-LABEL: add_i32_varying_nouse:
841; GFX7LESS:       ; %bb.0: ; %entry
842; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
843; GFX7LESS-NEXT:    s_mov_b32 m0, -1
844; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
845; GFX7LESS-NEXT:    ds_add_u32 v1, v0
846; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX7LESS-NEXT:    s_endpgm
848;
849; GFX8-LABEL: add_i32_varying_nouse:
850; GFX8:       ; %bb.0: ; %entry
851; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
852; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
853; GFX8-NEXT:    v_mov_b32_e32 v1, v0
854; GFX8-NEXT:    s_not_b64 exec, exec
855; GFX8-NEXT:    v_mov_b32_e32 v1, 0
856; GFX8-NEXT:    s_not_b64 exec, exec
857; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
858; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
859; GFX8-NEXT:    s_nop 1
860; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
861; GFX8-NEXT:    s_nop 1
862; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
863; GFX8-NEXT:    s_nop 1
864; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
865; GFX8-NEXT:    s_nop 1
866; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
869; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
870; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
871; GFX8-NEXT:    s_mov_b32 s0, s2
872; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
873; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
874; GFX8-NEXT:    s_cbranch_execz .LBB3_2
875; GFX8-NEXT:  ; %bb.1:
876; GFX8-NEXT:    v_mov_b32_e32 v0, 0
877; GFX8-NEXT:    v_mov_b32_e32 v2, s0
878; GFX8-NEXT:    s_mov_b32 m0, -1
879; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX8-NEXT:    ds_add_u32 v0, v2
881; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX8-NEXT:  .LBB3_2:
883; GFX8-NEXT:    s_endpgm
884;
885; GFX9-LABEL: add_i32_varying_nouse:
886; GFX9:       ; %bb.0: ; %entry
887; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
888; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
889; GFX9-NEXT:    v_mov_b32_e32 v1, v0
890; GFX9-NEXT:    s_not_b64 exec, exec
891; GFX9-NEXT:    v_mov_b32_e32 v1, 0
892; GFX9-NEXT:    s_not_b64 exec, exec
893; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
894; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
895; GFX9-NEXT:    s_nop 1
896; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
897; GFX9-NEXT:    s_nop 1
898; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
899; GFX9-NEXT:    s_nop 1
900; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX9-NEXT:    s_nop 1
902; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
903; GFX9-NEXT:    s_nop 1
904; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
905; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
906; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
907; GFX9-NEXT:    s_mov_b32 s0, s2
908; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
909; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
910; GFX9-NEXT:    s_cbranch_execz .LBB3_2
911; GFX9-NEXT:  ; %bb.1:
912; GFX9-NEXT:    v_mov_b32_e32 v0, 0
913; GFX9-NEXT:    v_mov_b32_e32 v2, s0
914; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NEXT:    ds_add_u32 v0, v2
916; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX9-NEXT:  .LBB3_2:
918; GFX9-NEXT:    s_endpgm
919;
920; GFX1064-LABEL: add_i32_varying_nouse:
921; GFX1064:       ; %bb.0: ; %entry
922; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
923; GFX1064-NEXT:    s_not_b64 exec, exec
924; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
925; GFX1064-NEXT:    s_not_b64 exec, exec
926; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
927; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
928; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
929; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
930; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
931; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
932; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
933; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
934; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
935; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
936; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
937; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
938; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
939; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
940; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
941; GFX1064-NEXT:    s_add_i32 s0, s2, s3
942; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
943; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
944; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
945; GFX1064-NEXT:  ; %bb.1:
946; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
947; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
948; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
949; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
950; GFX1064-NEXT:    ds_add_u32 v0, v3
951; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
952; GFX1064-NEXT:    buffer_gl0_inv
953; GFX1064-NEXT:  .LBB3_2:
954; GFX1064-NEXT:    s_endpgm
955;
956; GFX1032-LABEL: add_i32_varying_nouse:
957; GFX1032:       ; %bb.0: ; %entry
958; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
959; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
960; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
961; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
962; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
963; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
964; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
965; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
966; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
967; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
968; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
969; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
970; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
971; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
972; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
973; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
974; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
975; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
976; GFX1032-NEXT:  ; %bb.1:
977; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
978; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
979; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
980; GFX1032-NEXT:    ds_add_u32 v3, v0
981; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX1032-NEXT:    buffer_gl0_inv
983; GFX1032-NEXT:  .LBB3_2:
984; GFX1032-NEXT:    s_endpgm
985;
986; GFX1164-LABEL: add_i32_varying_nouse:
987; GFX1164:       ; %bb.0: ; %entry
988; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
989; GFX1164-NEXT:    s_not_b64 exec, exec
990; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
991; GFX1164-NEXT:    s_not_b64 exec, exec
992; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
993; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
994; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
995; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
996; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
997; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
998; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
999; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1000; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
1001; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1002; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1003; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1004; GFX1164-NEXT:    v_permlane64_b32 v2, v1
1005; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1006; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1007; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1008; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
1009; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1010; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
1011; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1012; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
1013; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1014; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
1015; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
1016; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
1017; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
1018; GFX1164-NEXT:  ; %bb.1:
1019; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1020; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1021; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1022; GFX1164-NEXT:    ds_add_u32 v3, v0
1023; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX1164-NEXT:    buffer_gl0_inv
1025; GFX1164-NEXT:  .LBB3_2:
1026; GFX1164-NEXT:    s_endpgm
1027;
1028; GFX1132-LABEL: add_i32_varying_nouse:
1029; GFX1132:       ; %bb.0: ; %entry
1030; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
1031; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1032; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1033; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1034; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1035; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1036; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1037; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1038; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1039; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1040; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1041; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1042; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1043; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1044; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1045; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1046; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1047; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1048; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1049; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1050; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1051; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1052; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1053; GFX1132-NEXT:  ; %bb.1:
1054; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1055; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1056; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1057; GFX1132-NEXT:    ds_add_u32 v3, v0
1058; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX1132-NEXT:    buffer_gl0_inv
1060; GFX1132-NEXT:  .LBB3_2:
1061; GFX1132-NEXT:    s_endpgm
1062entry:
1063  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; %old has no users in this function; nothing is stored before the return.
1064  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1065  ret void
1066}
1067
; atomicrmw add of the constant 5 to the 64-bit LDS global @local_var64; the
; old value is stored to %out. In every check block below only the lanes whose
; v_mbcnt result is 0 reach the single ds_add_rtn_u64, whose data operand has
; its low dword set to s_bcnt1 of the saved exec mask times 5 and its high
; dword set to 0. After the branch the returned pair is read back with
; v_readfirstlane and each lane adds 5 times its own v_mbcnt result
; (v_mul_*_u24 plus add/addc in the GFX7LESS checks, v_mad_u64_u32 in the
; others). The wave32 check blocks use only v_mbcnt_lo and s_bcnt1_i32_b32.
1068define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1069;
1070;
1071; GFX7LESS-LABEL: add_i64_constant:
1072; GFX7LESS:       ; %bb.0: ; %entry
1073; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1074; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1075; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1076; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1077; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1078; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1079; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1080; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1081; GFX7LESS-NEXT:  ; %bb.1:
1082; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1083; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1084; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1085; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1086; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1087; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1089; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX7LESS-NEXT:  .LBB4_2:
1091; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1092; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1094; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1095; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1096; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1097; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1098; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1099; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1100; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1101; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1102; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1103; GFX7LESS-NEXT:    s_endpgm
1104;
1105; GFX8-LABEL: add_i64_constant:
1106; GFX8:       ; %bb.0: ; %entry
1107; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1108; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1109; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1110; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1111; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1112; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1113; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1114; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1115; GFX8-NEXT:  ; %bb.1:
1116; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1117; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1118; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1119; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1120; GFX8-NEXT:    s_mov_b32 m0, -1
1121; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1123; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX8-NEXT:  .LBB4_2:
1125; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1126; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1127; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1128; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1129; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1130; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1131; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1132; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1133; GFX8-NEXT:    s_mov_b32 s2, -1
1134; GFX8-NEXT:    s_nop 2
1135; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1136; GFX8-NEXT:    s_endpgm
1137;
1138; GFX9-LABEL: add_i64_constant:
1139; GFX9:       ; %bb.0: ; %entry
1140; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1141; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1142; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1143; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1144; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1145; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1146; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1147; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1148; GFX9-NEXT:  ; %bb.1:
1149; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1150; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1151; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1152; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1155; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1156; GFX9-NEXT:  .LBB4_2:
1157; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1158; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1159; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1160; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1161; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1162; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1163; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1164; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1165; GFX9-NEXT:    s_mov_b32 s2, -1
1166; GFX9-NEXT:    s_nop 2
1167; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1168; GFX9-NEXT:    s_endpgm
1169;
1170; GFX1064-LABEL: add_i64_constant:
1171; GFX1064:       ; %bb.0: ; %entry
1172; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1173; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1174; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1175; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1176; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1177; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1178; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1179; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1180; GFX1064-NEXT:  ; %bb.1:
1181; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1182; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1183; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1184; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1185; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1186; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1187; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1188; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX1064-NEXT:    buffer_gl0_inv
1190; GFX1064-NEXT:  .LBB4_2:
1191; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1192; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1193; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1194; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1195; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1196; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1197; GFX1064-NEXT:    s_mov_b32 s2, -1
1198; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1200; GFX1064-NEXT:    s_endpgm
1201;
1202; GFX1032-LABEL: add_i64_constant:
1203; GFX1032:       ; %bb.0: ; %entry
1204; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1205; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1206; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1207; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1208; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1209; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1210; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1211; GFX1032-NEXT:  ; %bb.1:
1212; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1213; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1214; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1215; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1216; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1217; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1218; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1219; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1220; GFX1032-NEXT:    buffer_gl0_inv
1221; GFX1032-NEXT:  .LBB4_2:
1222; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1223; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1224; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1225; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1226; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1227; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1228; GFX1032-NEXT:    s_mov_b32 s2, -1
1229; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1230; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1231; GFX1032-NEXT:    s_endpgm
1232;
1233; GFX1164-LABEL: add_i64_constant:
1234; GFX1164:       ; %bb.0: ; %entry
1235; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1236; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1237; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1238; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1239; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1240; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1241; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1242; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1243; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1244; GFX1164-NEXT:  ; %bb.1:
1245; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1246; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1247; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1248; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1249; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1250; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1251; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1252; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1253; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1254; GFX1164-NEXT:    buffer_gl0_inv
1255; GFX1164-NEXT:  .LBB4_2:
1256; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1257; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1258; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1259; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1260; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1261; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1262; GFX1164-NEXT:    s_mov_b32 s2, -1
1263; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1264; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1265; GFX1164-NEXT:    s_endpgm
1266;
1267; GFX1132-LABEL: add_i64_constant:
1268; GFX1132:       ; %bb.0: ; %entry
1269; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1270; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1271; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1272; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1273; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1274; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1275; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1276; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1277; GFX1132-NEXT:  ; %bb.1:
1278; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1279; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1280; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1281; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1282; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1283; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1284; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1285; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1286; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX1132-NEXT:    buffer_gl0_inv
1288; GFX1132-NEXT:  .LBB4_2:
1289; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1290; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1291; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1292; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1293; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1294; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1295; GFX1132-NEXT:    s_mov_b32 s2, -1
1296; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1298; GFX1132-NEXT:    s_endpgm
1299entry:
1300  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1301  store i64 %old, i64 addrspace(1)* %out
1302  ret void
1303}
1304
1305define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1306;
1307;
1308; GFX7LESS-LABEL: add_i64_uniform:
1309; GFX7LESS:       ; %bb.0: ; %entry
1310; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1311; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1312; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1313; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1314; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1315; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1316; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1317; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1318; GFX7LESS-NEXT:  ; %bb.1:
1319; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1320; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1321; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1323; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1324; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1325; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1326; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1327; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1328; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1329; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1331; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1332; GFX7LESS-NEXT:  .LBB5_2:
1333; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1334; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1335; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1336; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1337; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1338; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1339; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1340; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1341; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1342; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1343; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1344; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1345; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1346; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1347; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1348; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1349; GFX7LESS-NEXT:    s_endpgm
1350;
1351; GFX8-LABEL: add_i64_uniform:
1352; GFX8:       ; %bb.0: ; %entry
1353; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1354; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1355; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1356; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1357; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1358; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1359; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1360; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1361; GFX8-NEXT:  ; %bb.1:
1362; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1363; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1364; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1366; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1367; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1368; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1369; GFX8-NEXT:    s_mov_b32 m0, -1
1370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1371; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1372; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX8-NEXT:  .LBB5_2:
1374; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1375; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1377; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1378; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1379; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1380; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1381; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1382; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1383; GFX8-NEXT:    s_mov_b32 s6, -1
1384; GFX8-NEXT:    s_mov_b32 s4, s0
1385; GFX8-NEXT:    s_mov_b32 s5, s1
1386; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1387; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1388; GFX8-NEXT:    s_endpgm
1389;
1390; GFX9-LABEL: add_i64_uniform:
1391; GFX9:       ; %bb.0: ; %entry
1392; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1393; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1394; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1395; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1396; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1397; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1398; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1399; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1400; GFX9-NEXT:  ; %bb.1:
1401; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1404; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1405; GFX9-NEXT:    s_add_i32 s8, s8, s7
1406; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1407; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1408; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1409; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1412; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1413; GFX9-NEXT:  .LBB5_2:
1414; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1417; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1418; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1419; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1420; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1421; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1422; GFX9-NEXT:    s_mov_b32 s6, -1
1423; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1424; GFX9-NEXT:    s_mov_b32 s4, s0
1425; GFX9-NEXT:    s_mov_b32 s5, s1
1426; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1427; GFX9-NEXT:    s_endpgm
1428;
1429; GFX1064-LABEL: add_i64_uniform:
1430; GFX1064:       ; %bb.0: ; %entry
1431; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1432; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1433; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1434; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1435; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1436; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1437; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1438; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1439; GFX1064-NEXT:  ; %bb.1:
1440; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1441; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1442; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1444; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1445; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1446; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1447; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1448; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1449; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1450; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1451; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1452; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX1064-NEXT:    buffer_gl0_inv
1454; GFX1064-NEXT:  .LBB5_2:
1455; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1456; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1457; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1458; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1459; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1460; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1461; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1462; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1463; GFX1064-NEXT:    s_mov_b32 s2, -1
1464; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1465; GFX1064-NEXT:    s_endpgm
1466;
1467; GFX1032-LABEL: add_i64_uniform:
1468; GFX1032:       ; %bb.0: ; %entry
1469; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1470; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1471; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1472; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1473; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1474; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1475; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1476; GFX1032-NEXT:  ; %bb.1:
1477; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1478; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1479; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1481; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1482; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1483; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1484; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1485; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1486; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1487; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1488; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1489; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX1032-NEXT:    buffer_gl0_inv
1491; GFX1032-NEXT:  .LBB5_2:
1492; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1493; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1494; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1495; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1496; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1498; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1499; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1500; GFX1032-NEXT:    s_mov_b32 s2, -1
1501; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1502; GFX1032-NEXT:    s_endpgm
1503;
1504; GFX1164-LABEL: add_i64_uniform:
1505; GFX1164:       ; %bb.0: ; %entry
1506; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1507; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1508; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1509; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1510; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1511; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1512; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1513; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1514; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1515; GFX1164-NEXT:  ; %bb.1:
1516; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1517; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1518; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1520; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1521; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1522; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1523; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1524; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1525; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1526; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1527; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1528; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1529; GFX1164-NEXT:    buffer_gl0_inv
1530; GFX1164-NEXT:  .LBB5_2:
1531; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1532; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1533; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1534; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1535; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1536; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1537; GFX1164-NEXT:    s_mov_b32 s2, -1
1538; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1539; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1540; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1541; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1542; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1543; GFX1164-NEXT:    s_endpgm
1544;
1545; GFX1132-LABEL: add_i64_uniform:
1546; GFX1132:       ; %bb.0: ; %entry
1547; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1548; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1549; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1550; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1551; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1552; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1553; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1554; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1555; GFX1132-NEXT:  ; %bb.1:
1556; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1557; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1558; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1559; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1560; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1561; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1562; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1563; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
1564; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
1565; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1566; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1567; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1568; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX1132-NEXT:    buffer_gl0_inv
1570; GFX1132-NEXT:  .LBB5_2:
1571; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1572; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1573; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1574; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1576; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1577; GFX1132-NEXT:    s_mov_b32 s2, -1
1578; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1579; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1580; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1581; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1582; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1583; GFX1132-NEXT:    s_endpgm
1584entry:
1585  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1586  store i64 %old, i64 addrspace(1)* %out
1587  ret void
1588}
1589
; Kernel: 64-bit atomic add on @local_var64 (addrspace(3)) whose addend is
; derived from the workitem id, using acq_rel ordering.
;
; %out is an addrspace(1) i64 pointer; it receives the value returned by the
; atomicrmw instruction (%old).
;
; For every target in the RUN lines, the autogenerated assertions below expect
; a single ds_add_rtn_u64 followed by the buffer store, with no exec-mask
; save, v_mbcnt or v_readfirstlane sequence around the DS instruction.
; NOTE(review): presumably this pins that a 64-bit addend which differs per
; lane is left as-is under -amdgpu-atomic-optimizations; confirm against the
; atomic optimizer pass before relying on that reading.
1590define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1591;
1592;
1593; GFX7LESS-LABEL: add_i64_varying:
1594; GFX7LESS:       ; %bb.0: ; %entry
1595; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1596; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1597; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1598; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1600; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1602; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1603; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1604; GFX7LESS-NEXT:    s_endpgm
1605;
1606; GFX8-LABEL: add_i64_varying:
1607; GFX8:       ; %bb.0: ; %entry
1608; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1609; GFX8-NEXT:    s_mov_b32 m0, -1
1610; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1611; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1612; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1613; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1615; GFX8-NEXT:    s_mov_b32 s2, -1
1616; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1617; GFX8-NEXT:    s_endpgm
1618;
1619; GFX9-LABEL: add_i64_varying:
1620; GFX9:       ; %bb.0: ; %entry
1621; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1622; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1623; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1625; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1627; GFX9-NEXT:    s_mov_b32 s2, -1
1628; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1629; GFX9-NEXT:    s_endpgm
1630;
1631; GFX10-LABEL: add_i64_varying:
1632; GFX10:       ; %bb.0: ; %entry
1633; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1634; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1635; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1636; GFX10-NEXT:    s_mov_b32 s2, -1
1637; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1638; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1639; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1640; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1641; GFX10-NEXT:    buffer_gl0_inv
1642; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1643; GFX10-NEXT:    s_endpgm
1644;
1645; GFX11-LABEL: add_i64_varying:
1646; GFX11:       ; %bb.0: ; %entry
1647; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1648; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1649; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1650; GFX11-NEXT:    s_mov_b32 s2, -1
1651; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1652; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1653; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1654; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX11-NEXT:    buffer_gl0_inv
1656; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1657; GFX11-NEXT:    s_endpgm
1658entry:
  ; Addend: the workitem id in x, zero-extended from i32 to i64.
1659  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1660  %zext = zext i32 %lane to i64
  ; acq_rel read-modify-write on @local_var64; %old is the value the
  ; atomicrmw returns, which is then written through %out.
1661  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1662  store i64 %old, i64 addrspace(1)* %out
1663  ret void
1664}
1665
; Kernel: 32-bit atomic subtract of the constant 5 from @local_var32
; (addrspace(3)) with acq_rel ordering. The value returned by the atomicrmw
; (%old) is stored through %out, an addrspace(1) i32 pointer.
;
; Shape of the expected code in the autogenerated assertions below:
;   1. exec is copied into SGPRs and fed to v_mbcnt_lo (plus v_mbcnt_hi on the
;      targets that copy exec with s_mov_b64). The result is compared with 0,
;      and that compare guards block %bb.1 via s_and_saveexec or v_cmpx.
;   2. %bb.1 takes s_bcnt1 of the copied exec mask, multiplies it by 5 and
;      issues one ds_sub_rtn_u32 with that product as the operand.
;   3. After exec is restored, the stored value is v_readfirstlane of the DS
;      result minus 5 * (the mbcnt value from step 1).
; NOTE(review): presumably this is the atomic optimizer letting a single lane
; perform the combined subtract for all active lanes, with each lane then
; rebuilding its own return value; confirm against the pass.
1666define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1667;
1668;
1669; GFX7LESS-LABEL: sub_i32_constant:
1670; GFX7LESS:       ; %bb.0: ; %entry
1671; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1672; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1673; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1674; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1675; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1676; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1677; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1678; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1679; GFX7LESS-NEXT:  ; %bb.1:
1680; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1681; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1682; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1683; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1684; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1685; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1686; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1687; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1688; GFX7LESS-NEXT:  .LBB7_2:
1689; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1690; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1691; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1692; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1693; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1694; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1695; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1696; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1697; GFX7LESS-NEXT:    s_endpgm
1698;
1699; GFX8-LABEL: sub_i32_constant:
1700; GFX8:       ; %bb.0: ; %entry
1701; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1702; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1703; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1704; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1705; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1706; GFX8-NEXT:    ; implicit-def: $vgpr1
1707; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1708; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1709; GFX8-NEXT:  ; %bb.1:
1710; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1711; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1712; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1713; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1714; GFX8-NEXT:    s_mov_b32 m0, -1
1715; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1716; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1717; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1718; GFX8-NEXT:  .LBB7_2:
1719; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1720; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1721; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1722; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1723; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1724; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1725; GFX8-NEXT:    s_mov_b32 s2, -1
1726; GFX8-NEXT:    s_nop 0
1727; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1728; GFX8-NEXT:    s_endpgm
1729;
1730; GFX9-LABEL: sub_i32_constant:
1731; GFX9:       ; %bb.0: ; %entry
1732; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1733; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1734; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1735; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1736; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1737; GFX9-NEXT:    ; implicit-def: $vgpr1
1738; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1739; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1740; GFX9-NEXT:  ; %bb.1:
1741; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1742; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1743; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1744; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1745; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1746; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1747; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1748; GFX9-NEXT:  .LBB7_2:
1749; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1750; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1751; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1752; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1753; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1754; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1755; GFX9-NEXT:    s_mov_b32 s2, -1
1756; GFX9-NEXT:    s_nop 0
1757; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1758; GFX9-NEXT:    s_endpgm
1759;
1760; GFX1064-LABEL: sub_i32_constant:
1761; GFX1064:       ; %bb.0: ; %entry
1762; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1763; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1764; GFX1064-NEXT:    ; implicit-def: $vgpr1
1765; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1766; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1767; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1768; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1769; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1770; GFX1064-NEXT:  ; %bb.1:
1771; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1772; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1773; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1774; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1775; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1776; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1777; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1778; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1779; GFX1064-NEXT:    buffer_gl0_inv
1780; GFX1064-NEXT:  .LBB7_2:
1781; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1782; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1783; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1784; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1785; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1786; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1787; GFX1064-NEXT:    s_mov_b32 s2, -1
1788; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1790; GFX1064-NEXT:    s_endpgm
1791;
1792; GFX1032-LABEL: sub_i32_constant:
1793; GFX1032:       ; %bb.0: ; %entry
1794; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1795; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1796; GFX1032-NEXT:    ; implicit-def: $vgpr1
1797; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1798; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1799; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1800; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1801; GFX1032-NEXT:  ; %bb.1:
1802; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1803; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1804; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1805; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1806; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1807; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1808; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1809; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX1032-NEXT:    buffer_gl0_inv
1811; GFX1032-NEXT:  .LBB7_2:
1812; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1813; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1814; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1815; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1816; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1817; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1818; GFX1032-NEXT:    s_mov_b32 s2, -1
1819; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1821; GFX1032-NEXT:    s_endpgm
1822;
1823; GFX1164-LABEL: sub_i32_constant:
1824; GFX1164:       ; %bb.0: ; %entry
1825; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1826; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1827; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1828; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1829; GFX1164-NEXT:    ; implicit-def: $vgpr1
1830; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1831; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1832; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1833; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1834; GFX1164-NEXT:  ; %bb.1:
1835; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1836; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1837; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1838; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1839; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1840; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1841; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1842; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1843; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX1164-NEXT:    buffer_gl0_inv
1845; GFX1164-NEXT:  .LBB7_2:
1846; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1847; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1848; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1849; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1850; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1851; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1852; GFX1164-NEXT:    s_mov_b32 s2, -1
1853; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1854; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1855; GFX1164-NEXT:    s_endpgm
1856;
1857; GFX1132-LABEL: sub_i32_constant:
1858; GFX1132:       ; %bb.0: ; %entry
1859; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1860; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1861; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1862; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1863; GFX1132-NEXT:    ; implicit-def: $vgpr1
1864; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1865; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1866; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1867; GFX1132-NEXT:  ; %bb.1:
1868; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1869; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1870; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1871; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1872; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
1873; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1874; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1875; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1876; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX1132-NEXT:    buffer_gl0_inv
1878; GFX1132-NEXT:  .LBB7_2:
1879; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1880; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1881; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1882; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1883; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1884; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1885; GFX1132-NEXT:    s_mov_b32 s2, -1
1886; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1888; GFX1132-NEXT:    s_endpgm
1889entry:
  ; The operand is the immediate 5; %old is the value the atomicrmw returns,
  ; which is then written through %out.
1890  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1891  store i32 %old, i32 addrspace(1)* %out
1892  ret void
1893}
1894
1895define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1896;
1897;
1898; GFX7LESS-LABEL: sub_i32_uniform:
1899; GFX7LESS:       ; %bb.0: ; %entry
1900; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1901; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1902; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1903; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1904; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1905; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1906; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1907; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1908; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1909; GFX7LESS-NEXT:  ; %bb.1:
1910; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1911; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1912; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1913; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1914; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1915; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1916; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1917; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1918; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX7LESS-NEXT:  .LBB8_2:
1920; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1921; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1923; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1924; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1925; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1926; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1927; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1928; GFX7LESS-NEXT:    s_endpgm
1929;
1930; GFX8-LABEL: sub_i32_uniform:
1931; GFX8:       ; %bb.0: ; %entry
1932; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1933; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1934; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1935; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1936; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1937; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1938; GFX8-NEXT:    ; implicit-def: $vgpr1
1939; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1940; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1941; GFX8-NEXT:  ; %bb.1:
1942; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1943; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1944; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1945; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1946; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1947; GFX8-NEXT:    s_mov_b32 m0, -1
1948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1950; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1951; GFX8-NEXT:  .LBB8_2:
1952; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1955; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1956; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1957; GFX8-NEXT:    s_mov_b32 s6, -1
1958; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1959; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1960; GFX8-NEXT:    s_endpgm
1961;
1962; GFX9-LABEL: sub_i32_uniform:
1963; GFX9:       ; %bb.0: ; %entry
1964; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1965; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1966; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1967; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1968; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1969; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1970; GFX9-NEXT:    ; implicit-def: $vgpr1
1971; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1972; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1973; GFX9-NEXT:  ; %bb.1:
1974; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1976; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1977; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1978; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX9-NEXT:  .LBB8_2:
1983; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1984; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1986; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1987; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1988; GFX9-NEXT:    s_mov_b32 s6, -1
1989; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1990; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1991; GFX9-NEXT:    s_endpgm
1992;
1993; GFX1064-LABEL: sub_i32_uniform:
1994; GFX1064:       ; %bb.0: ; %entry
1995; GFX1064-NEXT:    s_clause 0x1
1996; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1997; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
1998; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1999; GFX1064-NEXT:    ; implicit-def: $vgpr1
2000; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2001; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2002; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2003; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2004; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
2005; GFX1064-NEXT:  ; %bb.1:
2006; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2007; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2008; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2009; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
2010; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
2011; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2012; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2013; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2014; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX1064-NEXT:    buffer_gl0_inv
2016; GFX1064-NEXT:  .LBB8_2:
2017; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2018; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
2019; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
2021; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
2022; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2023; GFX1064-NEXT:    s_mov_b32 s6, -1
2024; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2025; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2026; GFX1064-NEXT:    s_endpgm
2027;
2028; GFX1032-LABEL: sub_i32_uniform:
2029; GFX1032:       ; %bb.0: ; %entry
2030; GFX1032-NEXT:    s_clause 0x1
2031; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2032; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
2033; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2034; GFX1032-NEXT:    ; implicit-def: $vgpr1
2035; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
2036; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2037; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2038; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
2039; GFX1032-NEXT:  ; %bb.1:
2040; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
2041; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2042; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2043; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
2044; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
2045; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2046; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2047; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2048; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2049; GFX1032-NEXT:    buffer_gl0_inv
2050; GFX1032-NEXT:  .LBB8_2:
2051; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2052; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2053; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2054; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2055; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
2056; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2057; GFX1032-NEXT:    s_mov_b32 s6, -1
2058; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2059; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2060; GFX1032-NEXT:    s_endpgm
2061;
2062; GFX1164-LABEL: sub_i32_uniform:
2063; GFX1164:       ; %bb.0: ; %entry
2064; GFX1164-NEXT:    s_clause 0x1
2065; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2066; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2067; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2068; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2069; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2070; GFX1164-NEXT:    ; implicit-def: $vgpr1
2071; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2072; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2073; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2074; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2075; GFX1164-NEXT:  ; %bb.1:
2076; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2077; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2078; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2080; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2081; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2082; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2083; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2084; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2085; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2086; GFX1164-NEXT:    buffer_gl0_inv
2087; GFX1164-NEXT:  .LBB8_2:
2088; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2089; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2090; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2091; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2092; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2093; GFX1164-NEXT:    s_mov_b32 s6, -1
2094; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2095; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2096; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2097; GFX1164-NEXT:    s_endpgm
2098;
2099; GFX1132-LABEL: sub_i32_uniform:
2100; GFX1132:       ; %bb.0: ; %entry
2101; GFX1132-NEXT:    s_clause 0x1
2102; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2103; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2104; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2105; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2106; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2107; GFX1132-NEXT:    ; implicit-def: $vgpr1
2108; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2109; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2110; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2111; GFX1132-NEXT:  ; %bb.1:
2112; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2113; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2114; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2115; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2116; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2117; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
2118; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2119; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2120; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2121; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2122; GFX1132-NEXT:    buffer_gl0_inv
2123; GFX1132-NEXT:  .LBB8_2:
2124; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2125; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2126; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2127; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2128; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2129; GFX1132-NEXT:    s_mov_b32 s6, -1
2130; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2131; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2132; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2133; GFX1132-NEXT:    s_endpgm
2134entry:
2135  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
2136  store i32 %old, i32 addrspace(1)* %out
2137  ret void
2138}
2139
; sub_i32_varying: each work-item subtracts its own workitem.id.x from the
; addrspace(3) global @local_var32 with an acq_rel atomicrmw and stores the
; returned old value to %out.
;
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected output shows: the GFX7LESS run issues ds_sub_rtn_u32
; directly with v0 as the data operand. Every other run first combines the
; per-lane values with a chain of DPP adds, executes a single ds_sub_rtn_u32
; only in lanes where the mbcnt result is 0, and then subtracts a per-lane
; value from the v_readfirstlane'd result before the buffer store.
2140define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2141;
2142;
2143; GFX7LESS-LABEL: sub_i32_varying:
2144; GFX7LESS:       ; %bb.0: ; %entry
2145; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2146; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2147; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2148; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2149; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2150; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2151; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2152; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2153; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2154; GFX7LESS-NEXT:    s_endpgm
2155;
2156; GFX8-LABEL: sub_i32_varying:
2157; GFX8:       ; %bb.0: ; %entry
2158; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2159; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2160; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2161; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2162; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2163; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2164; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2165; GFX8-NEXT:    s_not_b64 exec, exec
2166; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2167; GFX8-NEXT:    s_not_b64 exec, exec
2168; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2169; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2170; GFX8-NEXT:    s_nop 1
2171; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2172; GFX8-NEXT:    s_nop 1
2173; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2174; GFX8-NEXT:    s_nop 1
2175; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2176; GFX8-NEXT:    s_nop 1
2177; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2178; GFX8-NEXT:    s_nop 1
2179; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2180; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2181; GFX8-NEXT:    s_nop 0
2182; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2183; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2184; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2185; GFX8-NEXT:    ; implicit-def: $vgpr0
2186; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2187; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2188; GFX8-NEXT:  ; %bb.1:
2189; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2190; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2191; GFX8-NEXT:    s_mov_b32 m0, -1
2192; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2193; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2194; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2195; GFX8-NEXT:  .LBB9_2:
2196; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2197; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2199; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2200; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2201; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2202; GFX8-NEXT:    s_mov_b32 s2, -1
2203; GFX8-NEXT:    s_nop 0
2204; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2205; GFX8-NEXT:    s_endpgm
2206;
2207; GFX9-LABEL: sub_i32_varying:
2208; GFX9:       ; %bb.0: ; %entry
2209; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2210; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2211; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2212; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2213; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2214; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2215; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2216; GFX9-NEXT:    s_not_b64 exec, exec
2217; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2218; GFX9-NEXT:    s_not_b64 exec, exec
2219; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2220; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2221; GFX9-NEXT:    s_nop 1
2222; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2223; GFX9-NEXT:    s_nop 1
2224; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2225; GFX9-NEXT:    s_nop 1
2226; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2227; GFX9-NEXT:    s_nop 1
2228; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2229; GFX9-NEXT:    s_nop 1
2230; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2231; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2232; GFX9-NEXT:    s_nop 0
2233; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2234; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2235; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2236; GFX9-NEXT:    ; implicit-def: $vgpr0
2237; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2238; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2239; GFX9-NEXT:  ; %bb.1:
2240; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2241; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2242; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2244; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2245; GFX9-NEXT:  .LBB9_2:
2246; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2247; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2248; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2249; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2250; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2251; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2252; GFX9-NEXT:    s_mov_b32 s2, -1
2253; GFX9-NEXT:    s_nop 0
2254; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2255; GFX9-NEXT:    s_endpgm
2256;
2257; GFX1064-LABEL: sub_i32_varying:
2258; GFX1064:       ; %bb.0: ; %entry
2259; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2260; GFX1064-NEXT:    s_not_b64 exec, exec
2261; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2262; GFX1064-NEXT:    s_not_b64 exec, exec
2263; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2264; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2265; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2266; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2267; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2268; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2269; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2270; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2271; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2272; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2273; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2274; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2275; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2276; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2277; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2278; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2279; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2280; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2281; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2282; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2283; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2284; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2285; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2286; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2287; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2288; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2289; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2290; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2291; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2292; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2293; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2294; GFX1064-NEXT:    s_mov_b32 s2, -1
2295; GFX1064-NEXT:    ; implicit-def: $vgpr0
2296; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2297; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2298; GFX1064-NEXT:  ; %bb.1:
2299; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2300; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2301; GFX1064-NEXT:    s_mov_b32 s3, s7
2302; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2303; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2304; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2305; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2306; GFX1064-NEXT:    buffer_gl0_inv
2307; GFX1064-NEXT:  .LBB9_2:
2308; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2309; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2310; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2311; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2312; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2313; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2314; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2315; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2316; GFX1064-NEXT:    s_endpgm
2317;
2318; GFX1032-LABEL: sub_i32_varying:
2319; GFX1032:       ; %bb.0: ; %entry
2320; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2321; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2322; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2323; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2324; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2325; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2326; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2327; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2328; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2329; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2330; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2331; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2332; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2333; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2334; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2335; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2336; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2337; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2338; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2339; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2340; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2341; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2342; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2343; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2344; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2345; GFX1032-NEXT:    s_mov_b32 s2, -1
2346; GFX1032-NEXT:    ; implicit-def: $vgpr0
2347; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2348; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2349; GFX1032-NEXT:  ; %bb.1:
2350; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2351; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2352; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2353; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2354; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2355; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2356; GFX1032-NEXT:    buffer_gl0_inv
2357; GFX1032-NEXT:  .LBB9_2:
2358; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2359; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2360; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2361; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2362; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2363; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2364; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2365; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2366; GFX1032-NEXT:    s_endpgm
2367;
2368; GFX1164-LABEL: sub_i32_varying:
2369; GFX1164:       ; %bb.0: ; %entry
2370; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2371; GFX1164-NEXT:    s_not_b64 exec, exec
2372; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2373; GFX1164-NEXT:    s_not_b64 exec, exec
2374; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2375; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2376; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2377; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2378; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2379; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2380; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2381; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2382; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2383; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2384; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2385; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2386; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2387; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2388; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2389; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2390; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2391; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2392; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2393; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2394; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2395; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2396; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2397; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2398; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2399; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2400; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2401; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2402; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2403; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2404; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2405; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2406; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2407; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2408; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2409; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2410; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2411; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2412; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2413; GFX1164-NEXT:    s_mov_b32 s2, -1
2414; GFX1164-NEXT:    ; implicit-def: $vgpr0
2415; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2416; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2417; GFX1164-NEXT:  ; %bb.1:
2418; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2419; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2420; GFX1164-NEXT:    s_mov_b32 s3, s7
2421; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2422; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2423; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2424; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2425; GFX1164-NEXT:    buffer_gl0_inv
2426; GFX1164-NEXT:  .LBB9_2:
2427; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2428; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2429; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2430; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2431; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2432; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2433; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2434; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2435; GFX1164-NEXT:    s_endpgm
2436;
2437; GFX1132-LABEL: sub_i32_varying:
2438; GFX1132:       ; %bb.0: ; %entry
2439; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2440; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2441; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2442; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2443; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2444; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2445; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2446; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2447; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2448; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2449; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2450; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2451; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2452; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2453; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2454; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2455; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2456; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2457; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2458; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2459; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2460; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2461; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2462; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2463; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2464; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2465; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2466; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2467; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2468; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2469; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2470; GFX1132-NEXT:    s_mov_b32 s2, -1
2471; GFX1132-NEXT:    ; implicit-def: $vgpr0
2472; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2473; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2474; GFX1132-NEXT:  ; %bb.1:
2475; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2476; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2477; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2478; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2479; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2480; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX1132-NEXT:    buffer_gl0_inv
2482; GFX1132-NEXT:  .LBB9_2:
2483; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2484; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2485; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2486; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2487; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2488; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2489; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2490; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2491; GFX1132-NEXT:    s_endpgm
2492entry:
2493  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2494  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2495  store i32 %old, i32 addrspace(1)* %out
2496  ret void
2497}
2498
; sub_i32_varying_nouse: performs the same acq_rel atomicrmw sub of
; workitem.id.x from the addrspace(3) global @local_var32 as sub_i32_varying,
; but the kernel takes no arguments and the returned value %old is never used.
;
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected output shows: every run uses the non-returning ds_sub_u32.
; The GFX7LESS run issues it directly with v0 as the data operand. The other
; runs first combine the per-lane values with DPP adds (plus lane permutes or
; lane reads on some targets), issue the ds_sub_u32 only in lanes where the
; mbcnt result is 0, and then end the program without computing any per-lane
; result.
2499define amdgpu_kernel void @sub_i32_varying_nouse() {
2500; GFX7LESS-LABEL: sub_i32_varying_nouse:
2501; GFX7LESS:       ; %bb.0: ; %entry
2502; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2503; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2504; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
2506; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2507; GFX7LESS-NEXT:    s_endpgm
2508;
2509; GFX8-LABEL: sub_i32_varying_nouse:
2510; GFX8:       ; %bb.0: ; %entry
2511; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2512; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2513; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2514; GFX8-NEXT:    s_not_b64 exec, exec
2515; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2516; GFX8-NEXT:    s_not_b64 exec, exec
2517; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
2518; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2519; GFX8-NEXT:    s_nop 1
2520; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2521; GFX8-NEXT:    s_nop 1
2522; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2523; GFX8-NEXT:    s_nop 1
2524; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2525; GFX8-NEXT:    s_nop 1
2526; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2527; GFX8-NEXT:    s_nop 1
2528; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2529; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
2530; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
2531; GFX8-NEXT:    s_mov_b32 s0, s2
2532; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2533; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2534; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2535; GFX8-NEXT:  ; %bb.1:
2536; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2537; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2538; GFX8-NEXT:    s_mov_b32 m0, -1
2539; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2540; GFX8-NEXT:    ds_sub_u32 v0, v2
2541; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX8-NEXT:  .LBB10_2:
2543; GFX8-NEXT:    s_endpgm
2544;
2545; GFX9-LABEL: sub_i32_varying_nouse:
2546; GFX9:       ; %bb.0: ; %entry
2547; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2548; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2549; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2550; GFX9-NEXT:    s_not_b64 exec, exec
2551; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2552; GFX9-NEXT:    s_not_b64 exec, exec
2553; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
2554; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2555; GFX9-NEXT:    s_nop 1
2556; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2557; GFX9-NEXT:    s_nop 1
2558; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2559; GFX9-NEXT:    s_nop 1
2560; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2561; GFX9-NEXT:    s_nop 1
2562; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2563; GFX9-NEXT:    s_nop 1
2564; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2565; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
2566; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
2567; GFX9-NEXT:    s_mov_b32 s0, s2
2568; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2569; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2570; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2571; GFX9-NEXT:  ; %bb.1:
2572; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2573; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2574; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2575; GFX9-NEXT:    ds_sub_u32 v0, v2
2576; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2577; GFX9-NEXT:  .LBB10_2:
2578; GFX9-NEXT:    s_endpgm
2579;
2580; GFX1064-LABEL: sub_i32_varying_nouse:
2581; GFX1064:       ; %bb.0: ; %entry
2582; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2583; GFX1064-NEXT:    s_not_b64 exec, exec
2584; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2585; GFX1064-NEXT:    s_not_b64 exec, exec
2586; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2587; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2588; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2589; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2590; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2591; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2592; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2593; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2594; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2595; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2596; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2597; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
2598; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
2599; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2600; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2601; GFX1064-NEXT:    s_add_i32 s0, s2, s3
2602; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2603; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2604; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2605; GFX1064-NEXT:  ; %bb.1:
2606; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2607; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
2608; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2609; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2610; GFX1064-NEXT:    ds_sub_u32 v0, v3
2611; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2612; GFX1064-NEXT:    buffer_gl0_inv
2613; GFX1064-NEXT:  .LBB10_2:
2614; GFX1064-NEXT:    s_endpgm
2615;
2616; GFX1032-LABEL: sub_i32_varying_nouse:
2617; GFX1032:       ; %bb.0: ; %entry
2618; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2619; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2620; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2621; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2622; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
2623; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2624; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2625; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2626; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2627; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2628; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2629; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2630; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
2631; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2632; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2633; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2634; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2635; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2636; GFX1032-NEXT:  ; %bb.1:
2637; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2638; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2639; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2640; GFX1032-NEXT:    ds_sub_u32 v3, v0
2641; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2642; GFX1032-NEXT:    buffer_gl0_inv
2643; GFX1032-NEXT:  .LBB10_2:
2644; GFX1032-NEXT:    s_endpgm
2645;
2646; GFX1164-LABEL: sub_i32_varying_nouse:
2647; GFX1164:       ; %bb.0: ; %entry
2648; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2649; GFX1164-NEXT:    s_not_b64 exec, exec
2650; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2651; GFX1164-NEXT:    s_not_b64 exec, exec
2652; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2653; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2654; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2655; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2656; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2657; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2658; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2659; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2660; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2661; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2662; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2663; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2664; GFX1164-NEXT:    v_permlane64_b32 v2, v1
2665; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2666; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2667; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2668; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2669; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2670; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2671; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
2672; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
2673; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2674; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
2675; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2676; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
2677; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2678; GFX1164-NEXT:  ; %bb.1:
2679; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2680; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2681; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2682; GFX1164-NEXT:    ds_sub_u32 v3, v0
2683; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2684; GFX1164-NEXT:    buffer_gl0_inv
2685; GFX1164-NEXT:  .LBB10_2:
2686; GFX1164-NEXT:    s_endpgm
2687;
2688; GFX1132-LABEL: sub_i32_varying_nouse:
2689; GFX1132:       ; %bb.0: ; %entry
2690; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2691; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2692; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2693; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2694; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
2695; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2696; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2697; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2698; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2699; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2700; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2701; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2702; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2703; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2704; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2705; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2706; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
2707; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2708; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2709; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
2710; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
2711; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
2712; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
2713; GFX1132-NEXT:  ; %bb.1:
2714; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2715; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2716; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2717; GFX1132-NEXT:    ds_sub_u32 v3, v0
2718; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2719; GFX1132-NEXT:    buffer_gl0_inv
2720; GFX1132-NEXT:  .LBB10_2:
2721; GFX1132-NEXT:    s_endpgm
2722entry:
2723  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2724  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2725  ret void
2726}
2727
; sub_i64_constant - kernel in which the IR performs an acq_rel `atomicrmw sub`
; of the constant 5 on the i64 addrspace(3) global @local_var64 and stores the
; returned (pre-subtraction) value to %out.
;
; What the assertions below show for every check prefix of this function:
;   * the ds_sub_rtn_u64 sits in block %bb.1, which is entered only where the
;     v_mbcnt result compares equal to 0;
;   * its data operand is s_bcnt1 of the saved exec mask multiplied by 5;
;   * after the join each lane takes v_readfirstlane of the returned pair and
;     subtracts 5 * (its v_mbcnt value) before the buffer store.
2728define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2729;
2730;
2731; GFX7LESS-LABEL: sub_i64_constant:
2732; GFX7LESS:       ; %bb.0: ; %entry
2733; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2734; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2735; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2736; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
2737; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2738; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2739; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2740; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
2741; GFX7LESS-NEXT:  ; %bb.1:
2742; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2743; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
2744; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2745; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
2746; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2747; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2748; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2749; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX7LESS-NEXT:  .LBB11_2:
2751; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2752; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2753; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2754; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
2755; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2756; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2757; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2758; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2759; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2760; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2761; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2762; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2763; GFX7LESS-NEXT:    s_endpgm
2764;
2765; GFX8-LABEL: sub_i64_constant:
2766; GFX8:       ; %bb.0: ; %entry
2767; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2768; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2769; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2770; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2771; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2772; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2773; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2774; GFX8-NEXT:    s_cbranch_execz .LBB11_2
2775; GFX8-NEXT:  ; %bb.1:
2776; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2777; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2778; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2779; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2780; GFX8-NEXT:    s_mov_b32 m0, -1
2781; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2782; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2783; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2784; GFX8-NEXT:  .LBB11_2:
2785; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2787; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2788; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2789; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2790; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2791; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2792; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2793; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2794; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2795; GFX8-NEXT:    s_mov_b32 s2, -1
2796; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2797; GFX8-NEXT:    s_endpgm
2798;
2799; GFX9-LABEL: sub_i64_constant:
2800; GFX9:       ; %bb.0: ; %entry
2801; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2802; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2803; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2804; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2805; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2806; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2807; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2808; GFX9-NEXT:    s_cbranch_execz .LBB11_2
2809; GFX9-NEXT:  ; %bb.1:
2810; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2811; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2812; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2813; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2814; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2815; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2816; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2817; GFX9-NEXT:  .LBB11_2:
2818; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2820; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2821; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2822; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2823; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2824; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2825; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2826; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2827; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2828; GFX9-NEXT:    s_mov_b32 s2, -1
2829; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2830; GFX9-NEXT:    s_endpgm
2831;
2832; GFX1064-LABEL: sub_i64_constant:
2833; GFX1064:       ; %bb.0: ; %entry
2834; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2835; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2836; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2837; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2838; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2839; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2840; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2841; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
2842; GFX1064-NEXT:  ; %bb.1:
2843; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2844; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2845; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2846; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2847; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2848; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2849; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2850; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2851; GFX1064-NEXT:    buffer_gl0_inv
2852; GFX1064-NEXT:  .LBB11_2:
2853; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2854; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2855; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2856; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2857; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2858; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2859; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2860; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2861; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2862; GFX1064-NEXT:    s_mov_b32 s2, -1
2863; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2864; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2865; GFX1064-NEXT:    s_endpgm
2866;
2867; GFX1032-LABEL: sub_i64_constant:
2868; GFX1032:       ; %bb.0: ; %entry
2869; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2870; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2871; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2872; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2873; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2874; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2875; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2876; GFX1032-NEXT:  ; %bb.1:
2877; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2878; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2879; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2880; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2881; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2882; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2883; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2884; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2885; GFX1032-NEXT:    buffer_gl0_inv
2886; GFX1032-NEXT:  .LBB11_2:
2887; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2888; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2889; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2890; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2891; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2892; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2893; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2894; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2895; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2896; GFX1032-NEXT:    s_mov_b32 s2, -1
2897; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2898; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2899; GFX1032-NEXT:    s_endpgm
2900;
2901; GFX1164-LABEL: sub_i64_constant:
2902; GFX1164:       ; %bb.0: ; %entry
2903; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2904; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2905; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2906; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2907; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2908; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2909; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2910; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2911; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
2912; GFX1164-NEXT:  ; %bb.1:
2913; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2914; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2915; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
2916; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2917; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2918; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2919; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2920; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2921; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2922; GFX1164-NEXT:    buffer_gl0_inv
2923; GFX1164-NEXT:  .LBB11_2:
2924; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
2925; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2926; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2927; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2928; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2929; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2930; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2931; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2932; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2933; GFX1164-NEXT:    s_mov_b32 s2, -1
2934; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2935; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2936; GFX1164-NEXT:    s_endpgm
2937;
2938; GFX1132-LABEL: sub_i64_constant:
2939; GFX1132:       ; %bb.0: ; %entry
2940; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2941; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
2942; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2943; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2944; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2945; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2946; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2947; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
2948; GFX1132-NEXT:  ; %bb.1:
2949; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
2950; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2951; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
2952; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2953; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
2954; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2955; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2956; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2957; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2958; GFX1132-NEXT:    buffer_gl0_inv
2959; GFX1132-NEXT:  .LBB11_2:
2960; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2961; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2962; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2963; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2964; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2965; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2966; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2967; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2968; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2969; GFX1132-NEXT:    s_mov_b32 s2, -1
2970; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2971; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2972; GFX1132-NEXT:    s_endpgm
2973entry:
  ; %old receives the value @local_var64 held before 5 was subtracted.
2974  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing %old keeps the atomic's result live, so the returning
  ; (ds_sub_rtn_u64) form is what the assertions above expect.
2975  store i64 %old, i64 addrspace(1)* %out
2976  ret void
2977}
2978
; sub_i64_uniform - kernel in which the IR performs an acq_rel `atomicrmw sub`
; of the i64 kernel argument %subitive on the addrspace(3) global @local_var64
; and stores the returned (pre-subtraction) value to %out.
;
; What the assertions below show for every check prefix of this function:
;   * the ds_sub_rtn_u64 sits in block %bb.1, which is entered only where the
;     v_mbcnt result compares equal to 0;
;   * its data operand is a 64-bit product of s[2:3] with s_bcnt1 of the saved
;     exec mask (s[2:3] is presumably %subitive, loaded alongside %out by the
;     leading s_load - TODO confirm against the kernel argument layout);
;   * after the join each lane takes v_readfirstlane of the returned pair and
;     subtracts s[2:3] * (its v_mbcnt value) before the buffer store.
2979define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2980;
2981;
2982; GFX7LESS-LABEL: sub_i64_uniform:
2983; GFX7LESS:       ; %bb.0: ; %entry
2984; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2985; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2986; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2987; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2988; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2989; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2990; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2991; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
2992; GFX7LESS-NEXT:  ; %bb.1:
2993; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2994; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2995; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2996; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2997; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2998; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
2999; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
3000; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
3001; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
3002; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3003; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3004; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3005; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3006; GFX7LESS-NEXT:  .LBB12_2:
3007; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
3008; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
3009; GFX7LESS-NEXT:    s_mov_b32 s6, -1
3010; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3011; GFX7LESS-NEXT:    s_mov_b32 s4, s0
3012; GFX7LESS-NEXT:    s_mov_b32 s5, s1
3013; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
3014; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
3015; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
3016; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
3017; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
3018; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
3019; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
3020; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
3021; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
3022; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3023; GFX7LESS-NEXT:    s_endpgm
3024;
3025; GFX8-LABEL: sub_i64_uniform:
3026; GFX8:       ; %bb.0: ; %entry
3027; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3028; GFX8-NEXT:    s_mov_b64 s[6:7], exec
3029; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3030; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3031; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3032; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3033; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3034; GFX8-NEXT:    s_cbranch_execz .LBB12_2
3035; GFX8-NEXT:  ; %bb.1:
3036; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
3037; GFX8-NEXT:    v_mov_b32_e32 v0, s8
3038; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3039; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
3040; GFX8-NEXT:    s_mul_i32 s6, s3, s8
3041; GFX8-NEXT:    v_mov_b32_e32 v3, 0
3042; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
3043; GFX8-NEXT:    s_mov_b32 m0, -1
3044; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3045; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3046; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3047; GFX8-NEXT:  .LBB12_2:
3048; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3049; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3050; GFX8-NEXT:    s_mov_b32 s4, s0
3051; GFX8-NEXT:    s_mov_b32 s5, s1
3052; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
3053; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
3054; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
3055; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
3056; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
3057; GFX8-NEXT:    v_mov_b32_e32 v3, s1
3058; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
3059; GFX8-NEXT:    s_mov_b32 s7, 0xf000
3060; GFX8-NEXT:    s_mov_b32 s6, -1
3061; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
3062; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3063; GFX8-NEXT:    s_endpgm
3064;
3065; GFX9-LABEL: sub_i64_uniform:
3066; GFX9:       ; %bb.0: ; %entry
3067; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3068; GFX9-NEXT:    s_mov_b64 s[6:7], exec
3069; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3070; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3071; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3072; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3073; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3074; GFX9-NEXT:    s_cbranch_execz .LBB12_2
3075; GFX9-NEXT:  ; %bb.1:
3076; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3077; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3078; GFX9-NEXT:    s_mul_i32 s7, s3, s6
3079; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
3080; GFX9-NEXT:    s_add_i32 s8, s8, s7
3081; GFX9-NEXT:    s_mul_i32 s6, s2, s6
3082; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3083; GFX9-NEXT:    v_mov_b32_e32 v1, s8
3084; GFX9-NEXT:    v_mov_b32_e32 v3, 0
3085; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3086; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3087; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3088; GFX9-NEXT:  .LBB12_2:
3089; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3091; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3092; GFX9-NEXT:    s_mov_b32 s4, s0
3093; GFX9-NEXT:    s_mov_b32 s5, s1
3094; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3095; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3096; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3097; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3098; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3099; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
3100; GFX9-NEXT:    s_mov_b32 s7, 0xf000
3101; GFX9-NEXT:    s_mov_b32 s6, -1
3102; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
3103; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3104; GFX9-NEXT:    s_endpgm
3105;
3106; GFX1064-LABEL: sub_i64_uniform:
3107; GFX1064:       ; %bb.0: ; %entry
3108; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3109; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3110; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3111; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3112; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3113; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3114; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3115; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
3116; GFX1064-NEXT:  ; %bb.1:
3117; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3118; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3119; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3120; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
3121; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
3122; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
3123; GFX1064-NEXT:    s_add_i32 s8, s8, s7
3124; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
3125; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
3126; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3127; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3128; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3129; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3130; GFX1064-NEXT:    buffer_gl0_inv
3131; GFX1064-NEXT:  .LBB12_2:
3132; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3133; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3134; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3135; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3136; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
3137; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3138; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3139; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3140; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3141; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
3142; GFX1064-NEXT:    s_mov_b32 s2, -1
3143; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3144; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3145; GFX1064-NEXT:    s_endpgm
3146;
3147; GFX1032-LABEL: sub_i64_uniform:
3148; GFX1032:       ; %bb.0: ; %entry
3149; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3150; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
3151; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3152; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3153; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
3154; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3155; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
3156; GFX1032-NEXT:  ; %bb.1:
3157; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
3158; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3159; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3160; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
3161; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
3162; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
3163; GFX1032-NEXT:    s_add_i32 s7, s7, s6
3164; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
3165; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
3166; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3167; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3168; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3169; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3170; GFX1032-NEXT:    buffer_gl0_inv
3171; GFX1032-NEXT:  .LBB12_2:
3172; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3173; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3174; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3175; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
3176; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
3177; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
3178; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3179; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3180; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3181; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
3182; GFX1032-NEXT:    s_mov_b32 s2, -1
3183; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3184; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3185; GFX1032-NEXT:    s_endpgm
3186;
3187; GFX1164-LABEL: sub_i64_uniform:
3188; GFX1164:       ; %bb.0: ; %entry
3189; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3190; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3191; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3192; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3193; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3194; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3195; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
3196; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
3197; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
3198; GFX1164-NEXT:  ; %bb.1:
3199; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3200; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3201; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3202; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
3203; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
3204; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
3205; GFX1164-NEXT:    s_add_i32 s8, s8, s7
3206; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
3207; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
3208; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3209; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3210; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3211; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3212; GFX1164-NEXT:    buffer_gl0_inv
3213; GFX1164-NEXT:  .LBB12_2:
3214; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3215; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3216; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3217; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
3218; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
3219; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
3220; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3221; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3222; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3223; GFX1164-NEXT:    s_mov_b32 s2, -1
3224; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3225; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3226; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3227; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3228; GFX1164-NEXT:    s_endpgm
3229;
3230; GFX1132-LABEL: sub_i64_uniform:
3231; GFX1132:       ; %bb.0: ; %entry
3232; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3233; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
3234; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3235; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3236; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3237; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3238; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3239; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
3240; GFX1132-NEXT:  ; %bb.1:
3241; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
3242; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3243; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
3245; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
3246; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
3247; GFX1132-NEXT:    s_add_i32 s7, s7, s6
3248; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
3249; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
3250; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3251; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3252; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3253; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3254; GFX1132-NEXT:    buffer_gl0_inv
3255; GFX1132-NEXT:  .LBB12_2:
3256; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3257; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3258; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3259; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
3260; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
3261; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3262; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3263; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3264; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3265; GFX1132-NEXT:    s_mov_b32 s2, -1
3266; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3267; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3268; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3269; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3270; GFX1132-NEXT:    s_endpgm
3271entry:
  ; %subitive is a kernel argument, so the subtracted amount is not a literal
  ; as it is in @sub_i64_constant above.
3272  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  ; Storing %old keeps the atomic's result live, so the returning
  ; (ds_sub_rtn_u64) form is what the assertions above expect.
3273  store i64 %old, i64 addrspace(1)* %out
3274  ret void
3275}
3276
; sub_i64_varying - kernel in which the IR performs an acq_rel `atomicrmw sub`
; on the i64 addrspace(3) global @local_var64, where the subtracted amount is
; the workitem id (x) zero-extended to i64, and stores the returned
; (pre-subtraction) value to %out.
;
; Unlike @sub_i64_constant and @sub_i64_uniform above, the assertions for every
; check prefix here show the ds_sub_rtn_u64 emitted directly in %bb.0: there is
; no v_mbcnt / s_cbranch_execz / v_readfirstlane sequence around it. The gfx10
; and gfx11 run lines are each covered by one shared prefix for this function.
3277define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
3278;
3279;
3280; GFX7LESS-LABEL: sub_i64_varying:
3281; GFX7LESS:       ; %bb.0: ; %entry
3282; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3283; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3284; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3285; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3286; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3287; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3288; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3289; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3290; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3291; GFX7LESS-NEXT:    s_endpgm
3292;
3293; GFX8-LABEL: sub_i64_varying:
3294; GFX8:       ; %bb.0: ; %entry
3295; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3296; GFX8-NEXT:    s_mov_b32 m0, -1
3297; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3298; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3299; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3300; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3301; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3302; GFX8-NEXT:    s_mov_b32 s2, -1
3303; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3304; GFX8-NEXT:    s_endpgm
3305;
3306; GFX9-LABEL: sub_i64_varying:
3307; GFX9:       ; %bb.0: ; %entry
3308; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3309; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3311; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3313; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3314; GFX9-NEXT:    s_mov_b32 s2, -1
3315; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3316; GFX9-NEXT:    s_endpgm
3317;
3318; GFX10-LABEL: sub_i64_varying:
3319; GFX10:       ; %bb.0: ; %entry
3320; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3321; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3322; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
3323; GFX10-NEXT:    s_mov_b32 s2, -1
3324; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3325; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3326; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3327; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3328; GFX10-NEXT:    buffer_gl0_inv
3329; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3330; GFX10-NEXT:    s_endpgm
3331;
3332; GFX11-LABEL: sub_i64_varying:
3333; GFX11:       ; %bb.0: ; %entry
3334; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3335; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3336; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
3337; GFX11-NEXT:    s_mov_b32 s2, -1
3338; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3339; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3340; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3341; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3342; GFX11-NEXT:    buffer_gl0_inv
3343; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3344; GFX11-NEXT:    s_endpgm
3345entry:
  ; The operand is derived from the workitem id, so it differs per lane.
3346  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Widen to i64 so the operand type matches @local_var64.
3347  %zext = zext i32 %lane to i64
3348  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Storing %old keeps the atomic's result live, so the returning
  ; (ds_sub_rtn_u64) form is what the assertions above expect.
3349  store i64 %old, i64 addrspace(1)* %out
3350  ret void
3351}
3352
3353define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3354;
3355;
3356; GFX7LESS-LABEL: and_i32_varying:
3357; GFX7LESS:       ; %bb.0: ; %entry
3358; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3359; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3360; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3361; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3362; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3363; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3364; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3365; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3366; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3367; GFX7LESS-NEXT:    s_endpgm
3368;
3369; GFX8-LABEL: and_i32_varying:
3370; GFX8:       ; %bb.0: ; %entry
3371; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3372; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3373; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3374; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3375; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3376; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3377; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3378; GFX8-NEXT:    s_not_b64 exec, exec
3379; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3380; GFX8-NEXT:    s_not_b64 exec, exec
3381; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3382; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3383; GFX8-NEXT:    s_nop 1
3384; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3385; GFX8-NEXT:    s_nop 1
3386; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3387; GFX8-NEXT:    s_nop 1
3388; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3389; GFX8-NEXT:    s_nop 1
3390; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3391; GFX8-NEXT:    s_nop 1
3392; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3393; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3394; GFX8-NEXT:    s_nop 0
3395; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3396; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3397; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3398; GFX8-NEXT:    ; implicit-def: $vgpr0
3399; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3400; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3401; GFX8-NEXT:  ; %bb.1:
3402; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3403; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3404; GFX8-NEXT:    s_mov_b32 m0, -1
3405; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3406; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3407; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3408; GFX8-NEXT:  .LBB14_2:
3409; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3410; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3411; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3412; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3413; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3414; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3415; GFX8-NEXT:    s_mov_b32 s2, -1
3416; GFX8-NEXT:    s_nop 0
3417; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3418; GFX8-NEXT:    s_endpgm
3419;
3420; GFX9-LABEL: and_i32_varying:
3421; GFX9:       ; %bb.0: ; %entry
3422; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3423; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3424; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3425; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3426; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3427; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3428; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3429; GFX9-NEXT:    s_not_b64 exec, exec
3430; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3431; GFX9-NEXT:    s_not_b64 exec, exec
3432; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3433; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3434; GFX9-NEXT:    s_nop 1
3435; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3436; GFX9-NEXT:    s_nop 1
3437; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3438; GFX9-NEXT:    s_nop 1
3439; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3440; GFX9-NEXT:    s_nop 1
3441; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3442; GFX9-NEXT:    s_nop 1
3443; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3444; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3445; GFX9-NEXT:    s_nop 0
3446; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3447; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3448; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3449; GFX9-NEXT:    ; implicit-def: $vgpr0
3450; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3451; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3452; GFX9-NEXT:  ; %bb.1:
3453; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3454; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3455; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3456; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3457; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3458; GFX9-NEXT:  .LBB14_2:
3459; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3460; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3461; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3462; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3463; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3464; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3465; GFX9-NEXT:    s_mov_b32 s2, -1
3466; GFX9-NEXT:    s_nop 0
3467; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3468; GFX9-NEXT:    s_endpgm
3469;
3470; GFX1064-LABEL: and_i32_varying:
3471; GFX1064:       ; %bb.0: ; %entry
3472; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3473; GFX1064-NEXT:    s_not_b64 exec, exec
3474; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3475; GFX1064-NEXT:    s_not_b64 exec, exec
3476; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3477; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3478; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3479; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3480; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3481; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3482; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3483; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3484; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3485; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3486; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3487; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3488; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3489; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3490; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3491; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3492; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3493; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3494; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3495; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3496; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3497; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3498; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3499; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3500; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3501; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3502; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3503; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3504; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3505; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3506; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3507; GFX1064-NEXT:    s_mov_b32 s2, -1
3508; GFX1064-NEXT:    ; implicit-def: $vgpr0
3509; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3510; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3511; GFX1064-NEXT:  ; %bb.1:
3512; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3513; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3514; GFX1064-NEXT:    s_mov_b32 s3, s7
3515; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3516; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3517; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3518; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX1064-NEXT:    buffer_gl0_inv
3520; GFX1064-NEXT:  .LBB14_2:
3521; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3522; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3523; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3524; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3525; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3526; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3527; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3529; GFX1064-NEXT:    s_endpgm
3530;
3531; GFX1032-LABEL: and_i32_varying:
3532; GFX1032:       ; %bb.0: ; %entry
3533; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3534; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3535; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3536; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3537; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3538; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3539; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3540; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3541; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3542; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3543; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3544; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3545; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3546; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3547; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3548; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3549; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3550; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3551; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3552; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3553; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3554; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3555; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3556; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3557; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3558; GFX1032-NEXT:    s_mov_b32 s2, -1
3559; GFX1032-NEXT:    ; implicit-def: $vgpr0
3560; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3561; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3562; GFX1032-NEXT:  ; %bb.1:
3563; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3564; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3565; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3566; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3567; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3568; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3569; GFX1032-NEXT:    buffer_gl0_inv
3570; GFX1032-NEXT:  .LBB14_2:
3571; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3572; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3573; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3574; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3575; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3576; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3577; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3578; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3579; GFX1032-NEXT:    s_endpgm
3580;
3581; GFX1164-LABEL: and_i32_varying:
3582; GFX1164:       ; %bb.0: ; %entry
3583; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3584; GFX1164-NEXT:    s_not_b64 exec, exec
3585; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3586; GFX1164-NEXT:    s_not_b64 exec, exec
3587; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3588; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3589; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3590; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3591; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3592; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3593; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3594; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3595; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3596; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3597; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3598; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3599; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3600; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3601; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3602; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3603; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3604; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3605; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3606; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3607; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3608; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3609; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3610; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3611; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3612; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3613; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3614; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3615; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3616; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3617; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3618; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3619; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3620; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
3621; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3622; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3623; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3624; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3625; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3626; GFX1164-NEXT:    s_mov_b32 s2, -1
3627; GFX1164-NEXT:    ; implicit-def: $vgpr0
3628; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3629; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3630; GFX1164-NEXT:  ; %bb.1:
3631; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3632; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3633; GFX1164-NEXT:    s_mov_b32 s3, s7
3634; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3635; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3636; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3637; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3638; GFX1164-NEXT:    buffer_gl0_inv
3639; GFX1164-NEXT:  .LBB14_2:
3640; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3641; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3642; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3643; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3644; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3645; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3646; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3647; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3648; GFX1164-NEXT:    s_endpgm
3649;
3650; GFX1132-LABEL: and_i32_varying:
3651; GFX1132:       ; %bb.0: ; %entry
3652; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3653; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3654; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3655; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3656; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3657; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3658; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3659; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3660; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3661; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3662; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3663; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3664; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3665; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3666; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3667; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3668; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3669; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3670; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3671; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3672; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3673; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3674; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3675; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3676; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3677; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3678; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3679; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3680; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3681; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3682; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3683; GFX1132-NEXT:    s_mov_b32 s2, -1
3684; GFX1132-NEXT:    ; implicit-def: $vgpr0
3685; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3686; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3687; GFX1132-NEXT:  ; %bb.1:
3688; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3689; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3690; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3691; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3692; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3693; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3694; GFX1132-NEXT:    buffer_gl0_inv
3695; GFX1132-NEXT:  .LBB14_2:
3696; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3697; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3698; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3699; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3700; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3701; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3702; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3703; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3704; GFX1132-NEXT:    s_endpgm
3705entry:
3706  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3707  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3708  store i32 %old, i32 addrspace(1)* %out
3709  ret void
3710}
3711
; or_i32_varying -- atomic OR on the addrspace(3) global @local_var32 where each
; lane supplies its own operand (its workitem id in x). The value returned by
; the atomic is stored through %out.
;
; What the check lines below show for this kernel:
;  * The GFX7LESS run keeps one plain ds_or_rtn_b32 that uses the per-lane
;    operand directly; no cross-lane combining code is emitted.
;  * Every other run first writes 0 (the identity for OR) into the lanes that
;    were inactive, combines the lanes with a sequence of v_or_b32_dpp steps,
;    lets only the lane whose mbcnt result is 0 issue a single ds_or_rtn_b32
;    with the combined value, then broadcasts the returned value with
;    v_readfirstlane_b32 and ORs it with the lane-shifted scan value before
;    the final buffer store.
;  * The GFX8 and GFX9 runs finish the scan with the row_bcast and wave_shr DPP
;    controls; the GFX10 and GFX11 runs use v_permlanex16_b32 together with
;    v_readlane_b32 / v_writelane_b32 instead.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3712define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3713;
3714;
3715; GFX7LESS-LABEL: or_i32_varying:
3716; GFX7LESS:       ; %bb.0: ; %entry
3717; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3718; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3719; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3720; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3721; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3722; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3723; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3724; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3725; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3726; GFX7LESS-NEXT:    s_endpgm
3727;
3728; GFX8-LABEL: or_i32_varying:
3729; GFX8:       ; %bb.0: ; %entry
3730; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3731; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3732; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3733; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3734; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3735; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3736; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3737; GFX8-NEXT:    s_not_b64 exec, exec
3738; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3739; GFX8-NEXT:    s_not_b64 exec, exec
3740; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3741; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3742; GFX8-NEXT:    s_nop 1
3743; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3744; GFX8-NEXT:    s_nop 1
3745; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3746; GFX8-NEXT:    s_nop 1
3747; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3748; GFX8-NEXT:    s_nop 1
3749; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3750; GFX8-NEXT:    s_nop 1
3751; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3752; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3753; GFX8-NEXT:    s_nop 0
3754; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3755; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3756; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3757; GFX8-NEXT:    ; implicit-def: $vgpr0
3758; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3759; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3760; GFX8-NEXT:  ; %bb.1:
3761; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3762; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3763; GFX8-NEXT:    s_mov_b32 m0, -1
3764; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3765; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3766; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3767; GFX8-NEXT:  .LBB15_2:
3768; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3769; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3771; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3772; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3773; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3774; GFX8-NEXT:    s_mov_b32 s2, -1
3775; GFX8-NEXT:    s_nop 0
3776; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3777; GFX8-NEXT:    s_endpgm
3778;
3779; GFX9-LABEL: or_i32_varying:
3780; GFX9:       ; %bb.0: ; %entry
3781; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3782; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3783; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3784; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3785; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3786; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3787; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3788; GFX9-NEXT:    s_not_b64 exec, exec
3789; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3790; GFX9-NEXT:    s_not_b64 exec, exec
3791; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3792; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3793; GFX9-NEXT:    s_nop 1
3794; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3795; GFX9-NEXT:    s_nop 1
3796; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3797; GFX9-NEXT:    s_nop 1
3798; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3799; GFX9-NEXT:    s_nop 1
3800; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3801; GFX9-NEXT:    s_nop 1
3802; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3803; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3804; GFX9-NEXT:    s_nop 0
3805; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3806; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3807; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3808; GFX9-NEXT:    ; implicit-def: $vgpr0
3809; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3810; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3811; GFX9-NEXT:  ; %bb.1:
3812; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3813; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3814; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3815; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3816; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3817; GFX9-NEXT:  .LBB15_2:
3818; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3820; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3821; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3822; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3823; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3824; GFX9-NEXT:    s_mov_b32 s2, -1
3825; GFX9-NEXT:    s_nop 0
3826; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3827; GFX9-NEXT:    s_endpgm
3828;
3829; GFX1064-LABEL: or_i32_varying:
3830; GFX1064:       ; %bb.0: ; %entry
3831; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3832; GFX1064-NEXT:    s_not_b64 exec, exec
3833; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3834; GFX1064-NEXT:    s_not_b64 exec, exec
3835; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3836; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3837; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3838; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3839; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3840; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3841; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3842; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3843; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3844; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3845; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3846; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3847; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3848; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3849; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3850; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3851; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3852; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3853; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3854; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3855; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3856; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3857; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3858; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3859; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3860; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3861; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3862; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3863; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3864; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3865; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3866; GFX1064-NEXT:    s_mov_b32 s2, -1
3867; GFX1064-NEXT:    ; implicit-def: $vgpr0
3868; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3869; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3870; GFX1064-NEXT:  ; %bb.1:
3871; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3872; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3873; GFX1064-NEXT:    s_mov_b32 s3, s7
3874; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3875; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3876; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3877; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3878; GFX1064-NEXT:    buffer_gl0_inv
3879; GFX1064-NEXT:  .LBB15_2:
3880; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3881; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3882; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3883; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3884; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3885; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3886; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3887; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3888; GFX1064-NEXT:    s_endpgm
3889;
3890; GFX1032-LABEL: or_i32_varying:
3891; GFX1032:       ; %bb.0: ; %entry
3892; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3893; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3894; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3895; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3896; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3897; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3898; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3899; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3900; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3901; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3902; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3903; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3904; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3905; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3906; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3907; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3908; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3909; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3910; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3911; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3912; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3913; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3914; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3915; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3916; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3917; GFX1032-NEXT:    s_mov_b32 s2, -1
3918; GFX1032-NEXT:    ; implicit-def: $vgpr0
3919; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3920; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3921; GFX1032-NEXT:  ; %bb.1:
3922; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3923; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3924; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3925; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3926; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3927; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3928; GFX1032-NEXT:    buffer_gl0_inv
3929; GFX1032-NEXT:  .LBB15_2:
3930; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3931; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3932; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3933; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3934; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3935; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3936; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3937; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3938; GFX1032-NEXT:    s_endpgm
3939;
3940; GFX1164-LABEL: or_i32_varying:
3941; GFX1164:       ; %bb.0: ; %entry
3942; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3943; GFX1164-NEXT:    s_not_b64 exec, exec
3944; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3945; GFX1164-NEXT:    s_not_b64 exec, exec
3946; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3947; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3948; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3949; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3950; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3951; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3952; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3953; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3954; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3955; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3956; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3957; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3958; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3959; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3960; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3961; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3962; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3963; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3964; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3965; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3966; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3967; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3968; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3969; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3970; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3971; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3972; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3973; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3974; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3975; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3976; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3977; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3978; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3979; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
3980; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3981; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3982; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3983; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3984; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3985; GFX1164-NEXT:    s_mov_b32 s2, -1
3986; GFX1164-NEXT:    ; implicit-def: $vgpr0
3987; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3988; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
3989; GFX1164-NEXT:  ; %bb.1:
3990; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3991; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3992; GFX1164-NEXT:    s_mov_b32 s3, s7
3993; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3994; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3995; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
3996; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3997; GFX1164-NEXT:    buffer_gl0_inv
3998; GFX1164-NEXT:  .LBB15_2:
3999; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4000; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4001; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4002; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4003; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
4004; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4005; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4006; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4007; GFX1164-NEXT:    s_endpgm
4008;
4009; GFX1132-LABEL: or_i32_varying:
4010; GFX1132:       ; %bb.0: ; %entry
4011; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4012; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4013; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4014; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4015; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4016; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4017; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4018; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4019; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4020; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4021; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4022; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4023; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4024; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4025; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4026; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4027; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4028; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4029; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4030; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4031; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4032; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4033; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4034; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4035; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4036; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4037; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4038; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4039; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4040; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4041; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4042; GFX1132-NEXT:    s_mov_b32 s2, -1
4043; GFX1132-NEXT:    ; implicit-def: $vgpr0
4044; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4045; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
4046; GFX1132-NEXT:  ; %bb.1:
4047; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4048; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4049; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4050; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4051; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
4052; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4053; GFX1132-NEXT:    buffer_gl0_inv
4054; GFX1132-NEXT:  .LBB15_2:
4055; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4056; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4057; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4058; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4059; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
4060; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4061; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4062; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4063; GFX1132-NEXT:    s_endpgm
4064entry:
  ; Per-lane operand for the atomic: this lane's workitem id in the x dimension.
4065  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel read-modify-write OR on the addrspace(3) global; %old is the value
  ; returned by the atomicrmw (the memory contents before this lane's OR).
4066  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) kernel argument.
4067  store i32 %old, i32 addrspace(1)* %out
4068  ret void
4069}
4070
; Divergent-operand xor test: every work item xors its own workitem id into the
; LDS global @local_var32 with an acq_rel atomicrmw, then stores the returned
; old value to %out.
;
; What the expected output below shows:
;  * Run line without -mcpu (the oldest target checked): no wave-level rewrite
;    is visible; the per-lane value in v0 is passed directly to a single
;    ds_xor_rtn_b32.
;  * tonga / gfx900 / gfx1010 / gfx1100 run lines: lanes that are not active
;    get 0 written into the scan register (s_not exec + v_mov 0), a chain of
;    v_xor_b32_dpp steps combines the lane values, and only the lane whose
;    mbcnt result is 0 issues ds_xor_rtn_b32, using the value read back from
;    the last lane of the scan. Afterwards each lane xors the readfirstlane'd
;    atomic result with its own shifted scan value to form the stored result.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
4071define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
4072;
4073;
4074; GFX7LESS-LABEL: xor_i32_varying:
4075; GFX7LESS:       ; %bb.0: ; %entry
4076; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4077; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4078; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4079; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4080; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
4081; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4082; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4083; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4084; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4085; GFX7LESS-NEXT:    s_endpgm
4086;
4087; GFX8-LABEL: xor_i32_varying:
4088; GFX8:       ; %bb.0: ; %entry
4089; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4090; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4091; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4092; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4093; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4094; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4095; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4096; GFX8-NEXT:    s_not_b64 exec, exec
4097; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4098; GFX8-NEXT:    s_not_b64 exec, exec
4099; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4100; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4101; GFX8-NEXT:    s_nop 1
4102; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4103; GFX8-NEXT:    s_nop 1
4104; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4105; GFX8-NEXT:    s_nop 1
4106; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4107; GFX8-NEXT:    s_nop 1
4108; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4109; GFX8-NEXT:    s_nop 1
4110; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4111; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4112; GFX8-NEXT:    s_nop 0
4113; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4114; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4115; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4116; GFX8-NEXT:    ; implicit-def: $vgpr0
4117; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4118; GFX8-NEXT:    s_cbranch_execz .LBB16_2
4119; GFX8-NEXT:  ; %bb.1:
4120; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4121; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4122; GFX8-NEXT:    s_mov_b32 m0, -1
4123; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4124; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4125; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4126; GFX8-NEXT:  .LBB16_2:
4127; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4128; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4129; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4130; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4131; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
4132; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4133; GFX8-NEXT:    s_mov_b32 s2, -1
4134; GFX8-NEXT:    s_nop 0
4135; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4136; GFX8-NEXT:    s_endpgm
4137;
4138; GFX9-LABEL: xor_i32_varying:
4139; GFX9:       ; %bb.0: ; %entry
4140; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4141; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4142; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4143; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4144; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4145; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4146; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4147; GFX9-NEXT:    s_not_b64 exec, exec
4148; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4149; GFX9-NEXT:    s_not_b64 exec, exec
4150; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4151; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4152; GFX9-NEXT:    s_nop 1
4153; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4154; GFX9-NEXT:    s_nop 1
4155; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4156; GFX9-NEXT:    s_nop 1
4157; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4158; GFX9-NEXT:    s_nop 1
4159; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4160; GFX9-NEXT:    s_nop 1
4161; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4162; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4163; GFX9-NEXT:    s_nop 0
4164; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4165; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4166; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4167; GFX9-NEXT:    ; implicit-def: $vgpr0
4168; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4169; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4170; GFX9-NEXT:  ; %bb.1:
4171; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4172; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4174; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4175; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4176; GFX9-NEXT:  .LBB16_2:
4177; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4178; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4179; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4180; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4181; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4182; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4183; GFX9-NEXT:    s_mov_b32 s2, -1
4184; GFX9-NEXT:    s_nop 0
4185; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4186; GFX9-NEXT:    s_endpgm
4187;
4188; GFX1064-LABEL: xor_i32_varying:
4189; GFX1064:       ; %bb.0: ; %entry
4190; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4191; GFX1064-NEXT:    s_not_b64 exec, exec
4192; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4193; GFX1064-NEXT:    s_not_b64 exec, exec
4194; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4195; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4196; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4197; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4198; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4199; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4200; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4201; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4202; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4203; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4204; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4205; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4206; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4207; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4208; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4209; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4210; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4211; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4212; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4213; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4214; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4215; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4216; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4217; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4218; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4219; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4220; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4221; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4222; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4223; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4224; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4225; GFX1064-NEXT:    s_mov_b32 s2, -1
4226; GFX1064-NEXT:    ; implicit-def: $vgpr0
4227; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4228; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4229; GFX1064-NEXT:  ; %bb.1:
4230; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4231; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4232; GFX1064-NEXT:    s_mov_b32 s3, s7
4233; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4234; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4235; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4236; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4237; GFX1064-NEXT:    buffer_gl0_inv
4238; GFX1064-NEXT:  .LBB16_2:
4239; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4240; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4241; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4242; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4243; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4244; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4245; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4246; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4247; GFX1064-NEXT:    s_endpgm
4248;
4249; GFX1032-LABEL: xor_i32_varying:
4250; GFX1032:       ; %bb.0: ; %entry
4251; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4252; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4253; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4254; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4255; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4256; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4257; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4258; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4259; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4260; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4261; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4262; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4263; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4264; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4265; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4266; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4267; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4268; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4269; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4270; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4271; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4272; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4273; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4274; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4275; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4276; GFX1032-NEXT:    s_mov_b32 s2, -1
4277; GFX1032-NEXT:    ; implicit-def: $vgpr0
4278; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4279; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4280; GFX1032-NEXT:  ; %bb.1:
4281; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4282; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4283; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4284; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4285; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4286; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4287; GFX1032-NEXT:    buffer_gl0_inv
4288; GFX1032-NEXT:  .LBB16_2:
4289; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4290; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4291; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4292; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4293; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4294; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4295; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4296; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4297; GFX1032-NEXT:    s_endpgm
4298;
4299; GFX1164-LABEL: xor_i32_varying:
4300; GFX1164:       ; %bb.0: ; %entry
4301; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4302; GFX1164-NEXT:    s_not_b64 exec, exec
4303; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4304; GFX1164-NEXT:    s_not_b64 exec, exec
4305; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4306; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4307; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4308; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4309; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4310; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4311; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4312; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4313; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4314; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4315; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4316; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4317; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4318; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4319; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4320; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4321; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4322; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4323; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4324; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4325; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4326; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4327; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4328; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4329; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4330; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4331; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4332; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4333; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4334; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4335; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4336; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4337; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4338; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4339; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4340; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4341; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4342; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4343; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4344; GFX1164-NEXT:    s_mov_b32 s2, -1
4345; GFX1164-NEXT:    ; implicit-def: $vgpr0
4346; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4347; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4348; GFX1164-NEXT:  ; %bb.1:
4349; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4350; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4351; GFX1164-NEXT:    s_mov_b32 s3, s7
4352; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4353; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4354; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4355; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4356; GFX1164-NEXT:    buffer_gl0_inv
4357; GFX1164-NEXT:  .LBB16_2:
4358; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4359; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4360; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4361; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4362; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4363; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4364; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4365; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4366; GFX1164-NEXT:    s_endpgm
4367;
4368; GFX1132-LABEL: xor_i32_varying:
4369; GFX1132:       ; %bb.0: ; %entry
4370; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4371; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4372; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4373; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4374; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4375; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4376; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4377; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4378; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4379; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4380; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4381; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4382; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4383; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4384; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4385; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4386; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4387; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4388; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4389; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4390; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4391; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4392; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4393; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4394; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4395; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4396; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4397; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4398; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4399; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4400; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4401; GFX1132-NEXT:    s_mov_b32 s2, -1
4402; GFX1132-NEXT:    ; implicit-def: $vgpr0
4403; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4404; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4405; GFX1132-NEXT:  ; %bb.1:
4406; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4407; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4408; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4409; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4410; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4411; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4412; GFX1132-NEXT:    buffer_gl0_inv
4413; GFX1132-NEXT:  .LBB16_2:
4414; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4415; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4416; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4417; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4418; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4419; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4420; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4421; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4422; GFX1132-NEXT:    s_endpgm
4423entry:
4424  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; The atomic operand is the workitem id, i.e. a value that is not uniform
; across the wave; that is what makes this the "varying" variant of the test.
4425  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4426  store i32 %old, i32 addrspace(1)* %out
4427  ret void
4428}
4429
4430define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
4431;
4432;
4433; GFX7LESS-LABEL: max_i32_varying:
4434; GFX7LESS:       ; %bb.0: ; %entry
4435; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4436; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4437; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4438; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4439; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
4440; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4441; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4442; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4443; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4444; GFX7LESS-NEXT:    s_endpgm
4445;
4446; GFX8-LABEL: max_i32_varying:
4447; GFX8:       ; %bb.0: ; %entry
4448; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4449; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4450; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4451; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4452; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4453; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4454; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4455; GFX8-NEXT:    s_not_b64 exec, exec
4456; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
4457; GFX8-NEXT:    s_not_b64 exec, exec
4458; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4459; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4460; GFX8-NEXT:    s_nop 1
4461; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4462; GFX8-NEXT:    s_nop 1
4463; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4464; GFX8-NEXT:    s_nop 1
4465; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4466; GFX8-NEXT:    s_nop 1
4467; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4468; GFX8-NEXT:    s_nop 1
4469; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4470; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4471; GFX8-NEXT:    s_nop 0
4472; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4473; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4474; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4475; GFX8-NEXT:    ; implicit-def: $vgpr0
4476; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4477; GFX8-NEXT:    s_cbranch_execz .LBB17_2
4478; GFX8-NEXT:  ; %bb.1:
4479; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4480; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4481; GFX8-NEXT:    s_mov_b32 m0, -1
4482; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4483; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
4484; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4485; GFX8-NEXT:  .LBB17_2:
4486; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4487; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4488; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4489; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4490; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
4491; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4492; GFX8-NEXT:    s_mov_b32 s2, -1
4493; GFX8-NEXT:    s_nop 0
4494; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4495; GFX8-NEXT:    s_endpgm
4496;
4497; GFX9-LABEL: max_i32_varying:
4498; GFX9:       ; %bb.0: ; %entry
4499; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4500; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4501; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4502; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4503; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4504; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4505; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4506; GFX9-NEXT:    s_not_b64 exec, exec
4507; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
4508; GFX9-NEXT:    s_not_b64 exec, exec
4509; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4510; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4511; GFX9-NEXT:    s_nop 1
4512; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4513; GFX9-NEXT:    s_nop 1
4514; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4515; GFX9-NEXT:    s_nop 1
4516; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4517; GFX9-NEXT:    s_nop 1
4518; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4519; GFX9-NEXT:    s_nop 1
4520; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4521; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4522; GFX9-NEXT:    s_nop 0
4523; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4524; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4525; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4526; GFX9-NEXT:    ; implicit-def: $vgpr0
4527; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4528; GFX9-NEXT:    s_cbranch_execz .LBB17_2
4529; GFX9-NEXT:  ; %bb.1:
4530; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4531; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4532; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4533; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
4534; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4535; GFX9-NEXT:  .LBB17_2:
4536; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4537; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4538; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4539; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4540; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
4541; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4542; GFX9-NEXT:    s_mov_b32 s2, -1
4543; GFX9-NEXT:    s_nop 0
4544; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4545; GFX9-NEXT:    s_endpgm
4546;
4547; GFX1064-LABEL: max_i32_varying:
4548; GFX1064:       ; %bb.0: ; %entry
4549; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4550; GFX1064-NEXT:    s_not_b64 exec, exec
4551; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
4552; GFX1064-NEXT:    s_not_b64 exec, exec
4553; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4554; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4555; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
4556; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4557; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4558; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4559; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4560; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4561; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4562; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4563; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4564; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4565; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4566; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4567; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4568; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4569; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4570; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4571; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4572; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4573; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4574; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4575; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4576; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4577; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4578; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4579; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4580; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4581; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4582; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4583; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4584; GFX1064-NEXT:    s_mov_b32 s2, -1
4585; GFX1064-NEXT:    ; implicit-def: $vgpr0
4586; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4587; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
4588; GFX1064-NEXT:  ; %bb.1:
4589; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4590; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4591; GFX1064-NEXT:    s_mov_b32 s3, s7
4592; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4593; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4594; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
4595; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4596; GFX1064-NEXT:    buffer_gl0_inv
4597; GFX1064-NEXT:  .LBB17_2:
4598; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4599; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4600; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4601; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4602; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
4603; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4604; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4605; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4606; GFX1064-NEXT:    s_endpgm
4607;
4608; GFX1032-LABEL: max_i32_varying:
4609; GFX1032:       ; %bb.0: ; %entry
4610; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4611; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4612; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
4613; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4614; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4615; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4616; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4617; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4618; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4619; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4620; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4621; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4622; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4623; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4624; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4625; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
4626; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4627; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4628; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4629; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4630; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4631; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4632; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4633; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4634; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4635; GFX1032-NEXT:    s_mov_b32 s2, -1
4636; GFX1032-NEXT:    ; implicit-def: $vgpr0
4637; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4638; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
4639; GFX1032-NEXT:  ; %bb.1:
4640; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4641; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4642; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4643; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4644; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
4645; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4646; GFX1032-NEXT:    buffer_gl0_inv
4647; GFX1032-NEXT:  .LBB17_2:
4648; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4649; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4650; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4651; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4652; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
4653; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4654; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4655; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4656; GFX1032-NEXT:    s_endpgm
4657;
4658; GFX1164-LABEL: max_i32_varying:
4659; GFX1164:       ; %bb.0: ; %entry
4660; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4661; GFX1164-NEXT:    s_not_b64 exec, exec
4662; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
4663; GFX1164-NEXT:    s_not_b64 exec, exec
4664; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4665; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4666; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4667; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
4668; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4669; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4670; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4671; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4672; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4673; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4674; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4675; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4676; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4677; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4678; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4679; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4680; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4681; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4682; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4683; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4684; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4685; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4686; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4687; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4688; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4689; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4690; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4691; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4692; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4693; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4694; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4695; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4696; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4697; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4698; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4699; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4700; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4701; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4702; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4703; GFX1164-NEXT:    s_mov_b32 s2, -1
4704; GFX1164-NEXT:    ; implicit-def: $vgpr0
4705; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4706; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
4707; GFX1164-NEXT:  ; %bb.1:
4708; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4709; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4710; GFX1164-NEXT:    s_mov_b32 s3, s7
4711; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4712; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4713; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
4714; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4715; GFX1164-NEXT:    buffer_gl0_inv
4716; GFX1164-NEXT:  .LBB17_2:
4717; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4718; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4719; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4720; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4721; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
4722; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4723; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4724; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4725; GFX1164-NEXT:    s_endpgm
4726;
4727; GFX1132-LABEL: max_i32_varying:
4728; GFX1132:       ; %bb.0: ; %entry
4729; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4730; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4731; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
4732; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4733; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4734; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4735; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4736; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4737; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4738; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4739; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4740; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4741; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4742; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4743; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4744; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4745; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4746; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4747; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4748; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
4749; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4750; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4751; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4752; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4753; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4754; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4755; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4756; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4757; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4758; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4759; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4760; GFX1132-NEXT:    s_mov_b32 s2, -1
4761; GFX1132-NEXT:    ; implicit-def: $vgpr0
4762; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4763; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
4764; GFX1132-NEXT:  ; %bb.1:
4765; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4766; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4767; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4768; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4769; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
4770; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4771; GFX1132-NEXT:    buffer_gl0_inv
4772; GFX1132-NEXT:  .LBB17_2:
4773; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4774; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4775; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4776; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4777; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
4778; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4779; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4780; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4781; GFX1132-NEXT:    s_endpgm
4782entry:
4783  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4784  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4785  store i32 %old, i32 addrspace(1)* %out
4786  ret void
4787}
4788
; max_i64_constant: signed 64-bit atomic max of the constant 5 on the LDS
; (addrspace(3)) global @local_var64 with acq_rel ordering; the value returned
; by the atomic is stored through %out.
;
; What the assertions below show for every run line:
;   * v_mbcnt_* followed by a compare against 0 selects one active lane, and
;     only that lane executes ds_max_rtn_i64 with the operand 5.
;   * v_readfirstlane_b32 copies the 64-bit result of the atomic into SGPRs.
;   * The v_cndmask pair builds a per-lane 64-bit value: high dword 0x80000000
;     with low dword 0 (the smallest signed 64-bit value) on the lane that
;     issued the atomic, and 5 on every other lane.
;   * v_cmp_gt_i64 plus two more v_cndmask instructions take the signed max of
;     the broadcast result and that per-lane value; this is what gets stored.
; The 64-bit store is spelled buffer_store_dwordx2 in the pre-gfx11 output and
; buffer_store_b64 in the gfx11 output.
4789define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4790;
4791;
4792; GFX7LESS-LABEL: max_i64_constant:
4793; GFX7LESS:       ; %bb.0: ; %entry
4794; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4795; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4796; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4797; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4798; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4799; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4800; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4801; GFX7LESS-NEXT:  ; %bb.1:
4802; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4803; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4804; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4805; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4806; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4807; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4808; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4809; GFX7LESS-NEXT:  .LBB18_2:
4810; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4811; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4812; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4813; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4814; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4815; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4816; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4817; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4818; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4819; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4820; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4821; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4822; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4823; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4824; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4825; GFX7LESS-NEXT:    s_endpgm
4826;
4827; GFX8-LABEL: max_i64_constant:
4828; GFX8:       ; %bb.0: ; %entry
4829; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4830; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4831; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4832; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4833; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4834; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4835; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4836; GFX8-NEXT:  ; %bb.1:
4837; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4838; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4839; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4840; GFX8-NEXT:    s_mov_b32 m0, -1
4841; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4842; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4843; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4844; GFX8-NEXT:  .LBB18_2:
4845; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4846; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4847; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4848; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4849; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4850; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4851; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4852; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4853; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4854; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4855; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4856; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4857; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4858; GFX8-NEXT:    s_mov_b32 s2, -1
4859; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4860; GFX8-NEXT:    s_endpgm
4861;
4862; GFX9-LABEL: max_i64_constant:
4863; GFX9:       ; %bb.0: ; %entry
4864; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4865; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4866; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4867; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4868; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4869; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4870; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4871; GFX9-NEXT:  ; %bb.1:
4872; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4873; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4874; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4875; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4876; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4877; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4878; GFX9-NEXT:  .LBB18_2:
4879; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4880; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4881; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4882; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4883; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4884; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4885; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4886; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4887; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4888; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4889; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4890; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4891; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4892; GFX9-NEXT:    s_mov_b32 s2, -1
4893; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4894; GFX9-NEXT:    s_endpgm
4895;
4896; GFX1064-LABEL: max_i64_constant:
4897; GFX1064:       ; %bb.0: ; %entry
4898; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4899; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4900; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4901; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4902; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4903; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4904; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4905; GFX1064-NEXT:  ; %bb.1:
4906; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4907; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4908; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4909; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4910; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4911; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4912; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4913; GFX1064-NEXT:    buffer_gl0_inv
4914; GFX1064-NEXT:  .LBB18_2:
4915; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4916; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4917; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4918; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4919; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4920; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4921; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4922; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4923; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4924; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4925; GFX1064-NEXT:    s_mov_b32 s2, -1
4926; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4927; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4928; GFX1064-NEXT:    s_endpgm
4929;
4930; GFX1032-LABEL: max_i64_constant:
4931; GFX1032:       ; %bb.0: ; %entry
4932; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4933; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4934; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4935; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4936; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4937; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4938; GFX1032-NEXT:  ; %bb.1:
4939; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4940; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4941; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4942; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4943; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4944; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4945; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4946; GFX1032-NEXT:    buffer_gl0_inv
4947; GFX1032-NEXT:  .LBB18_2:
4948; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4949; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4950; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4951; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4952; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4953; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4954; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4955; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4956; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4957; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4958; GFX1032-NEXT:    s_mov_b32 s2, -1
4959; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4960; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4961; GFX1032-NEXT:    s_endpgm
4962;
4963; GFX1164-LABEL: max_i64_constant:
4964; GFX1164:       ; %bb.0: ; %entry
4965; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4966; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4967; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4968; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4969; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4970; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
4971; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4972; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
4973; GFX1164-NEXT:  ; %bb.1:
4974; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
4975; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4976; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
4977; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4978; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4979; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4980; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4981; GFX1164-NEXT:    buffer_gl0_inv
4982; GFX1164-NEXT:  .LBB18_2:
4983; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
4984; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
4985; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
4986; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4987; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4988; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4989; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4990; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4991; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4992; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4993; GFX1164-NEXT:    s_mov_b32 s2, -1
4994; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4995; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4996; GFX1164-NEXT:    s_endpgm
4997;
4998; GFX1132-LABEL: max_i64_constant:
4999; GFX1132:       ; %bb.0: ; %entry
5000; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5001; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5002; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5003; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5004; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5005; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5006; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
5007; GFX1132-NEXT:  ; %bb.1:
5008; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5009; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5010; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
5011; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5012; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5013; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
5014; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5015; GFX1132-NEXT:    buffer_gl0_inv
5016; GFX1132-NEXT:  .LBB18_2:
5017; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5018; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5019; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5020; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
5021; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5022; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5023; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
5024; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5025; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5026; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5027; GFX1132-NEXT:    s_mov_b32 s2, -1
5028; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5029; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5030; GFX1132-NEXT:    s_endpgm
; IR under test: the atomic operand is the literal 5 (not derived from the
; work-item id), and the atomic's returned value is what the kernel writes out.
5031entry:
5032  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
5033  store i64 %old, i64 addrspace(1)* %out
5034  ret void
5035}
5036
5037define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
5038;
5039;
5040; GFX7LESS-LABEL: min_i32_varying:
5041; GFX7LESS:       ; %bb.0: ; %entry
5042; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5043; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5044; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5045; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5046; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
5047; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5048; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5049; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5050; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5051; GFX7LESS-NEXT:    s_endpgm
5052;
5053; GFX8-LABEL: min_i32_varying:
5054; GFX8:       ; %bb.0: ; %entry
5055; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5056; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5057; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5058; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5059; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
5060; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5061; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5062; GFX8-NEXT:    s_not_b64 exec, exec
5063; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
5064; GFX8-NEXT:    s_not_b64 exec, exec
5065; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5066; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5067; GFX8-NEXT:    s_nop 1
5068; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5069; GFX8-NEXT:    s_nop 1
5070; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5071; GFX8-NEXT:    s_nop 1
5072; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5073; GFX8-NEXT:    s_nop 1
5074; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5075; GFX8-NEXT:    s_nop 1
5076; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5077; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5078; GFX8-NEXT:    s_nop 0
5079; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5080; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5081; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5082; GFX8-NEXT:    ; implicit-def: $vgpr0
5083; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5084; GFX8-NEXT:    s_cbranch_execz .LBB19_2
5085; GFX8-NEXT:  ; %bb.1:
5086; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5087; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5088; GFX8-NEXT:    s_mov_b32 m0, -1
5089; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5090; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
5091; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5092; GFX8-NEXT:  .LBB19_2:
5093; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5094; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5095; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5096; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5097; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
5098; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5099; GFX8-NEXT:    s_mov_b32 s2, -1
5100; GFX8-NEXT:    s_nop 0
5101; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5102; GFX8-NEXT:    s_endpgm
5103;
5104; GFX9-LABEL: min_i32_varying:
5105; GFX9:       ; %bb.0: ; %entry
5106; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5107; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5108; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5109; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5110; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
5111; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5112; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5113; GFX9-NEXT:    s_not_b64 exec, exec
5114; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
5115; GFX9-NEXT:    s_not_b64 exec, exec
5116; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5117; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5118; GFX9-NEXT:    s_nop 1
5119; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5120; GFX9-NEXT:    s_nop 1
5121; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5122; GFX9-NEXT:    s_nop 1
5123; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5124; GFX9-NEXT:    s_nop 1
5125; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5126; GFX9-NEXT:    s_nop 1
5127; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5128; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5129; GFX9-NEXT:    s_nop 0
5130; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5131; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5132; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5133; GFX9-NEXT:    ; implicit-def: $vgpr0
5134; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5135; GFX9-NEXT:    s_cbranch_execz .LBB19_2
5136; GFX9-NEXT:  ; %bb.1:
5137; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5138; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5140; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
5141; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5142; GFX9-NEXT:  .LBB19_2:
5143; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5145; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5146; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5147; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
5148; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5149; GFX9-NEXT:    s_mov_b32 s2, -1
5150; GFX9-NEXT:    s_nop 0
5151; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5152; GFX9-NEXT:    s_endpgm
5153;
5154; GFX1064-LABEL: min_i32_varying:
5155; GFX1064:       ; %bb.0: ; %entry
5156; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5157; GFX1064-NEXT:    s_not_b64 exec, exec
5158; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
5159; GFX1064-NEXT:    s_not_b64 exec, exec
5160; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5161; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5162; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
5163; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5164; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5165; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5166; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5167; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5168; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5169; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5170; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5171; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5172; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5173; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5174; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5175; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5176; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5177; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5178; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5179; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5180; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5181; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5182; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5183; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5184; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5185; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5186; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5187; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5188; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5189; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5190; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5191; GFX1064-NEXT:    s_mov_b32 s2, -1
5192; GFX1064-NEXT:    ; implicit-def: $vgpr0
5193; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5194; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
5195; GFX1064-NEXT:  ; %bb.1:
5196; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5197; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5198; GFX1064-NEXT:    s_mov_b32 s3, s7
5199; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5200; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5201; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
5202; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5203; GFX1064-NEXT:    buffer_gl0_inv
5204; GFX1064-NEXT:  .LBB19_2:
5205; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5206; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5207; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5208; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5209; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
5210; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5211; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5212; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5213; GFX1064-NEXT:    s_endpgm
5214;
5215; GFX1032-LABEL: min_i32_varying:
5216; GFX1032:       ; %bb.0: ; %entry
5217; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5218; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5219; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
5220; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5221; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5222; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5223; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5224; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5225; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5226; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5227; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5228; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5229; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5230; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5231; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5232; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
5233; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5234; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5235; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5236; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5237; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5238; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5239; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5240; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5241; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5242; GFX1032-NEXT:    s_mov_b32 s2, -1
5243; GFX1032-NEXT:    ; implicit-def: $vgpr0
5244; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5245; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
5246; GFX1032-NEXT:  ; %bb.1:
5247; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5248; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5249; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5250; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5251; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
5252; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5253; GFX1032-NEXT:    buffer_gl0_inv
5254; GFX1032-NEXT:  .LBB19_2:
5255; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5256; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5257; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5258; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5259; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
5260; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5261; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5262; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5263; GFX1032-NEXT:    s_endpgm
5264;
5265; GFX1164-LABEL: min_i32_varying:
5266; GFX1164:       ; %bb.0: ; %entry
5267; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5268; GFX1164-NEXT:    s_not_b64 exec, exec
5269; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
5270; GFX1164-NEXT:    s_not_b64 exec, exec
5271; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5272; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5273; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5274; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
5275; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5276; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5277; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5278; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5279; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5280; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5281; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5282; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5283; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5284; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5285; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5286; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5287; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5288; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5289; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5290; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5291; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5292; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5293; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5294; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5295; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5296; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5297; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5298; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5299; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5300; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5301; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5302; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5303; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5304; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5305; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5306; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5307; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5308; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5309; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5310; GFX1164-NEXT:    s_mov_b32 s2, -1
5311; GFX1164-NEXT:    ; implicit-def: $vgpr0
5312; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5313; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
5314; GFX1164-NEXT:  ; %bb.1:
5315; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5316; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5317; GFX1164-NEXT:    s_mov_b32 s3, s7
5318; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5319; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5320; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
5321; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5322; GFX1164-NEXT:    buffer_gl0_inv
5323; GFX1164-NEXT:  .LBB19_2:
5324; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5325; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5326; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5327; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5328; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
5329; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5330; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5331; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5332; GFX1164-NEXT:    s_endpgm
5333;
5334; GFX1132-LABEL: min_i32_varying:
5335; GFX1132:       ; %bb.0: ; %entry
5336; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5337; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5338; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
5339; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5340; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5341; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5342; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5343; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5344; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5345; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5346; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5347; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5348; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5349; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5350; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5351; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5352; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5353; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5354; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5355; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
5356; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5357; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5358; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5359; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5360; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5361; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5362; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5363; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5364; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5365; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5366; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5367; GFX1132-NEXT:    s_mov_b32 s2, -1
5368; GFX1132-NEXT:    ; implicit-def: $vgpr0
5369; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5370; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
5371; GFX1132-NEXT:  ; %bb.1:
5372; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5373; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5374; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5375; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5376; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
5377; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5378; GFX1132-NEXT:    buffer_gl0_inv
5379; GFX1132-NEXT:  .LBB19_2:
5380; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5381; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5382; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5383; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5384; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
5385; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5386; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5387; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5388; GFX1132-NEXT:    s_endpgm
5389entry:
5390  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5391  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5392  store i32 %old, i32 addrspace(1)* %out
5393  ret void
5394}
5395
; min_i64_constant - signed 64-bit `atomicrmw min` of the constant 5 into
; @local_var64 with acq_rel ordering; the value returned by the atomic is
; stored to %out.
;
; In every checked configuration the expected code selects the lane whose
; v_mbcnt result is 0, lets only that lane issue ds_min_rtn_i64, and then ORs
; the saved mask back into exec. The returned value is read with
; v_readfirstlane and each lane takes the signed minimum of it and a per-lane
; operand: the selected lane uses 0x7fffffff_ffffffff (so it keeps the
; returned value unchanged), every other lane uses 5.
5396define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
5397;
5398;
5399; GFX7LESS-LABEL: min_i64_constant:
5400; GFX7LESS:       ; %bb.0: ; %entry
5401; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5402; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5403; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5404; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5405; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5406; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5407; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
5408; GFX7LESS-NEXT:  ; %bb.1:
5409; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5410; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5411; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5412; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5413; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5414; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5415; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5416; GFX7LESS-NEXT:  .LBB20_2:
5417; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5418; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5419; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5420; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5421; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
5422; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5423; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5424; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5425; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
5426; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
5427; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5428; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5429; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5430; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5431; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5432; GFX7LESS-NEXT:    s_endpgm
5433;
5434; GFX8-LABEL: min_i64_constant:
5435; GFX8:       ; %bb.0: ; %entry
5436; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5437; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5438; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5439; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5440; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5441; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5442; GFX8-NEXT:    s_cbranch_execz .LBB20_2
5443; GFX8-NEXT:  ; %bb.1:
5444; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5445; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5446; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5447; GFX8-NEXT:    s_mov_b32 m0, -1
5448; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5449; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5450; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5451; GFX8-NEXT:  .LBB20_2:
5452; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5453; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5454; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5455; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
5456; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5457; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5458; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5459; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5460; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5461; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5462; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5463; GFX8-NEXT:    s_mov_b32 s2, -1
5464; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5465; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5466; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5467; GFX8-NEXT:    s_endpgm
5468;
5469; GFX9-LABEL: min_i64_constant:
5470; GFX9:       ; %bb.0: ; %entry
5471; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5472; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5473; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5474; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5475; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5476; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5477; GFX9-NEXT:    s_cbranch_execz .LBB20_2
5478; GFX9-NEXT:  ; %bb.1:
5479; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5480; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5481; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5483; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5484; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5485; GFX9-NEXT:  .LBB20_2:
5486; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5487; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5488; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5489; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
5490; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5491; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5492; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5493; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5494; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5495; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5496; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5497; GFX9-NEXT:    s_mov_b32 s2, -1
5498; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5499; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5500; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5501; GFX9-NEXT:    s_endpgm
5502;
5503; GFX1064-LABEL: min_i64_constant:
5504; GFX1064:       ; %bb.0: ; %entry
5505; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5506; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5507; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5508; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5509; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5510; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5511; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
5512; GFX1064-NEXT:  ; %bb.1:
5513; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5514; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5515; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5516; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5517; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5518; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5519; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5520; GFX1064-NEXT:    buffer_gl0_inv
5521; GFX1064-NEXT:  .LBB20_2:
5522; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5523; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5524; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5525; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5526; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5527; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5528; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5529; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5530; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5531; GFX1064-NEXT:    s_mov_b32 s2, -1
5532; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5533; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5534; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5535; GFX1064-NEXT:    s_endpgm
5536;
5537; GFX1032-LABEL: min_i64_constant:
5538; GFX1032:       ; %bb.0: ; %entry
5539; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5540; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5541; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5542; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5543; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5544; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
5545; GFX1032-NEXT:  ; %bb.1:
5546; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5547; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5548; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5549; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5550; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5551; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5552; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5553; GFX1032-NEXT:    buffer_gl0_inv
5554; GFX1032-NEXT:  .LBB20_2:
5555; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5556; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5557; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5558; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5559; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5560; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5561; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5562; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5563; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5564; GFX1032-NEXT:    s_mov_b32 s2, -1
5565; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5566; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5567; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5568; GFX1032-NEXT:    s_endpgm
5569;
5570; GFX1164-LABEL: min_i64_constant:
5571; GFX1164:       ; %bb.0: ; %entry
5572; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5573; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5574; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5575; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5576; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5577; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5578; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5579; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
5580; GFX1164-NEXT:  ; %bb.1:
5581; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5582; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5583; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5584; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5585; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5586; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5587; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5588; GFX1164-NEXT:    buffer_gl0_inv
5589; GFX1164-NEXT:  .LBB20_2:
5590; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5591; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5592; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5593; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5594; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5595; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5596; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5597; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5598; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5599; GFX1164-NEXT:    s_mov_b32 s2, -1
5600; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5601; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5602; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5603; GFX1164-NEXT:    s_endpgm
5604;
5605; GFX1132-LABEL: min_i64_constant:
5606; GFX1132:       ; %bb.0: ; %entry
5607; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5608; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5609; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5610; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5611; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5612; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5613; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
5614; GFX1132-NEXT:  ; %bb.1:
5615; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5616; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5617; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
5618; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5619; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5620; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5621; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5622; GFX1132-NEXT:    buffer_gl0_inv
5623; GFX1132-NEXT:  .LBB20_2:
5624; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5625; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5626; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5627; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5628; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5629; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5630; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5631; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5632; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5633; GFX1132-NEXT:    s_mov_b32 s2, -1
5634; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5635; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5636; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5637; GFX1132-NEXT:    s_endpgm
5638entry:
  ; The operand is the same constant in every lane; the atomic's result is
  ; used, so the returning form of the instruction is expected above.
5639  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
5640  store i64 %old, i64 addrspace(1)* %out
5641  ret void
5642}
5643
; umax_i32_varying - unsigned 32-bit `atomicrmw umax` into @local_var32 with
; acq_rel ordering, where each lane contributes its own workitem id; the
; value returned by the atomic is stored to %out.
;
; Expected code, as spelled out by the check lines below:
;  * The first configuration (llc with no -mcpu) issues ds_max_rtn_u32
;    directly, with no lane selection and no cross-lane reduction.
;  * All other configurations write 0 into the inactive lanes, run a
;    v_max_u32 DPP / permlane sequence across the wave, read the wave-wide
;    maximum from the last lane (63 for wave64, 31 for wave32) and let only
;    the lane whose v_mbcnt result is 0 issue a single ds_max_rtn_u32. Each
;    lane then combines the v_readfirstlane'd result with its own shifted
;    partial maximum via v_max_u32.
5644define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5645;
5646;
5647; GFX7LESS-LABEL: umax_i32_varying:
5648; GFX7LESS:       ; %bb.0: ; %entry
5649; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5650; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5651; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5652; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5653; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5654; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5655; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5656; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5657; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5658; GFX7LESS-NEXT:    s_endpgm
5659;
5660; GFX8-LABEL: umax_i32_varying:
5661; GFX8:       ; %bb.0: ; %entry
5662; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5663; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5664; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5665; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5666; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5667; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5668; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5669; GFX8-NEXT:    s_not_b64 exec, exec
5670; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5671; GFX8-NEXT:    s_not_b64 exec, exec
5672; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5673; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5674; GFX8-NEXT:    s_nop 1
5675; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5676; GFX8-NEXT:    s_nop 1
5677; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5678; GFX8-NEXT:    s_nop 1
5679; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5680; GFX8-NEXT:    s_nop 1
5681; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5682; GFX8-NEXT:    s_nop 1
5683; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5684; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5685; GFX8-NEXT:    s_nop 0
5686; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5687; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5688; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5689; GFX8-NEXT:    ; implicit-def: $vgpr0
5690; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5691; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5692; GFX8-NEXT:  ; %bb.1:
5693; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5694; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5695; GFX8-NEXT:    s_mov_b32 m0, -1
5696; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5697; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5698; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5699; GFX8-NEXT:  .LBB21_2:
5700; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5701; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5702; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5703; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5704; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5705; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5706; GFX8-NEXT:    s_mov_b32 s2, -1
5707; GFX8-NEXT:    s_nop 0
5708; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5709; GFX8-NEXT:    s_endpgm
5710;
5711; GFX9-LABEL: umax_i32_varying:
5712; GFX9:       ; %bb.0: ; %entry
5713; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5714; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5715; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5716; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5717; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5718; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5719; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5720; GFX9-NEXT:    s_not_b64 exec, exec
5721; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5722; GFX9-NEXT:    s_not_b64 exec, exec
5723; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5724; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5725; GFX9-NEXT:    s_nop 1
5726; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5727; GFX9-NEXT:    s_nop 1
5728; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5729; GFX9-NEXT:    s_nop 1
5730; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5731; GFX9-NEXT:    s_nop 1
5732; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5733; GFX9-NEXT:    s_nop 1
5734; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5735; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5736; GFX9-NEXT:    s_nop 0
5737; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5738; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5739; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5740; GFX9-NEXT:    ; implicit-def: $vgpr0
5741; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5742; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5743; GFX9-NEXT:  ; %bb.1:
5744; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5745; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5746; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5747; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5748; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5749; GFX9-NEXT:  .LBB21_2:
5750; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5751; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5752; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5753; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5754; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5755; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5756; GFX9-NEXT:    s_mov_b32 s2, -1
5757; GFX9-NEXT:    s_nop 0
5758; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5759; GFX9-NEXT:    s_endpgm
5760;
5761; GFX1064-LABEL: umax_i32_varying:
5762; GFX1064:       ; %bb.0: ; %entry
5763; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5764; GFX1064-NEXT:    s_not_b64 exec, exec
5765; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5766; GFX1064-NEXT:    s_not_b64 exec, exec
5767; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5768; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5769; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5770; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5771; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5772; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5773; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5774; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5775; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5776; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5777; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5778; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5779; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5780; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5781; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5782; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5783; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5784; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5785; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5786; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5787; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5788; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5789; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5790; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5791; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5792; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5793; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5794; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5795; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5796; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5797; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5798; GFX1064-NEXT:    s_mov_b32 s2, -1
5799; GFX1064-NEXT:    ; implicit-def: $vgpr0
5800; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5801; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5802; GFX1064-NEXT:  ; %bb.1:
5803; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5804; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5805; GFX1064-NEXT:    s_mov_b32 s3, s7
5806; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5807; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5808; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5809; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5810; GFX1064-NEXT:    buffer_gl0_inv
5811; GFX1064-NEXT:  .LBB21_2:
5812; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5813; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5814; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5815; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5816; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5817; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5818; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5819; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5820; GFX1064-NEXT:    s_endpgm
5821;
5822; GFX1032-LABEL: umax_i32_varying:
5823; GFX1032:       ; %bb.0: ; %entry
5824; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5825; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5826; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5827; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5828; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5829; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5830; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5831; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5832; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5833; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5834; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5835; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5836; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5837; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5838; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5839; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5840; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5841; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5842; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5843; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5844; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5845; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5846; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5847; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5848; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5849; GFX1032-NEXT:    s_mov_b32 s2, -1
5850; GFX1032-NEXT:    ; implicit-def: $vgpr0
5851; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5852; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5853; GFX1032-NEXT:  ; %bb.1:
5854; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5855; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5856; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5857; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5858; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5859; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5860; GFX1032-NEXT:    buffer_gl0_inv
5861; GFX1032-NEXT:  .LBB21_2:
5862; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5863; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5864; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5865; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5866; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5867; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5868; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5869; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5870; GFX1032-NEXT:    s_endpgm
5871;
5872; GFX1164-LABEL: umax_i32_varying:
5873; GFX1164:       ; %bb.0: ; %entry
5874; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5875; GFX1164-NEXT:    s_not_b64 exec, exec
5876; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5877; GFX1164-NEXT:    s_not_b64 exec, exec
5878; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5879; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5880; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5881; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5882; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5883; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5884; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5885; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5886; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5887; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5888; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5889; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5890; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5891; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5892; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5893; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5894; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5895; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5896; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5897; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5898; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5899; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5900; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5901; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5902; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5903; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5904; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5905; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5906; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5907; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5908; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5909; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5910; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5911; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5912; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5913; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5914; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5915; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5916; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5917; GFX1164-NEXT:    s_mov_b32 s2, -1
5918; GFX1164-NEXT:    ; implicit-def: $vgpr0
5919; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5920; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5921; GFX1164-NEXT:  ; %bb.1:
5922; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5923; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5924; GFX1164-NEXT:    s_mov_b32 s3, s7
5925; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5926; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5927; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5928; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5929; GFX1164-NEXT:    buffer_gl0_inv
5930; GFX1164-NEXT:  .LBB21_2:
5931; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5932; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5933; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5934; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5935; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5936; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5937; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5938; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5939; GFX1164-NEXT:    s_endpgm
5940;
5941; GFX1132-LABEL: umax_i32_varying:
5942; GFX1132:       ; %bb.0: ; %entry
5943; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5944; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5945; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5946; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5947; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5948; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5949; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5950; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5951; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5952; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5953; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5954; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5955; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5956; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5957; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5958; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5959; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5960; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5961; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5962; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5963; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5964; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5965; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5966; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5967; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5968; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5969; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5970; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5971; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5972; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5973; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5974; GFX1132-NEXT:    s_mov_b32 s2, -1
5975; GFX1132-NEXT:    ; implicit-def: $vgpr0
5976; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5977; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
5978; GFX1132-NEXT:  ; %bb.1:
5979; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5980; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5981; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5982; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5983; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
5984; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5985; GFX1132-NEXT:    buffer_gl0_inv
5986; GFX1132-NEXT:  .LBB21_2:
5987; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5988; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5989; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5990; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5991; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
5992; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5993; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5994; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5995; GFX1132-NEXT:    s_endpgm
5996entry:
  ; %lane differs per work item, which is what makes the atomic operand
  ; "varying" (as opposed to the constant operand used by the *_constant tests).
5997  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5998  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5999  store i32 %old, i32 addrspace(1)* %out
6000  ret void
6001}
6002
; umax_i64_constant: atomicrmw umax of the uniform constant 5 on the
; addrspace(3) global @local_var64; the returned old value is stored to %out.
; Every check block below has the same shape: v_mbcnt_* numbers the active
; lanes, only the lane numbered 0 enters %bb.1 and issues ds_max_rtn_u64,
; v_readfirstlane_b32 copies that result into SGPRs, and v_cndmask_b32 picks
; 0 for that lane and 5 for every other lane before the final 64-bit unsigned
; max (v_cmp_gt_u64 followed by v_cndmask_b32) that forms the stored value.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
6003define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
6004;
6005;
6006; GFX7LESS-LABEL: umax_i64_constant:
6007; GFX7LESS:       ; %bb.0: ; %entry
6008; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6009; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6010; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6011; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6012; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6013; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6014; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
6015; GFX7LESS-NEXT:  ; %bb.1:
6016; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6017; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6018; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6019; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6020; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6021; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6022; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6023; GFX7LESS-NEXT:  .LBB22_2:
6024; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6025; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6026; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6027; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6028; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6029; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6030; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6031; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6032; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
6033; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6034; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
6035; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6036; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6037; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6038; GFX7LESS-NEXT:    s_endpgm
6039;
6040; GFX8-LABEL: umax_i64_constant:
6041; GFX8:       ; %bb.0: ; %entry
6042; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6043; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6044; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6045; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6046; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6047; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6048; GFX8-NEXT:    s_cbranch_execz .LBB22_2
6049; GFX8-NEXT:  ; %bb.1:
6050; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6051; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6052; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6053; GFX8-NEXT:    s_mov_b32 m0, -1
6054; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6055; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6056; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6057; GFX8-NEXT:  .LBB22_2:
6058; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6059; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6060; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6061; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
6062; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6063; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6064; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6065; GFX8-NEXT:    v_mov_b32_e32 v2, s2
6066; GFX8-NEXT:    v_mov_b32_e32 v1, s3
6067; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6068; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6069; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6070; GFX8-NEXT:    s_mov_b32 s2, -1
6071; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6072; GFX8-NEXT:    s_endpgm
6073;
6074; GFX9-LABEL: umax_i64_constant:
6075; GFX9:       ; %bb.0: ; %entry
6076; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6077; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6078; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6079; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6080; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6081; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6082; GFX9-NEXT:    s_cbranch_execz .LBB22_2
6083; GFX9-NEXT:  ; %bb.1:
6084; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6085; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6086; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6087; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6088; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6089; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6090; GFX9-NEXT:  .LBB22_2:
6091; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6092; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6093; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6094; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
6095; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6096; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6097; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6098; GFX9-NEXT:    v_mov_b32_e32 v2, s2
6099; GFX9-NEXT:    v_mov_b32_e32 v1, s3
6100; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6101; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
6102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6103; GFX9-NEXT:    s_mov_b32 s2, -1
6104; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6105; GFX9-NEXT:    s_endpgm
6106;
6107; GFX1064-LABEL: umax_i64_constant:
6108; GFX1064:       ; %bb.0: ; %entry
6109; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6110; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6111; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6112; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6113; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6114; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6115; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
6116; GFX1064-NEXT:  ; %bb.1:
6117; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6118; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6119; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6120; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6121; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6122; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6123; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6124; GFX1064-NEXT:    buffer_gl0_inv
6125; GFX1064-NEXT:  .LBB22_2:
6126; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6127; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6128; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6129; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6130; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6131; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6132; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6133; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6134; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6135; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6136; GFX1064-NEXT:    s_mov_b32 s2, -1
6137; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6138; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6139; GFX1064-NEXT:    s_endpgm
6140;
6141; GFX1032-LABEL: umax_i64_constant:
6142; GFX1032:       ; %bb.0: ; %entry
6143; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6146; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6147; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6148; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
6149; GFX1032-NEXT:  ; %bb.1:
6150; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6151; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6152; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6155; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6157; GFX1032-NEXT:    buffer_gl0_inv
6158; GFX1032-NEXT:  .LBB22_2:
6159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6162; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6163; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6164; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6165; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6166; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6167; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6168; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6169; GFX1032-NEXT:    s_mov_b32 s2, -1
6170; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6171; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6172; GFX1032-NEXT:    s_endpgm
6173;
6174; GFX1164-LABEL: umax_i64_constant:
6175; GFX1164:       ; %bb.0: ; %entry
6176; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6177; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6178; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6179; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6180; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6181; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6182; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6183; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
6184; GFX1164-NEXT:  ; %bb.1:
6185; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6186; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6187; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6188; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6189; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6190; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6191; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6192; GFX1164-NEXT:    buffer_gl0_inv
6193; GFX1164-NEXT:  .LBB22_2:
6194; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6195; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6196; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6197; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6198; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
6199; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6200; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
6201; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6202; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
6203; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6204; GFX1164-NEXT:    s_mov_b32 s2, -1
6205; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6206; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6207; GFX1164-NEXT:    s_endpgm
6208;
6209; GFX1132-LABEL: umax_i64_constant:
6210; GFX1132:       ; %bb.0: ; %entry
6211; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6212; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6213; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6214; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6215; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6216; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6217; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6218; GFX1132-NEXT:  ; %bb.1:
6219; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6220; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6221; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6222; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6223; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6224; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6225; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6226; GFX1132-NEXT:    buffer_gl0_inv
6227; GFX1132-NEXT:  .LBB22_2:
6228; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6229; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6230; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6231; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6232; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6233; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6234; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6235; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6236; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6237; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6238; GFX1132-NEXT:    s_mov_b32 s2, -1
6239; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6240; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6241; GFX1132-NEXT:    s_endpgm
6242entry:
  ; The operand is the same constant (5) in every lane; %old is the value
  ; returned by the atomic and is what gets written to %out.
6243  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
6244  store i64 %old, i64 addrspace(1)* %out
6245  ret void
6246}
6247
; umin_i32_varying: atomicrmw umin on the addrspace(3) global @local_var32
; whose operand is the x workitem id, so the operand differs between lanes;
; the returned old value is stored to %out.
; In the first check block (the llc invocation without -mcpu) the atomic is
; issued directly as ds_min_rtn_u32, with no lane-0 guard around it.
; In the remaining check blocks the operand is first combined across the wave
; with a chain of v_min_u32_dpp steps (lanes that were inactive are filled
; with -1, i.e. all-ones, presumably the umin identity), the combined value is
; read with v_readlane_b32 and fed to a single ds_min_rtn_u32 in %bb.1, and
; the stored result is v_min_u32 of the v_readfirstlane_b32 copy of the
; returned value and the lane's DPP-shifted partial result.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
6248define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6249;
6250;
6251; GFX7LESS-LABEL: umin_i32_varying:
6252; GFX7LESS:       ; %bb.0: ; %entry
6253; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6254; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6255; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6257; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6258; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6259; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6260; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6261; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6262; GFX7LESS-NEXT:    s_endpgm
6263;
6264; GFX8-LABEL: umin_i32_varying:
6265; GFX8:       ; %bb.0: ; %entry
6266; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6267; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6268; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6269; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6270; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6271; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6272; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6273; GFX8-NEXT:    s_not_b64 exec, exec
6274; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6275; GFX8-NEXT:    s_not_b64 exec, exec
6276; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6277; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6278; GFX8-NEXT:    s_nop 1
6279; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6280; GFX8-NEXT:    s_nop 1
6281; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6282; GFX8-NEXT:    s_nop 1
6283; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6284; GFX8-NEXT:    s_nop 1
6285; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6286; GFX8-NEXT:    s_nop 1
6287; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6288; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6289; GFX8-NEXT:    s_nop 0
6290; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6291; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6292; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6293; GFX8-NEXT:    ; implicit-def: $vgpr0
6294; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6295; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6296; GFX8-NEXT:  ; %bb.1:
6297; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6298; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6299; GFX8-NEXT:    s_mov_b32 m0, -1
6300; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6301; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6302; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6303; GFX8-NEXT:  .LBB23_2:
6304; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6306; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6307; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6308; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6309; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6310; GFX8-NEXT:    s_mov_b32 s2, -1
6311; GFX8-NEXT:    s_nop 0
6312; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6313; GFX8-NEXT:    s_endpgm
6314;
6315; GFX9-LABEL: umin_i32_varying:
6316; GFX9:       ; %bb.0: ; %entry
6317; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6318; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6319; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6320; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6321; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6322; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6323; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6324; GFX9-NEXT:    s_not_b64 exec, exec
6325; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6326; GFX9-NEXT:    s_not_b64 exec, exec
6327; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6328; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6329; GFX9-NEXT:    s_nop 1
6330; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6331; GFX9-NEXT:    s_nop 1
6332; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6333; GFX9-NEXT:    s_nop 1
6334; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6335; GFX9-NEXT:    s_nop 1
6336; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6337; GFX9-NEXT:    s_nop 1
6338; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6339; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6340; GFX9-NEXT:    s_nop 0
6341; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6342; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6343; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6344; GFX9-NEXT:    ; implicit-def: $vgpr0
6345; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6346; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6347; GFX9-NEXT:  ; %bb.1:
6348; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6349; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6350; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6351; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6352; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6353; GFX9-NEXT:  .LBB23_2:
6354; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6356; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6357; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6358; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6359; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6360; GFX9-NEXT:    s_mov_b32 s2, -1
6361; GFX9-NEXT:    s_nop 0
6362; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6363; GFX9-NEXT:    s_endpgm
6364;
6365; GFX1064-LABEL: umin_i32_varying:
6366; GFX1064:       ; %bb.0: ; %entry
6367; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6368; GFX1064-NEXT:    s_not_b64 exec, exec
6369; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6370; GFX1064-NEXT:    s_not_b64 exec, exec
6371; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6372; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6373; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6374; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6375; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6376; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6377; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6378; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6379; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6380; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6381; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6382; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6383; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6384; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6385; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6386; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6387; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6388; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6389; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6390; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6391; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6392; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6393; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6394; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6395; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6396; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6397; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6398; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6399; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6400; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6401; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6402; GFX1064-NEXT:    s_mov_b32 s2, -1
6403; GFX1064-NEXT:    ; implicit-def: $vgpr0
6404; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6405; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6406; GFX1064-NEXT:  ; %bb.1:
6407; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6408; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6409; GFX1064-NEXT:    s_mov_b32 s3, s7
6410; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6411; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6412; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6413; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6414; GFX1064-NEXT:    buffer_gl0_inv
6415; GFX1064-NEXT:  .LBB23_2:
6416; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6417; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6418; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6419; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6420; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6421; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6422; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6423; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6424; GFX1064-NEXT:    s_endpgm
6425;
6426; GFX1032-LABEL: umin_i32_varying:
6427; GFX1032:       ; %bb.0: ; %entry
6428; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6429; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6430; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6431; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6432; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6433; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6434; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6435; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6436; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6437; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6438; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6439; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6440; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6441; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6442; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6443; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6444; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6445; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6446; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6447; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6448; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6449; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6450; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6451; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6452; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6453; GFX1032-NEXT:    s_mov_b32 s2, -1
6454; GFX1032-NEXT:    ; implicit-def: $vgpr0
6455; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6456; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6457; GFX1032-NEXT:  ; %bb.1:
6458; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6459; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6460; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6461; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6462; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6463; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6464; GFX1032-NEXT:    buffer_gl0_inv
6465; GFX1032-NEXT:  .LBB23_2:
6466; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6467; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6468; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6469; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6470; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6471; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6472; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6473; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6474; GFX1032-NEXT:    s_endpgm
6475;
6476; GFX1164-LABEL: umin_i32_varying:
6477; GFX1164:       ; %bb.0: ; %entry
6478; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6479; GFX1164-NEXT:    s_not_b64 exec, exec
6480; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6481; GFX1164-NEXT:    s_not_b64 exec, exec
6482; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6483; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6484; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6485; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6486; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6487; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6488; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6489; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6490; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6491; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6492; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6493; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6494; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6495; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6496; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6497; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6498; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6499; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6500; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6501; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6502; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6503; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6504; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6505; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6506; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6507; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6508; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6509; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6510; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6511; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6512; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6513; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6514; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6515; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
6516; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6517; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6518; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6519; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6520; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6521; GFX1164-NEXT:    s_mov_b32 s2, -1
6522; GFX1164-NEXT:    ; implicit-def: $vgpr0
6523; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6524; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6525; GFX1164-NEXT:  ; %bb.1:
6526; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6527; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6528; GFX1164-NEXT:    s_mov_b32 s3, s7
6529; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6530; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6531; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6532; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6533; GFX1164-NEXT:    buffer_gl0_inv
6534; GFX1164-NEXT:  .LBB23_2:
6535; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6536; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6537; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6538; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6539; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6540; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6541; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6542; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6543; GFX1164-NEXT:    s_endpgm
6544;
6545; GFX1132-LABEL: umin_i32_varying:
6546; GFX1132:       ; %bb.0: ; %entry
6547; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6548; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6549; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6550; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6551; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6552; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6553; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6554; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6555; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6556; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6557; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6558; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6559; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6560; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6561; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6562; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6563; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6564; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6565; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6566; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6567; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6568; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6569; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6570; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6571; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6572; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6573; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6574; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6575; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6576; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6577; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6578; GFX1132-NEXT:    s_mov_b32 s2, -1
6579; GFX1132-NEXT:    ; implicit-def: $vgpr0
6580; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6581; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6582; GFX1132-NEXT:  ; %bb.1:
6583; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6584; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6585; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6586; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6587; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6588; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6589; GFX1132-NEXT:    buffer_gl0_inv
6590; GFX1132-NEXT:  .LBB23_2:
6591; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6592; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6593; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6594; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6595; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6596; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6597; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6598; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6599; GFX1132-NEXT:    s_endpgm
6600entry:
  ; %lane (the x workitem id) is the per-lane operand of the atomic; %old is
  ; the value returned by the atomic and is what gets written to %out.
6601  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6602  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6603  store i32 %old, i32 addrspace(1)* %out
6604  ret void
6605}
6606
6607define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6608;
6609;
6610; GFX7LESS-LABEL: umin_i64_constant:
6611; GFX7LESS:       ; %bb.0: ; %entry
6612; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6613; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6614; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6615; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6616; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6617; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6618; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6619; GFX7LESS-NEXT:  ; %bb.1:
6620; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6621; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6622; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6623; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6624; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6625; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6626; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6627; GFX7LESS-NEXT:  .LBB24_2:
6628; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6629; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6630; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6631; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6632; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6633; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6634; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6635; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6636; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6637; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6638; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6639; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6640; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6641; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6642; GFX7LESS-NEXT:    s_endpgm
6643;
6644; GFX8-LABEL: umin_i64_constant:
6645; GFX8:       ; %bb.0: ; %entry
6646; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6647; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6648; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6649; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6650; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6651; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6652; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6653; GFX8-NEXT:  ; %bb.1:
6654; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6655; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6656; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6657; GFX8-NEXT:    s_mov_b32 m0, -1
6658; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6659; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6660; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6661; GFX8-NEXT:  .LBB24_2:
6662; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6663; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6664; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6665; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6666; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6667; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6668; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6669; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6670; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6671; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6672; GFX8-NEXT:    s_mov_b32 s2, -1
6673; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6674; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6675; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6676; GFX8-NEXT:    s_endpgm
6677;
6678; GFX9-LABEL: umin_i64_constant:
6679; GFX9:       ; %bb.0: ; %entry
6680; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6681; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6682; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6683; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6684; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6685; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6686; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6687; GFX9-NEXT:  ; %bb.1:
6688; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6689; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6690; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6691; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6692; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6694; GFX9-NEXT:  .LBB24_2:
6695; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6697; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6698; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6699; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6700; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6701; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6702; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6703; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6704; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6705; GFX9-NEXT:    s_mov_b32 s2, -1
6706; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6707; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6708; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6709; GFX9-NEXT:    s_endpgm
6710;
6711; GFX1064-LABEL: umin_i64_constant:
6712; GFX1064:       ; %bb.0: ; %entry
6713; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6714; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6715; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6716; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6717; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6718; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6719; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6720; GFX1064-NEXT:  ; %bb.1:
6721; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6722; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6723; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6724; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6725; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6726; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6727; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6728; GFX1064-NEXT:    buffer_gl0_inv
6729; GFX1064-NEXT:  .LBB24_2:
6730; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6731; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6732; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6733; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6734; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6735; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6736; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6737; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6738; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6739; GFX1064-NEXT:    s_mov_b32 s2, -1
6740; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6741; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6742; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6743; GFX1064-NEXT:    s_endpgm
6744;
6745; GFX1032-LABEL: umin_i64_constant:
6746; GFX1032:       ; %bb.0: ; %entry
6747; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6748; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6749; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6750; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6751; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6752; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6753; GFX1032-NEXT:  ; %bb.1:
6754; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6755; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6756; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6757; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6758; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6759; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6760; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6761; GFX1032-NEXT:    buffer_gl0_inv
6762; GFX1032-NEXT:  .LBB24_2:
6763; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6764; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6765; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6766; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6767; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6768; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6769; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6770; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6771; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6772; GFX1032-NEXT:    s_mov_b32 s2, -1
6773; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6774; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6775; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6776; GFX1032-NEXT:    s_endpgm
6777;
6778; GFX1164-LABEL: umin_i64_constant:
6779; GFX1164:       ; %bb.0: ; %entry
6780; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6781; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6782; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6783; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6784; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6785; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6786; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6787; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6788; GFX1164-NEXT:  ; %bb.1:
6789; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6790; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6791; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6792; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6793; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6794; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6795; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6796; GFX1164-NEXT:    buffer_gl0_inv
6797; GFX1164-NEXT:  .LBB24_2:
6798; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6799; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6800; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6801; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6802; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6803; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6804; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6805; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6806; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6807; GFX1164-NEXT:    s_mov_b32 s2, -1
6808; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6809; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6810; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6811; GFX1164-NEXT:    s_endpgm
6812;
6813; GFX1132-LABEL: umin_i64_constant:
6814; GFX1132:       ; %bb.0: ; %entry
6815; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6816; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6817; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6818; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6819; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6820; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6821; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6822; GFX1132-NEXT:  ; %bb.1:
6823; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6824; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6825; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6826; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6827; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6828; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6829; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6830; GFX1132-NEXT:    buffer_gl0_inv
6831; GFX1132-NEXT:  .LBB24_2:
6832; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6833; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6834; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6835; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6836; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6837; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6838; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6839; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6840; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6841; GFX1132-NEXT:    s_mov_b32 s2, -1
6842; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6843; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6844; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6845; GFX1132-NEXT:    s_endpgm
6846entry:
6847  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
6848  store i64 %old, i64 addrspace(1)* %out
6849  ret void
6850}
6851