1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
; Intrinsic declaration. No call to it is visible in the first tests below
; (presumably consumed by the *_varying tests -- TODO confirm).
10declare i32 @llvm.amdgcn.workitem.id.x()
11
; Memory operated on by the atomicrmw instructions under test. Both globals
; live in addrspace(3) and are left undef; the tests only pipe llc output
; into FileCheck. @local_var64 is not referenced in the tests immediately
; below (presumably used by i64 variants later in the file -- TODO confirm).
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; add_i32_constant: atomicrmw add of the constant 5 to @local_var32 with
; acq_rel ordering; the value returned by the atomic is stored to %out.
;
; What the checks below pin for each check prefix:
;   * the ds_add_rtn_u32 sits in block %bb.1, which is entered only by lanes
;     whose mbcnt result compares equal to 0;
;   * its data operand is s_bcnt1 of the saved exec mask multiplied by 5;
;   * after the branch rejoins, each lane forms its result from
;     v_readfirstlane_b32 of the atomic's return value plus mbcnt * 5
;     (a single v_mad_u32_u24).
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
177; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
178; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
179; GFX1164-NEXT:  ; %bb.1:
180; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
181; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
182; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
183; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
184; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
185; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
186; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
187; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX1164-NEXT:    buffer_gl0_inv
189; GFX1164-NEXT:  .LBB0_2:
190; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
191; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
192; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
193; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
194; GFX1164-NEXT:    s_mov_b32 s2, -1
195; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
197; GFX1164-NEXT:    s_endpgm
198;
199; GFX1132-LABEL: add_i32_constant:
200; GFX1132:       ; %bb.0: ; %entry
201; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
202; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
203; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
204; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
205; GFX1132-NEXT:    ; implicit-def: $vgpr1
206; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
207; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
208; GFX1132-NEXT:  ; %bb.1:
209; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
210; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
211; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
212; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
213; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
214; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
215; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
216; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX1132-NEXT:    buffer_gl0_inv
218; GFX1132-NEXT:  .LBB0_2:
219; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
220; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
221; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
222; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
223; GFX1132-NEXT:    s_mov_b32 s2, -1
224; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
226; GFX1132-NEXT:    s_endpgm
227entry:
228  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
229  store i32 %old, i32 addrspace(1)* %out
230  ret void
231}
232
; add_i32_uniform: atomicrmw add of the kernel argument %additive to
; @local_var32 with acq_rel ordering; the value returned by the atomic is
; stored to %out.
;
; What the checks below pin for each check prefix:
;   * the argument is fetched with a scalar load alongside the %out pointer;
;   * inside %bb.1 (entered only by lanes whose mbcnt result compares equal
;     to 0) the ds_add_rtn_u32 data operand is the loaded argument multiplied
;     by s_bcnt1 of the saved exec mask (s_mul_i32);
;   * after the branch rejoins, each lane computes argument * mbcnt plus the
;     v_readfirstlane_b32 of the atomic's return value: v_mul_lo_u32 followed
;     by an add for the GFX7LESS, GFX8 and GFX9 prefixes, and a single
;     v_mad_u64_u32 for the GFX10xx and GFX11xx prefixes.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
234;
235;
236; GFX7LESS-LABEL: add_i32_uniform:
237; GFX7LESS:       ; %bb.0: ; %entry
238; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
239; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
240; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
241; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
242; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
243; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
244; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
245; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
246; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
247; GFX7LESS-NEXT:  ; %bb.1:
248; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
249; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
251; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
252; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
253; GFX7LESS-NEXT:    s_mov_b32 m0, -1
254; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7LESS-NEXT:  .LBB1_2:
258; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
259; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
261; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
262; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
263; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
264; GFX7LESS-NEXT:    s_mov_b32 s6, -1
265; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
266; GFX7LESS-NEXT:    s_endpgm
267;
268; GFX8-LABEL: add_i32_uniform:
269; GFX8:       ; %bb.0: ; %entry
270; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
271; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
272; GFX8-NEXT:    s_mov_b64 s[2:3], exec
273; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
274; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
275; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
276; GFX8-NEXT:    ; implicit-def: $vgpr1
277; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
278; GFX8-NEXT:    s_cbranch_execz .LBB1_2
279; GFX8-NEXT:  ; %bb.1:
280; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
281; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX8-NEXT:    s_mul_i32 s2, s6, s2
283; GFX8-NEXT:    v_mov_b32_e32 v1, 0
284; GFX8-NEXT:    v_mov_b32_e32 v2, s2
285; GFX8-NEXT:    s_mov_b32 m0, -1
286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
288; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX8-NEXT:  .LBB1_2:
290; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
293; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
294; GFX8-NEXT:    s_mov_b32 s7, 0xf000
295; GFX8-NEXT:    s_mov_b32 s6, -1
296; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
297; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; GFX8-NEXT:    s_endpgm
299;
300; GFX9-LABEL: add_i32_uniform:
301; GFX9:       ; %bb.0: ; %entry
302; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
303; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
304; GFX9-NEXT:    s_mov_b64 s[2:3], exec
305; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
306; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
307; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
308; GFX9-NEXT:    ; implicit-def: $vgpr1
309; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
310; GFX9-NEXT:    s_cbranch_execz .LBB1_2
311; GFX9-NEXT:  ; %bb.1:
312; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    s_mul_i32 s2, s6, s2
315; GFX9-NEXT:    v_mov_b32_e32 v1, 0
316; GFX9-NEXT:    v_mov_b32_e32 v2, s2
317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-NEXT:  .LBB1_2:
321; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
324; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
325; GFX9-NEXT:    s_mov_b32 s7, 0xf000
326; GFX9-NEXT:    s_mov_b32 s6, -1
327; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
328; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
329; GFX9-NEXT:    s_endpgm
330;
331; GFX1064-LABEL: add_i32_uniform:
332; GFX1064:       ; %bb.0: ; %entry
333; GFX1064-NEXT:    s_clause 0x1
334; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
335; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
336; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
337; GFX1064-NEXT:    ; implicit-def: $vgpr1
338; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
339; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
340; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
341; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
342; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
343; GFX1064-NEXT:  ; %bb.1:
344; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
345; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
346; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
348; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
349; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
350; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
351; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
352; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX1064-NEXT:    buffer_gl0_inv
354; GFX1064-NEXT:  .LBB1_2:
355; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
356; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
357; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
358; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
359; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, s6, v0, s[0:1]
361; GFX1064-NEXT:    s_mov_b32 s6, -1
362; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
363; GFX1064-NEXT:    s_endpgm
364;
365; GFX1032-LABEL: add_i32_uniform:
366; GFX1032:       ; %bb.0: ; %entry
367; GFX1032-NEXT:    s_clause 0x1
368; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
369; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
370; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
371; GFX1032-NEXT:    ; implicit-def: $vgpr1
372; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
373; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
374; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
375; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
376; GFX1032-NEXT:  ; %bb.1:
377; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
378; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
379; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
381; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
382; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
383; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
384; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
385; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX1032-NEXT:    buffer_gl0_inv
387; GFX1032-NEXT:  .LBB1_2:
388; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
389; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
390; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
391; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
392; GFX1032-NEXT:    s_mov_b32 s6, -1
393; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v0, s[0:1]
395; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
396; GFX1032-NEXT:    s_endpgm
397;
398; GFX1164-LABEL: add_i32_uniform:
399; GFX1164:       ; %bb.0: ; %entry
400; GFX1164-NEXT:    s_clause 0x1
401; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
402; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
403; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
404; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
405; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
406; GFX1164-NEXT:    ; implicit-def: $vgpr1
407; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
408; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
409; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
410; GFX1164-NEXT:  ; %bb.1:
411; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
412; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
413; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
415; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
416; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
417; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
418; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
419; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX1164-NEXT:    buffer_gl0_inv
421; GFX1164-NEXT:  .LBB1_2:
422; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
423; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
424; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
425; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
427; GFX1164-NEXT:    s_mov_b32 s6, -1
428; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
429; GFX1164-NEXT:    s_endpgm
430;
431; GFX1132-LABEL: add_i32_uniform:
432; GFX1132:       ; %bb.0: ; %entry
433; GFX1132-NEXT:    s_clause 0x1
434; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
435; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
436; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
437; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
438; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
439; GFX1132-NEXT:    ; implicit-def: $vgpr1
440; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
441; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
442; GFX1132-NEXT:  ; %bb.1:
443; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
444; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
445; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
447; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
448; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
449; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
450; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
451; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX1132-NEXT:    buffer_gl0_inv
453; GFX1132-NEXT:  .LBB1_2:
454; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
455; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
456; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
457; GFX1132-NEXT:    s_mov_b32 s6, -1
458; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
460; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
461; GFX1132-NEXT:    s_endpgm
462entry:
463  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
464  store i32 %old, i32 addrspace(1)* %out
465  ret void
466}
467
468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
469;
470;
471; GFX7LESS-LABEL: add_i32_varying:
472; GFX7LESS:       ; %bb.0: ; %entry
473; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
474; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
475; GFX7LESS-NEXT:    s_mov_b32 m0, -1
476; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
478; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
480; GFX7LESS-NEXT:    s_mov_b32 s2, -1
481; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
482; GFX7LESS-NEXT:    s_endpgm
483;
484; GFX8-LABEL: add_i32_varying:
485; GFX8:       ; %bb.0: ; %entry
486; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
487; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
488; GFX8-NEXT:    v_mov_b32_e32 v1, 0
489; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
490; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
491; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
492; GFX8-NEXT:    v_mov_b32_e32 v2, v0
493; GFX8-NEXT:    s_not_b64 exec, exec
494; GFX8-NEXT:    v_mov_b32_e32 v2, 0
495; GFX8-NEXT:    s_not_b64 exec, exec
496; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
497; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
498; GFX8-NEXT:    s_nop 1
499; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
500; GFX8-NEXT:    s_nop 1
501; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
502; GFX8-NEXT:    s_nop 1
503; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX8-NEXT:    s_nop 1
505; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
506; GFX8-NEXT:    s_nop 1
507; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
508; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
509; GFX8-NEXT:    s_nop 0
510; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
511; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
512; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
513; GFX8-NEXT:    ; implicit-def: $vgpr0
514; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
515; GFX8-NEXT:    s_cbranch_execz .LBB2_2
516; GFX8-NEXT:  ; %bb.1:
517; GFX8-NEXT:    v_mov_b32_e32 v0, 0
518; GFX8-NEXT:    v_mov_b32_e32 v3, s4
519; GFX8-NEXT:    s_mov_b32 m0, -1
520; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
522; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX8-NEXT:  .LBB2_2:
524; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
525; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
527; GFX8-NEXT:    v_mov_b32_e32 v0, v1
528; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
529; GFX8-NEXT:    s_mov_b32 s3, 0xf000
530; GFX8-NEXT:    s_mov_b32 s2, -1
531; GFX8-NEXT:    s_nop 0
532; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
533; GFX8-NEXT:    s_endpgm
534;
535; GFX9-LABEL: add_i32_varying:
536; GFX9:       ; %bb.0: ; %entry
537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
538; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
539; GFX9-NEXT:    v_mov_b32_e32 v1, 0
540; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
541; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
542; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
543; GFX9-NEXT:    v_mov_b32_e32 v2, v0
544; GFX9-NEXT:    s_not_b64 exec, exec
545; GFX9-NEXT:    v_mov_b32_e32 v2, 0
546; GFX9-NEXT:    s_not_b64 exec, exec
547; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
548; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
549; GFX9-NEXT:    s_nop 1
550; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
551; GFX9-NEXT:    s_nop 1
552; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
553; GFX9-NEXT:    s_nop 1
554; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
555; GFX9-NEXT:    s_nop 1
556; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
557; GFX9-NEXT:    s_nop 1
558; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
559; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
560; GFX9-NEXT:    s_nop 0
561; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
562; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
563; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
564; GFX9-NEXT:    ; implicit-def: $vgpr0
565; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
566; GFX9-NEXT:    s_cbranch_execz .LBB2_2
567; GFX9-NEXT:  ; %bb.1:
568; GFX9-NEXT:    v_mov_b32_e32 v0, 0
569; GFX9-NEXT:    v_mov_b32_e32 v3, s4
570; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX9-NEXT:  .LBB2_2:
574; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
575; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
577; GFX9-NEXT:    v_mov_b32_e32 v0, v1
578; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
579; GFX9-NEXT:    s_mov_b32 s3, 0xf000
580; GFX9-NEXT:    s_mov_b32 s2, -1
581; GFX9-NEXT:    s_nop 0
582; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
583; GFX9-NEXT:    s_endpgm
584;
585; GFX1064-LABEL: add_i32_varying:
586; GFX1064:       ; %bb.0: ; %entry
587; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
588; GFX1064-NEXT:    s_not_b64 exec, exec
589; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
590; GFX1064-NEXT:    s_not_b64 exec, exec
591; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
592; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
593; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
594; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
595; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
596; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
597; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
598; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
599; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
600; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
601; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
602; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
603; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
604; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
605; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
606; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
607; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
608; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
609; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
610; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
611; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
612; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
613; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
614; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
615; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
616; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
617; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
618; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
619; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
620; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
621; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
622; GFX1064-NEXT:    s_mov_b32 s2, -1
623; GFX1064-NEXT:    ; implicit-def: $vgpr0
624; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
625; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
626; GFX1064-NEXT:  ; %bb.1:
627; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
628; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
629; GFX1064-NEXT:    s_mov_b32 s3, s7
630; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
631; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
632; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
633; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX1064-NEXT:    buffer_gl0_inv
635; GFX1064-NEXT:  .LBB2_2:
636; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
637; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
638; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
639; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
640; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
641; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
642; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
644; GFX1064-NEXT:    s_endpgm
645;
646; GFX1032-LABEL: add_i32_varying:
647; GFX1032:       ; %bb.0: ; %entry
648; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
649; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
650; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
651; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
652; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
653; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
654; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
655; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
656; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
657; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
658; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
659; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
660; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
661; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
662; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
663; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
664; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
665; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
666; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
667; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
668; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
669; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
670; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
671; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
672; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
673; GFX1032-NEXT:    s_mov_b32 s2, -1
674; GFX1032-NEXT:    ; implicit-def: $vgpr0
675; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
676; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
677; GFX1032-NEXT:  ; %bb.1:
678; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
679; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
680; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
682; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
683; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX1032-NEXT:    buffer_gl0_inv
685; GFX1032-NEXT:  .LBB2_2:
686; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
687; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
688; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
689; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
690; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
691; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
692; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
694; GFX1032-NEXT:    s_endpgm
695;
696; GFX1164-LABEL: add_i32_varying:
697; GFX1164:       ; %bb.0: ; %entry
698; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
699; GFX1164-NEXT:    s_not_b64 exec, exec
700; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
701; GFX1164-NEXT:    s_not_b64 exec, exec
702; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
703; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
705; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
707; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
708; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
709; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
710; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
711; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
712; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
713; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
714; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
715; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
716; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
717; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
718; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
719; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
720; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
721; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
722; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
723; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
724; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
725; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
726; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
727; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
728; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
729; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
730; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
731; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
732; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
733; GFX1164-NEXT:    s_mov_b32 s2, -1
734; GFX1164-NEXT:    ; implicit-def: $vgpr0
735; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
736; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
737; GFX1164-NEXT:  ; %bb.1:
738; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
739; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
740; GFX1164-NEXT:    s_mov_b32 s3, s7
741; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
742; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
743; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
744; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX1164-NEXT:    buffer_gl0_inv
746; GFX1164-NEXT:  .LBB2_2:
747; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
748; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
749; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
750; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
751; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
752; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
754; GFX1164-NEXT:    s_endpgm
755;
756; GFX1132-LABEL: add_i32_varying:
757; GFX1132:       ; %bb.0: ; %entry
758; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
759; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
760; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
761; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
762; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
763; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
764; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
765; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
766; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
767; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
768; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
769; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
770; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
771; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
772; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
773; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
774; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
775; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
776; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
777; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
778; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
779; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
780; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
781; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
782; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
783; GFX1132-NEXT:    s_mov_b32 s2, -1
784; GFX1132-NEXT:    ; implicit-def: $vgpr0
785; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
786; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
787; GFX1132-NEXT:  ; %bb.1:
788; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
789; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
790; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
791; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
792; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
793; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX1132-NEXT:    buffer_gl0_inv
795; GFX1132-NEXT:  .LBB2_2:
796; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
797; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
798; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
799; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
800; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
801; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
803; GFX1132-NEXT:    s_endpgm
804entry:
805  %lane = call i32 @llvm.amdgcn.workitem.id.x()
806  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
807  store i32 %old, i32 addrspace(1)* %out
808  ret void
809}
810
; add_i32_varying_nouse: every lane atomically adds its workitem id x to
; @local_var32 (acq_rel) and the value returned by the atomicrmw is never
; used. All expected sequences below therefore use the non-returning
; ds_add_u32. The GFX7LESS checks feed v0 straight into the DS add; the other
; configurations first combine the per-lane values with DPP adds and then
; issue the DS add under an exec mask derived from comparing the mbcnt
; result against 0.
811define amdgpu_kernel void @add_i32_varying_nouse() {
812; GFX7LESS-LABEL: add_i32_varying_nouse:
813; GFX7LESS:       ; %bb.0: ; %entry
814; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
815; GFX7LESS-NEXT:    s_mov_b32 m0, -1
816; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX7LESS-NEXT:    ds_add_u32 v1, v0
818; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX7LESS-NEXT:    s_endpgm
820;
821; GFX8-LABEL: add_i32_varying_nouse:
822; GFX8:       ; %bb.0: ; %entry
823; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
824; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
825; GFX8-NEXT:    v_mov_b32_e32 v1, v0
826; GFX8-NEXT:    s_not_b64 exec, exec
827; GFX8-NEXT:    v_mov_b32_e32 v1, 0
828; GFX8-NEXT:    s_not_b64 exec, exec
829; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
830; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
831; GFX8-NEXT:    s_nop 1
832; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
833; GFX8-NEXT:    s_nop 1
834; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
835; GFX8-NEXT:    s_nop 1
836; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
837; GFX8-NEXT:    s_nop 1
838; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
839; GFX8-NEXT:    s_nop 1
840; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
841; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
842; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
843; GFX8-NEXT:    s_mov_b32 s0, s2
844; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
845; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
846; GFX8-NEXT:    s_cbranch_execz .LBB3_2
847; GFX8-NEXT:  ; %bb.1:
848; GFX8-NEXT:    v_mov_b32_e32 v0, 0
849; GFX8-NEXT:    v_mov_b32_e32 v2, s0
850; GFX8-NEXT:    s_mov_b32 m0, -1
851; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX8-NEXT:    ds_add_u32 v0, v2
853; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX8-NEXT:  .LBB3_2:
855; GFX8-NEXT:    s_endpgm
856;
857; GFX9-LABEL: add_i32_varying_nouse:
858; GFX9:       ; %bb.0: ; %entry
859; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
860; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
861; GFX9-NEXT:    v_mov_b32_e32 v1, v0
862; GFX9-NEXT:    s_not_b64 exec, exec
863; GFX9-NEXT:    v_mov_b32_e32 v1, 0
864; GFX9-NEXT:    s_not_b64 exec, exec
865; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
866; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX9-NEXT:    s_nop 1
868; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX9-NEXT:    s_nop 1
870; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
871; GFX9-NEXT:    s_nop 1
872; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
873; GFX9-NEXT:    s_nop 1
874; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
875; GFX9-NEXT:    s_nop 1
876; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
877; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
878; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
879; GFX9-NEXT:    s_mov_b32 s0, s2
880; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
881; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
882; GFX9-NEXT:    s_cbranch_execz .LBB3_2
883; GFX9-NEXT:  ; %bb.1:
884; GFX9-NEXT:    v_mov_b32_e32 v0, 0
885; GFX9-NEXT:    v_mov_b32_e32 v2, s0
886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX9-NEXT:    ds_add_u32 v0, v2
888; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX9-NEXT:  .LBB3_2:
890; GFX9-NEXT:    s_endpgm
891;
892; GFX1064-LABEL: add_i32_varying_nouse:
893; GFX1064:       ; %bb.0: ; %entry
894; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
895; GFX1064-NEXT:    s_not_b64 exec, exec
896; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
897; GFX1064-NEXT:    s_not_b64 exec, exec
898; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
899; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
900; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
902; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
904; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
905; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
906; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
907; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
908; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
909; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
910; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
911; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
912; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
913; GFX1064-NEXT:    s_add_i32 s0, s2, s3
914; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
915; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
916; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
917; GFX1064-NEXT:  ; %bb.1:
918; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
919; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
920; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
921; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
922; GFX1064-NEXT:    ds_add_u32 v0, v3
923; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX1064-NEXT:    buffer_gl0_inv
925; GFX1064-NEXT:  .LBB3_2:
926; GFX1064-NEXT:    s_endpgm
927;
928; GFX1032-LABEL: add_i32_varying_nouse:
929; GFX1032:       ; %bb.0: ; %entry
930; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
931; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
932; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
933; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
934; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
935; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
936; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
937; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
938; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
939; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
940; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
941; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
942; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
943; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
944; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
945; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
946; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
947; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
948; GFX1032-NEXT:  ; %bb.1:
949; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
950; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
951; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
952; GFX1032-NEXT:    ds_add_u32 v3, v0
953; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX1032-NEXT:    buffer_gl0_inv
955; GFX1032-NEXT:  .LBB3_2:
956; GFX1032-NEXT:    s_endpgm
957;
958; GFX1164-LABEL: add_i32_varying_nouse:
959; GFX1164:       ; %bb.0: ; %entry
960; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
961; GFX1164-NEXT:    s_not_b64 exec, exec
962; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
963; GFX1164-NEXT:    s_not_b64 exec, exec
964; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
965; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
966; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
967; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
968; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
969; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
970; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
971; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
972; GFX1164-NEXT:    v_permlane64_b32 v2, v1
973; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
974; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
975; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
976; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
977; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
978; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
979; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
980; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
981; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
982; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
983; GFX1164-NEXT:  ; %bb.1:
984; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
985; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
987; GFX1164-NEXT:    ds_add_u32 v3, v0
988; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX1164-NEXT:    buffer_gl0_inv
990; GFX1164-NEXT:  .LBB3_2:
991; GFX1164-NEXT:    s_endpgm
992;
993; GFX1132-LABEL: add_i32_varying_nouse:
994; GFX1132:       ; %bb.0: ; %entry
995; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
996; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
997; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
998; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
999; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1000; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1001; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1002; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1003; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1004; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1005; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1006; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1007; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1008; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1009; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1010; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1011; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1012; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1013; GFX1132-NEXT:  ; %bb.1:
1014; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1015; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1016; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1017; GFX1132-NEXT:    ds_add_u32 v3, v0
1018; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX1132-NEXT:    buffer_gl0_inv
1020; GFX1132-NEXT:  .LBB3_2:
1021; GFX1132-NEXT:    s_endpgm
1022entry:
1023  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is intentionally left without any user; the kernel returns right
  ; after the atomic.
1024  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1025  ret void
1026}
1027
; add_i64_constant: atomically adds the constant 5 to the 64-bit LDS variable
; @local_var64 (acq_rel) and stores the value returned by the atomicrmw to
; %out. In every expected sequence below the ds_add_rtn_u64 is guarded by a
; compare of the mbcnt result against 0, and its operand is the population
; count of the saved exec mask multiplied by 5. After the guarded region the
; returned value is read back with v_readfirstlane_b32 and combined with
; 5 times the per-lane mbcnt result before being stored.
1028define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1029;
1030;
1031; GFX7LESS-LABEL: add_i64_constant:
1032; GFX7LESS:       ; %bb.0: ; %entry
1033; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1034; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1035; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1036; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1037; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1038; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1039; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1040; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1041; GFX7LESS-NEXT:  ; %bb.1:
1042; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1043; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1044; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1045; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1046; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1047; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1049; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1050; GFX7LESS-NEXT:  .LBB4_2:
1051; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1052; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1054; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1055; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1056; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1057; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1058; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1059; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1060; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1061; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1062; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1063; GFX7LESS-NEXT:    s_endpgm
1064;
1065; GFX8-LABEL: add_i64_constant:
1066; GFX8:       ; %bb.0: ; %entry
1067; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1068; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1069; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1070; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1071; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1072; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1073; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1074; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1075; GFX8-NEXT:  ; %bb.1:
1076; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1077; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1078; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1079; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1080; GFX8-NEXT:    s_mov_b32 m0, -1
1081; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1083; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX8-NEXT:  .LBB4_2:
1085; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1086; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1087; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1088; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1089; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1090; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1091; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1092; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1093; GFX8-NEXT:    s_mov_b32 s2, -1
1094; GFX8-NEXT:    s_nop 2
1095; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1096; GFX8-NEXT:    s_endpgm
1097;
1098; GFX9-LABEL: add_i64_constant:
1099; GFX9:       ; %bb.0: ; %entry
1100; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1101; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1102; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1103; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1104; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1105; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1106; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1107; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1108; GFX9-NEXT:  ; %bb.1:
1109; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1110; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1111; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1112; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1113; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX9-NEXT:  .LBB4_2:
1117; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1120; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1121; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1122; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1123; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1124; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1125; GFX9-NEXT:    s_mov_b32 s2, -1
1126; GFX9-NEXT:    s_nop 2
1127; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1128; GFX9-NEXT:    s_endpgm
1129;
1130; GFX1064-LABEL: add_i64_constant:
1131; GFX1064:       ; %bb.0: ; %entry
1132; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1133; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1134; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1135; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1136; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1137; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1138; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1139; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1140; GFX1064-NEXT:  ; %bb.1:
1141; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1142; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1143; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1144; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1145; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1146; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1147; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1148; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX1064-NEXT:    buffer_gl0_inv
1150; GFX1064-NEXT:  .LBB4_2:
1151; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1152; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1153; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1154; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1155; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1156; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1157; GFX1064-NEXT:    s_mov_b32 s2, -1
1158; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1159; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1160; GFX1064-NEXT:    s_endpgm
1161;
1162; GFX1032-LABEL: add_i64_constant:
1163; GFX1032:       ; %bb.0: ; %entry
1164; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1165; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1166; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1167; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1168; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1169; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1170; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1171; GFX1032-NEXT:  ; %bb.1:
1172; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1173; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1174; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1175; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1176; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1177; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1178; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1179; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX1032-NEXT:    buffer_gl0_inv
1181; GFX1032-NEXT:  .LBB4_2:
1182; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1183; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1184; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1185; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1186; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1187; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1188; GFX1032-NEXT:    s_mov_b32 s2, -1
1189; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1191; GFX1032-NEXT:    s_endpgm
1192;
1193; GFX1164-LABEL: add_i64_constant:
1194; GFX1164:       ; %bb.0: ; %entry
1195; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1196; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1197; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1198; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1199; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1200; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1201; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1202; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1203; GFX1164-NEXT:  ; %bb.1:
1204; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1205; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1206; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1207; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1208; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1209; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1210; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1211; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX1164-NEXT:    buffer_gl0_inv
1213; GFX1164-NEXT:  .LBB4_2:
1214; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1215; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1216; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1217; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1218; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1219; GFX1164-NEXT:    s_mov_b32 s2, -1
1220; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1221; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1222; GFX1164-NEXT:    s_endpgm
1223;
1224; GFX1132-LABEL: add_i64_constant:
1225; GFX1132:       ; %bb.0: ; %entry
1226; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1227; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1228; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1229; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1230; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1231; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1232; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1233; GFX1132-NEXT:  ; %bb.1:
1234; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1235; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1236; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1237; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1238; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1239; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1240; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1241; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX1132-NEXT:    buffer_gl0_inv
1243; GFX1132-NEXT:  .LBB4_2:
1244; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1245; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1246; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1247; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1248; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1249; GFX1132-NEXT:    s_mov_b32 s2, -1
1250; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1251; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1252; GFX1132-NEXT:    s_endpgm
1253entry:
1254  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1255  store i64 %old, i64 addrspace(1)* %out
1256  ret void
1257}
1258
1259define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1260;
1261;
1262; GFX7LESS-LABEL: add_i64_uniform:
1263; GFX7LESS:       ; %bb.0: ; %entry
1264; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1265; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1266; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1267; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1268; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1269; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1270; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1271; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1272; GFX7LESS-NEXT:  ; %bb.1:
1273; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1274; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1275; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1276; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1277; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1278; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1279; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1280; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1281; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1282; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1284; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1285; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1286; GFX7LESS-NEXT:  .LBB5_2:
1287; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1288; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1289; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1290; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1292; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1293; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1294; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1295; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1296; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1297; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1298; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1299; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1300; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1301; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1302; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1303; GFX7LESS-NEXT:    s_endpgm
1304;
1305; GFX8-LABEL: add_i64_uniform:
1306; GFX8:       ; %bb.0: ; %entry
1307; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1308; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1309; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1310; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1311; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1312; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1313; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1314; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1315; GFX8-NEXT:  ; %bb.1:
1316; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1317; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1318; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1320; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1321; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1322; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1323; GFX8-NEXT:    s_mov_b32 m0, -1
1324; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1326; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX8-NEXT:  .LBB5_2:
1328; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1329; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1331; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1332; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1333; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1334; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1335; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1336; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1337; GFX8-NEXT:    s_mov_b32 s6, -1
1338; GFX8-NEXT:    s_mov_b32 s4, s0
1339; GFX8-NEXT:    s_mov_b32 s5, s1
1340; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1341; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1342; GFX8-NEXT:    s_endpgm
1343;
1344; GFX9-LABEL: add_i64_uniform:
1345; GFX9:       ; %bb.0: ; %entry
1346; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1347; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1348; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1349; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1350; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1351; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1352; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1353; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1354; GFX9-NEXT:  ; %bb.1:
1355; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1356; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1358; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1359; GFX9-NEXT:    s_add_i32 s8, s8, s7
1360; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1361; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1362; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1363; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX9-NEXT:  .LBB5_2:
1368; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1369; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1371; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1372; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1373; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1374; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1375; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1376; GFX9-NEXT:    s_mov_b32 s6, -1
1377; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1378; GFX9-NEXT:    s_mov_b32 s4, s0
1379; GFX9-NEXT:    s_mov_b32 s5, s1
1380; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1381; GFX9-NEXT:    s_endpgm
1382;
1383; GFX1064-LABEL: add_i64_uniform:
1384; GFX1064:       ; %bb.0: ; %entry
1385; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1386; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1387; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1388; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1389; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1390; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1391; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1392; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1393; GFX1064-NEXT:  ; %bb.1:
1394; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1395; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1396; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1398; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1399; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1400; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1401; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1402; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1403; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1405; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1406; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX1064-NEXT:    buffer_gl0_inv
1408; GFX1064-NEXT:  .LBB5_2:
1409; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1410; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1411; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1412; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1413; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1415; GFX1064-NEXT:    s_mov_b32 s2, -1
1416; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2]
1417; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1418; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1419; GFX1064-NEXT:    s_endpgm
1420;
1421; GFX1032-LABEL: add_i64_uniform:
1422; GFX1032:       ; %bb.0: ; %entry
1423; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1424; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1425; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1426; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1427; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1428; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1429; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1430; GFX1032-NEXT:  ; %bb.1:
1431; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1432; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1433; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1434; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1435; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1436; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1437; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1438; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1439; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1440; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1441; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1442; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1443; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1444; GFX1032-NEXT:    buffer_gl0_inv
1445; GFX1032-NEXT:  .LBB5_2:
1446; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1447; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1448; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1449; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1450; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1452; GFX1032-NEXT:    s_mov_b32 s2, -1
1453; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2]
1454; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1455; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1456; GFX1032-NEXT:    s_endpgm
1457;
1458; GFX1164-LABEL: add_i64_uniform:
1459; GFX1164:       ; %bb.0: ; %entry
1460; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1461; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1462; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1463; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1464; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1465; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1466; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1467; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1468; GFX1164-NEXT:  ; %bb.1:
1469; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1470; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1471; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1473; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1474; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1475; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1476; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1477; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1478; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1479; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1480; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1481; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX1164-NEXT:    buffer_gl0_inv
1483; GFX1164-NEXT:  .LBB5_2:
1484; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1485; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1486; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1487; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1489; GFX1164-NEXT:    s_mov_b32 s2, -1
1490; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1491; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1492; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1493; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1494; GFX1164-NEXT:    s_endpgm
1495;
1496; GFX1132-LABEL: add_i64_uniform:
1497; GFX1132:       ; %bb.0: ; %entry
1498; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1499; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1500; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1501; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1502; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1503; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1504; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1505; GFX1132-NEXT:  ; %bb.1:
1506; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1507; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1508; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1510; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1511; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1512; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1513; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
1514; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
1515; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1516; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1517; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1518; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX1132-NEXT:    buffer_gl0_inv
1520; GFX1132-NEXT:  .LBB5_2:
1521; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1522; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1523; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1524; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1526; GFX1132-NEXT:    s_mov_b32 s2, -1
1527; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1528; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1529; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1530; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1531; GFX1132-NEXT:    s_endpgm
1532entry:
1533  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1534  store i64 %old, i64 addrspace(1)* %out
1535  ret void
1536}
1537
; add_i64_varying: 64-bit atomicrmw add on @local_var64 (acq_rel) whose operand
; is the zero-extended result of llvm.amdgcn.workitem.id.x; the value returned
; by the atomic is stored to %out.
;
; The autogenerated checks below expect, on every checked target, a single
; ds_add_rtn_u64 followed directly by the buffer store, with none of the
; v_mbcnt / s_and_saveexec / v_readfirstlane code that appears in the checks of
; sub_i32_constant and sub_i32_uniform later in this file.
; NOTE(review): presumably the atomic optimization pass leaves this case
; untouched because the operand is a per-lane 64-bit value - confirm.
1538define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1539;
1540;
1541; GFX7LESS-LABEL: add_i64_varying:
1542; GFX7LESS:       ; %bb.0: ; %entry
1543; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1544; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1545; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1546; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1548; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1550; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1551; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1552; GFX7LESS-NEXT:    s_endpgm
1553;
1554; GFX8-LABEL: add_i64_varying:
1555; GFX8:       ; %bb.0: ; %entry
1556; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1557; GFX8-NEXT:    s_mov_b32 m0, -1
1558; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1559; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1561; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1562; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1563; GFX8-NEXT:    s_mov_b32 s2, -1
1564; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1565; GFX8-NEXT:    s_endpgm
1566;
1567; GFX9-LABEL: add_i64_varying:
1568; GFX9:       ; %bb.0: ; %entry
1569; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1570; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1571; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1575; GFX9-NEXT:    s_mov_b32 s2, -1
1576; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1577; GFX9-NEXT:    s_endpgm
1578;
1579; GFX10-LABEL: add_i64_varying:
1580; GFX10:       ; %bb.0: ; %entry
1581; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1582; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1583; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1584; GFX10-NEXT:    s_mov_b32 s2, -1
1585; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1586; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1587; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1588; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1589; GFX10-NEXT:    buffer_gl0_inv
1590; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1591; GFX10-NEXT:    s_endpgm
1592;
1593; GFX11-LABEL: add_i64_varying:
1594; GFX11:       ; %bb.0: ; %entry
1595; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1596; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1597; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1598; GFX11-NEXT:    s_mov_b32 s2, -1
1599; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1600; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1601; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1602; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX11-NEXT:    buffer_gl0_inv
1604; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1605; GFX11-NEXT:    s_endpgm
1606entry:
  ; The atomic operand is the workitem id in x, widened to i64.
1607  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1608  %zext = zext i32 %lane to i64
  ; Atomic add on the 64-bit addrspace(3) global; %old is the value it returns.
1609  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1610  store i64 %old, i64 addrspace(1)* %out
1611  ret void
1612}
1613
; sub_i32_constant: 32-bit atomicrmw sub of the constant 5 on @local_var32
; (acq_rel); the value returned by the atomic is stored to %out.
;
; Shape of the code the autogenerated checks below expect on every target:
;   * exec is copied to SGPRs and fed to v_mbcnt; the v_mbcnt result is compared
;     with 0 and that comparison guards block %bb.1;
;   * %bb.1 holds the only ds_sub_rtn_u32, whose data operand is
;     s_bcnt1(saved exec) multiplied by 5;
;   * after exec is restored, the stored value is
;     v_readfirstlane(DS result) - 5 * (v_mbcnt result).
; The wave64 runs use the _b64 exec forms plus v_mbcnt_hi; the wave32 runs use
; exec_lo and v_mbcnt_lo only.
1614define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1615;
1616;
1617; GFX7LESS-LABEL: sub_i32_constant:
1618; GFX7LESS:       ; %bb.0: ; %entry
1619; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1620; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1621; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1622; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1623; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1624; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1625; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1626; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1627; GFX7LESS-NEXT:  ; %bb.1:
1628; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1629; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1630; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1631; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1632; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1633; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1635; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX7LESS-NEXT:  .LBB7_2:
1637; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1638; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1639; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1640; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1641; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1642; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1643; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1644; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1645; GFX7LESS-NEXT:    s_endpgm
1646;
1647; GFX8-LABEL: sub_i32_constant:
1648; GFX8:       ; %bb.0: ; %entry
1649; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1650; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1651; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1652; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1653; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1654; GFX8-NEXT:    ; implicit-def: $vgpr1
1655; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1656; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1657; GFX8-NEXT:  ; %bb.1:
1658; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1659; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1660; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1661; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1662; GFX8-NEXT:    s_mov_b32 m0, -1
1663; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1664; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1665; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1666; GFX8-NEXT:  .LBB7_2:
1667; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1668; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1670; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1671; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1672; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1673; GFX8-NEXT:    s_mov_b32 s2, -1
1674; GFX8-NEXT:    s_nop 0
1675; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1676; GFX8-NEXT:    s_endpgm
1677;
1678; GFX9-LABEL: sub_i32_constant:
1679; GFX9:       ; %bb.0: ; %entry
1680; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1681; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1682; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1683; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1684; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1685; GFX9-NEXT:    ; implicit-def: $vgpr1
1686; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1687; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1688; GFX9-NEXT:  ; %bb.1:
1689; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1690; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1691; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1692; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1696; GFX9-NEXT:  .LBB7_2:
1697; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1700; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1701; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1702; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1703; GFX9-NEXT:    s_mov_b32 s2, -1
1704; GFX9-NEXT:    s_nop 0
1705; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1706; GFX9-NEXT:    s_endpgm
1707;
1708; GFX1064-LABEL: sub_i32_constant:
1709; GFX1064:       ; %bb.0: ; %entry
1710; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1711; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1712; GFX1064-NEXT:    ; implicit-def: $vgpr1
1713; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1714; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1715; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1716; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1717; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1718; GFX1064-NEXT:  ; %bb.1:
1719; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1720; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1721; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1722; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1723; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1724; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1725; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1726; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX1064-NEXT:    buffer_gl0_inv
1728; GFX1064-NEXT:  .LBB7_2:
1729; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1730; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1731; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1732; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1733; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1734; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1735; GFX1064-NEXT:    s_mov_b32 s2, -1
1736; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1738; GFX1064-NEXT:    s_endpgm
1739;
1740; GFX1032-LABEL: sub_i32_constant:
1741; GFX1032:       ; %bb.0: ; %entry
1742; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1743; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1744; GFX1032-NEXT:    ; implicit-def: $vgpr1
1745; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1746; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1747; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1748; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1749; GFX1032-NEXT:  ; %bb.1:
1750; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1751; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1752; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1753; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1754; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1755; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1756; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1757; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1758; GFX1032-NEXT:    buffer_gl0_inv
1759; GFX1032-NEXT:  .LBB7_2:
1760; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1761; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1762; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1763; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1764; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1765; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1766; GFX1032-NEXT:    s_mov_b32 s2, -1
1767; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1769; GFX1032-NEXT:    s_endpgm
1770;
1771; GFX1164-LABEL: sub_i32_constant:
1772; GFX1164:       ; %bb.0: ; %entry
1773; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1774; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1775; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1776; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1777; GFX1164-NEXT:    ; implicit-def: $vgpr1
1778; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1779; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1780; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1781; GFX1164-NEXT:  ; %bb.1:
1782; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1783; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1784; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1785; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1786; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1787; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1788; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1789; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1790; GFX1164-NEXT:    buffer_gl0_inv
1791; GFX1164-NEXT:  .LBB7_2:
1792; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1793; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1794; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1795; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1796; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1797; GFX1164-NEXT:    s_mov_b32 s2, -1
1798; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1800; GFX1164-NEXT:    s_endpgm
1801;
1802; GFX1132-LABEL: sub_i32_constant:
1803; GFX1132:       ; %bb.0: ; %entry
1804; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1805; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1806; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1807; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1808; GFX1132-NEXT:    ; implicit-def: $vgpr1
1809; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1810; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1811; GFX1132-NEXT:  ; %bb.1:
1812; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1813; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1814; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1815; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
1816; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1817; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1818; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1819; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX1132-NEXT:    buffer_gl0_inv
1821; GFX1132-NEXT:  .LBB7_2:
1822; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1823; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1824; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1825; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1826; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1827; GFX1132-NEXT:    s_mov_b32 s2, -1
1828; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1830; GFX1132-NEXT:    s_endpgm
1831entry:
  ; Atomic subtract of the literal 5 from the 32-bit addrspace(3) global;
  ; %old is the value the atomic returns.
1832  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1833  store i32 %old, i32 addrspace(1)* %out
1834  ret void
1835}
1836
; sub_i32_uniform: 32-bit atomicrmw sub on @local_var32 (acq_rel) whose operand
; is the kernel argument %subitive; the value returned by the atomic is stored
; to %out.
;
; Shape of the code the autogenerated checks below expect on every target:
;   * %subitive is fetched with a scalar load next to the load of %out;
;   * exec is copied to SGPRs and fed to v_mbcnt; the v_mbcnt result is compared
;     with 0 and that comparison guards block %bb.1;
;   * %bb.1 holds the only ds_sub_rtn_u32, whose data operand is
;     %subitive * s_bcnt1(saved exec), computed with s_mul_i32;
;   * after exec is restored, the stored value is
;     v_readfirstlane(DS result) - %subitive * (v_mbcnt result), with the
;     product computed by v_mul_lo_u32.
1837define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1838;
1839;
1840; GFX7LESS-LABEL: sub_i32_uniform:
1841; GFX7LESS:       ; %bb.0: ; %entry
1842; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1843; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1844; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1845; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1846; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1847; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1848; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1849; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1850; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1851; GFX7LESS-NEXT:  ; %bb.1:
1852; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1853; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1854; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1855; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1856; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1857; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1858; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1859; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1860; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX7LESS-NEXT:  .LBB8_2:
1862; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1863; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1864; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1865; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1866; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1867; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1868; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1869; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1870; GFX7LESS-NEXT:    s_endpgm
1871;
1872; GFX8-LABEL: sub_i32_uniform:
1873; GFX8:       ; %bb.0: ; %entry
1874; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1875; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1876; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1877; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1878; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1879; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1880; GFX8-NEXT:    ; implicit-def: $vgpr1
1881; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1882; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1883; GFX8-NEXT:  ; %bb.1:
1884; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1885; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1887; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1888; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1889; GFX8-NEXT:    s_mov_b32 m0, -1
1890; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1891; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1892; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1893; GFX8-NEXT:  .LBB8_2:
1894; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1895; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1896; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1897; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1898; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1899; GFX8-NEXT:    s_mov_b32 s6, -1
1900; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1901; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1902; GFX8-NEXT:    s_endpgm
1903;
1904; GFX9-LABEL: sub_i32_uniform:
1905; GFX9:       ; %bb.0: ; %entry
1906; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1907; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1908; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1909; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1910; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1911; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1912; GFX9-NEXT:    ; implicit-def: $vgpr1
1913; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1914; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1915; GFX9-NEXT:  ; %bb.1:
1916; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1919; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1920; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1921; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1923; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1924; GFX9-NEXT:  .LBB8_2:
1925; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1926; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1928; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1929; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1930; GFX9-NEXT:    s_mov_b32 s6, -1
1931; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1932; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1933; GFX9-NEXT:    s_endpgm
1934;
1935; GFX1064-LABEL: sub_i32_uniform:
1936; GFX1064:       ; %bb.0: ; %entry
1937; GFX1064-NEXT:    s_clause 0x1
1938; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1939; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
1940; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1941; GFX1064-NEXT:    ; implicit-def: $vgpr1
1942; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1943; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1944; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1945; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1946; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
1947; GFX1064-NEXT:  ; %bb.1:
1948; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1949; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1950; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1951; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
1952; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1953; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1954; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1955; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1956; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX1064-NEXT:    buffer_gl0_inv
1958; GFX1064-NEXT:  .LBB8_2:
1959; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1960; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1961; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
1963; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1964; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1965; GFX1064-NEXT:    s_mov_b32 s6, -1
1966; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1967; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1968; GFX1064-NEXT:    s_endpgm
1969;
1970; GFX1032-LABEL: sub_i32_uniform:
1971; GFX1032:       ; %bb.0: ; %entry
1972; GFX1032-NEXT:    s_clause 0x1
1973; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1974; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1975; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1976; GFX1032-NEXT:    ; implicit-def: $vgpr1
1977; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1978; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1979; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1980; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
1981; GFX1032-NEXT:  ; %bb.1:
1982; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1983; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1984; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1986; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1987; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1988; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1989; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1990; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX1032-NEXT:    buffer_gl0_inv
1992; GFX1032-NEXT:  .LBB8_2:
1993; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1994; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1995; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1996; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1997; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1998; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1999; GFX1032-NEXT:    s_mov_b32 s6, -1
2000; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2001; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2002; GFX1032-NEXT:    s_endpgm
2003;
2004; GFX1164-LABEL: sub_i32_uniform:
2005; GFX1164:       ; %bb.0: ; %entry
2006; GFX1164-NEXT:    s_clause 0x1
2007; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2008; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2009; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2010; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2011; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2012; GFX1164-NEXT:    ; implicit-def: $vgpr1
2013; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2014; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2015; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2016; GFX1164-NEXT:  ; %bb.1:
2017; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2018; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2019; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2021; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2022; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2023; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2024; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2025; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2026; GFX1164-NEXT:    buffer_gl0_inv
2027; GFX1164-NEXT:  .LBB8_2:
2028; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2029; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2031; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2032; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2033; GFX1164-NEXT:    s_mov_b32 s6, -1
2034; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2035; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2036; GFX1164-NEXT:    s_endpgm
2037;
2038; GFX1132-LABEL: sub_i32_uniform:
2039; GFX1132:       ; %bb.0: ; %entry
2040; GFX1132-NEXT:    s_clause 0x1
2041; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2042; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2043; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2044; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2045; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2046; GFX1132-NEXT:    ; implicit-def: $vgpr1
2047; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2048; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2049; GFX1132-NEXT:  ; %bb.1:
2050; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2051; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2052; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2054; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
2055; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2056; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2057; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2058; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2059; GFX1132-NEXT:    buffer_gl0_inv
2060; GFX1132-NEXT:  .LBB8_2:
2061; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2062; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2063; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2064; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2065; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2066; GFX1132-NEXT:    s_mov_b32 s6, -1
2067; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2068; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2069; GFX1132-NEXT:    s_endpgm
2070entry:
  ; Atomic subtract of the kernel argument from the 32-bit addrspace(3) global;
  ; %old is the value the atomic returns.
2071  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
2072  store i32 %old, i32 addrspace(1)* %out
2073  ret void
2074}
2075
2076define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2077;
2078;
2079; GFX7LESS-LABEL: sub_i32_varying:
2080; GFX7LESS:       ; %bb.0: ; %entry
2081; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2082; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2083; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2084; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2086; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2087; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2088; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2089; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2090; GFX7LESS-NEXT:    s_endpgm
2091;
2092; GFX8-LABEL: sub_i32_varying:
2093; GFX8:       ; %bb.0: ; %entry
2094; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2095; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2096; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2097; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2098; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2099; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2100; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2101; GFX8-NEXT:    s_not_b64 exec, exec
2102; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2103; GFX8-NEXT:    s_not_b64 exec, exec
2104; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2105; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2106; GFX8-NEXT:    s_nop 1
2107; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2108; GFX8-NEXT:    s_nop 1
2109; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2110; GFX8-NEXT:    s_nop 1
2111; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2112; GFX8-NEXT:    s_nop 1
2113; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2114; GFX8-NEXT:    s_nop 1
2115; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2116; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2117; GFX8-NEXT:    s_nop 0
2118; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2119; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2120; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2121; GFX8-NEXT:    ; implicit-def: $vgpr0
2122; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2123; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2124; GFX8-NEXT:  ; %bb.1:
2125; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2126; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2127; GFX8-NEXT:    s_mov_b32 m0, -1
2128; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2129; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2130; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2131; GFX8-NEXT:  .LBB9_2:
2132; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2133; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2134; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2135; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2136; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2137; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2138; GFX8-NEXT:    s_mov_b32 s2, -1
2139; GFX8-NEXT:    s_nop 0
2140; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2141; GFX8-NEXT:    s_endpgm
2142;
2143; GFX9-LABEL: sub_i32_varying:
2144; GFX9:       ; %bb.0: ; %entry
2145; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2146; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2147; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2148; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2149; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2150; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2151; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2152; GFX9-NEXT:    s_not_b64 exec, exec
2153; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2154; GFX9-NEXT:    s_not_b64 exec, exec
2155; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2156; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2157; GFX9-NEXT:    s_nop 1
2158; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2159; GFX9-NEXT:    s_nop 1
2160; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2161; GFX9-NEXT:    s_nop 1
2162; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2163; GFX9-NEXT:    s_nop 1
2164; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2165; GFX9-NEXT:    s_nop 1
2166; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2167; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2168; GFX9-NEXT:    s_nop 0
2169; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2170; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2171; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2172; GFX9-NEXT:    ; implicit-def: $vgpr0
2173; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2174; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2175; GFX9-NEXT:  ; %bb.1:
2176; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2177; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2178; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2179; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX9-NEXT:  .LBB9_2:
2182; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2184; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2185; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2186; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2187; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2188; GFX9-NEXT:    s_mov_b32 s2, -1
2189; GFX9-NEXT:    s_nop 0
2190; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2191; GFX9-NEXT:    s_endpgm
2192;
2193; GFX1064-LABEL: sub_i32_varying:
2194; GFX1064:       ; %bb.0: ; %entry
2195; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2196; GFX1064-NEXT:    s_not_b64 exec, exec
2197; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2198; GFX1064-NEXT:    s_not_b64 exec, exec
2199; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2200; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2201; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2202; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2203; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2204; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2205; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2206; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2207; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2208; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2209; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2210; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2211; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2212; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2213; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2214; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2215; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2216; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2217; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2218; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2219; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2220; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2221; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2222; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2223; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2224; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2225; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2226; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2227; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2228; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2229; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2230; GFX1064-NEXT:    s_mov_b32 s2, -1
2231; GFX1064-NEXT:    ; implicit-def: $vgpr0
2232; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2233; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2234; GFX1064-NEXT:  ; %bb.1:
2235; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2236; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2237; GFX1064-NEXT:    s_mov_b32 s3, s7
2238; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2239; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2240; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2241; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2242; GFX1064-NEXT:    buffer_gl0_inv
2243; GFX1064-NEXT:  .LBB9_2:
2244; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2245; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2246; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2247; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2248; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2249; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2250; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2252; GFX1064-NEXT:    s_endpgm
2253;
2254; GFX1032-LABEL: sub_i32_varying:
2255; GFX1032:       ; %bb.0: ; %entry
2256; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2257; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2258; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2259; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2260; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2261; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2262; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2263; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2264; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2265; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2266; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2267; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2268; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2269; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2270; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2271; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2272; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2273; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2274; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2275; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2276; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2277; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2278; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2279; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2280; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2281; GFX1032-NEXT:    s_mov_b32 s2, -1
2282; GFX1032-NEXT:    ; implicit-def: $vgpr0
2283; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2284; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2285; GFX1032-NEXT:  ; %bb.1:
2286; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2287; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2288; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2289; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2290; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2291; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX1032-NEXT:    buffer_gl0_inv
2293; GFX1032-NEXT:  .LBB9_2:
2294; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2295; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2296; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2297; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2298; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2299; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2300; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2301; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2302; GFX1032-NEXT:    s_endpgm
2303;
2304; GFX1164-LABEL: sub_i32_varying:
2305; GFX1164:       ; %bb.0: ; %entry
2306; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2307; GFX1164-NEXT:    s_not_b64 exec, exec
2308; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2309; GFX1164-NEXT:    s_not_b64 exec, exec
2310; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2311; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2312; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2313; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2314; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2315; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2316; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2317; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2318; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2319; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2320; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2321; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2322; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2323; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2324; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2325; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2326; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2327; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2328; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2329; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2330; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2331; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2332; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2333; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2334; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2335; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2336; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2337; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2338; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2339; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2340; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2341; GFX1164-NEXT:    s_mov_b32 s2, -1
2342; GFX1164-NEXT:    ; implicit-def: $vgpr0
2343; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2344; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2345; GFX1164-NEXT:  ; %bb.1:
2346; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2347; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2348; GFX1164-NEXT:    s_mov_b32 s3, s7
2349; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2350; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2351; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2352; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2353; GFX1164-NEXT:    buffer_gl0_inv
2354; GFX1164-NEXT:  .LBB9_2:
2355; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2356; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2357; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2358; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2359; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2360; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2362; GFX1164-NEXT:    s_endpgm
2363;
2364; GFX1132-LABEL: sub_i32_varying:
2365; GFX1132:       ; %bb.0: ; %entry
2366; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2367; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2368; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2369; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2370; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2371; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2372; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2373; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2374; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2375; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2376; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2377; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2378; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2379; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2380; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2381; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2382; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2383; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2384; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2385; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2386; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2387; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2388; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2389; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2390; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2391; GFX1132-NEXT:    s_mov_b32 s2, -1
2392; GFX1132-NEXT:    ; implicit-def: $vgpr0
2393; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2394; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2395; GFX1132-NEXT:  ; %bb.1:
2396; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2397; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2398; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2399; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2400; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2401; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX1132-NEXT:    buffer_gl0_inv
2403; GFX1132-NEXT:  .LBB9_2:
2404; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2405; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2406; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2407; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2408; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2409; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2410; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2411; GFX1132-NEXT:    s_endpgm
2412entry:
2413  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2414  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2415  store i32 %old, i32 addrspace(1)* %out
2416  ret void
2417}
2418
; Varying-operand `atomicrmw sub` on the local (addrspace(3)) variable
; @local_var32 where the returned value has no users: every lane subtracts
; its own workitem.id.x.
; Per the checks below, GFX7LESS issues ds_sub_u32 directly with v0 and no
; cross-lane reduction, while GFX8 and newer first combine the lane values
; (DPP adds, plus v_permlanex16 / v_readlane / v_permlane64 on the GFX10 and
; GFX11 runs) and then issue one non-returning ds_sub_u32 inside the branch
; guarded by the "v_mbcnt result == 0" compare.
; The check lines are autogenerated (see the first line of this file), so
; regenerate them with that script instead of editing them by hand.
2419define amdgpu_kernel void @sub_i32_varying_nouse() {
2420; GFX7LESS-LABEL: sub_i32_varying_nouse:
2421; GFX7LESS:       ; %bb.0: ; %entry
2422; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2423; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2424; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2425; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
2426; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2427; GFX7LESS-NEXT:    s_endpgm
2428;
2429; GFX8-LABEL: sub_i32_varying_nouse:
2430; GFX8:       ; %bb.0: ; %entry
2431; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2432; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2433; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2434; GFX8-NEXT:    s_not_b64 exec, exec
2435; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2436; GFX8-NEXT:    s_not_b64 exec, exec
2437; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
2438; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2439; GFX8-NEXT:    s_nop 1
2440; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2441; GFX8-NEXT:    s_nop 1
2442; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2443; GFX8-NEXT:    s_nop 1
2444; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2445; GFX8-NEXT:    s_nop 1
2446; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2447; GFX8-NEXT:    s_nop 1
2448; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2449; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
2450; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
2451; GFX8-NEXT:    s_mov_b32 s0, s2
2452; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2453; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2454; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2455; GFX8-NEXT:  ; %bb.1:
2456; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2457; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2458; GFX8-NEXT:    s_mov_b32 m0, -1
2459; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2460; GFX8-NEXT:    ds_sub_u32 v0, v2
2461; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2462; GFX8-NEXT:  .LBB10_2:
2463; GFX8-NEXT:    s_endpgm
2464;
2465; GFX9-LABEL: sub_i32_varying_nouse:
2466; GFX9:       ; %bb.0: ; %entry
2467; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2468; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2469; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2470; GFX9-NEXT:    s_not_b64 exec, exec
2471; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2472; GFX9-NEXT:    s_not_b64 exec, exec
2473; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
2474; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2475; GFX9-NEXT:    s_nop 1
2476; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2477; GFX9-NEXT:    s_nop 1
2478; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2479; GFX9-NEXT:    s_nop 1
2480; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2481; GFX9-NEXT:    s_nop 1
2482; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2483; GFX9-NEXT:    s_nop 1
2484; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2485; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
2486; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
2487; GFX9-NEXT:    s_mov_b32 s0, s2
2488; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2489; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2490; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2491; GFX9-NEXT:  ; %bb.1:
2492; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2493; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2494; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2495; GFX9-NEXT:    ds_sub_u32 v0, v2
2496; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX9-NEXT:  .LBB10_2:
2498; GFX9-NEXT:    s_endpgm
2499;
2500; GFX1064-LABEL: sub_i32_varying_nouse:
2501; GFX1064:       ; %bb.0: ; %entry
2502; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2503; GFX1064-NEXT:    s_not_b64 exec, exec
2504; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2505; GFX1064-NEXT:    s_not_b64 exec, exec
2506; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2507; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2508; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2509; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2510; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2511; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2512; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2513; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2514; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2515; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2516; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2517; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
2518; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
2519; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2520; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2521; GFX1064-NEXT:    s_add_i32 s0, s2, s3
2522; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2523; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2524; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2525; GFX1064-NEXT:  ; %bb.1:
2526; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2527; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
2528; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2529; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2530; GFX1064-NEXT:    ds_sub_u32 v0, v3
2531; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX1064-NEXT:    buffer_gl0_inv
2533; GFX1064-NEXT:  .LBB10_2:
2534; GFX1064-NEXT:    s_endpgm
2535;
2536; GFX1032-LABEL: sub_i32_varying_nouse:
2537; GFX1032:       ; %bb.0: ; %entry
2538; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2539; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2540; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2541; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2542; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
2543; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2544; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2545; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2546; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2547; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2548; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2549; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2550; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
2551; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2552; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2553; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2554; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2555; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2556; GFX1032-NEXT:  ; %bb.1:
2557; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2558; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2559; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2560; GFX1032-NEXT:    ds_sub_u32 v3, v0
2561; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX1032-NEXT:    buffer_gl0_inv
2563; GFX1032-NEXT:  .LBB10_2:
2564; GFX1032-NEXT:    s_endpgm
2565;
2566; GFX1164-LABEL: sub_i32_varying_nouse:
2567; GFX1164:       ; %bb.0: ; %entry
2568; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2569; GFX1164-NEXT:    s_not_b64 exec, exec
2570; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2571; GFX1164-NEXT:    s_not_b64 exec, exec
2572; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2573; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2574; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2575; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2576; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2577; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2578; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2579; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2580; GFX1164-NEXT:    v_permlane64_b32 v2, v1
2581; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2582; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2583; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2584; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2585; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2586; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
2587; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
2588; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2589; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
2590; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2591; GFX1164-NEXT:  ; %bb.1:
2592; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2593; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2594; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2595; GFX1164-NEXT:    ds_sub_u32 v3, v0
2596; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2597; GFX1164-NEXT:    buffer_gl0_inv
2598; GFX1164-NEXT:  .LBB10_2:
2599; GFX1164-NEXT:    s_endpgm
2600;
2601; GFX1132-LABEL: sub_i32_varying_nouse:
2602; GFX1132:       ; %bb.0: ; %entry
2603; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2604; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2605; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2606; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2607; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
2608; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2609; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2610; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2611; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2612; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2613; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2614; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2615; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
2616; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2617; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
2618; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
2619; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
2620; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
2621; GFX1132-NEXT:  ; %bb.1:
2622; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2623; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2624; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2625; GFX1132-NEXT:    ds_sub_u32 v3, v0
2626; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2627; GFX1132-NEXT:    buffer_gl0_inv
2628; GFX1132-NEXT:  .LBB10_2:
2629; GFX1132-NEXT:    s_endpgm
2630entry:
  ; Per-lane operand for the atomic is the workitem id in x.
2631  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old has no users in this function; the checks above show the
  ; non-returning ds_sub_u32 form being selected on every run line.
2632  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2633  ret void
2634}
2635
2636define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2637;
2638;
2639; GFX7LESS-LABEL: sub_i64_constant:
2640; GFX7LESS:       ; %bb.0: ; %entry
2641; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2642; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2643; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2644; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
2645; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2646; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2647; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2648; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
2649; GFX7LESS-NEXT:  ; %bb.1:
2650; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2651; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
2652; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2653; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
2654; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2655; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2656; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2657; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX7LESS-NEXT:  .LBB11_2:
2659; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2660; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2661; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2662; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
2663; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2664; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2665; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2666; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2667; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2668; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2669; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2670; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2671; GFX7LESS-NEXT:    s_endpgm
2672;
2673; GFX8-LABEL: sub_i64_constant:
2674; GFX8:       ; %bb.0: ; %entry
2675; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2676; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2677; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2678; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2679; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2680; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2681; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2682; GFX8-NEXT:    s_cbranch_execz .LBB11_2
2683; GFX8-NEXT:  ; %bb.1:
2684; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2685; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2686; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2687; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2688; GFX8-NEXT:    s_mov_b32 m0, -1
2689; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2691; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2692; GFX8-NEXT:  .LBB11_2:
2693; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2694; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2695; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2696; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2697; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2698; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2699; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2700; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2701; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2702; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2703; GFX8-NEXT:    s_mov_b32 s2, -1
2704; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2705; GFX8-NEXT:    s_endpgm
2706;
2707; GFX9-LABEL: sub_i64_constant:
2708; GFX9:       ; %bb.0: ; %entry
2709; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2710; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2711; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2712; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2713; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2714; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2715; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2716; GFX9-NEXT:    s_cbranch_execz .LBB11_2
2717; GFX9-NEXT:  ; %bb.1:
2718; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2719; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2720; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2721; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2722; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2723; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2724; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2725; GFX9-NEXT:  .LBB11_2:
2726; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2727; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2728; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2729; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2730; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2731; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2732; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2733; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2734; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2735; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2736; GFX9-NEXT:    s_mov_b32 s2, -1
2737; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2738; GFX9-NEXT:    s_endpgm
2739;
2740; GFX1064-LABEL: sub_i64_constant:
2741; GFX1064:       ; %bb.0: ; %entry
2742; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2743; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2744; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2745; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2746; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2747; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2748; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2749; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
2750; GFX1064-NEXT:  ; %bb.1:
2751; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2752; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2753; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2754; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2755; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2756; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2757; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2758; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2759; GFX1064-NEXT:    buffer_gl0_inv
2760; GFX1064-NEXT:  .LBB11_2:
2761; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2762; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2763; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2764; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2765; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2766; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2767; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2768; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2769; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2770; GFX1064-NEXT:    s_mov_b32 s2, -1
2771; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2772; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2773; GFX1064-NEXT:    s_endpgm
2774;
2775; GFX1032-LABEL: sub_i64_constant:
2776; GFX1032:       ; %bb.0: ; %entry
2777; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2778; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2779; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2780; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2781; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2782; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2783; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2784; GFX1032-NEXT:  ; %bb.1:
2785; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2786; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2787; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2788; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2789; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2790; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2791; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2792; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2793; GFX1032-NEXT:    buffer_gl0_inv
2794; GFX1032-NEXT:  .LBB11_2:
2795; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2796; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2797; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2798; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2799; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2800; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2801; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2802; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2803; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2804; GFX1032-NEXT:    s_mov_b32 s2, -1
2805; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2806; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2807; GFX1032-NEXT:    s_endpgm
2808;
2809; GFX1164-LABEL: sub_i64_constant:
2810; GFX1164:       ; %bb.0: ; %entry
2811; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2812; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2813; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2814; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2815; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2816; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2817; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2818; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
2819; GFX1164-NEXT:  ; %bb.1:
2820; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2821; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2822; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
2823; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2824; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2825; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2826; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2827; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2828; GFX1164-NEXT:    buffer_gl0_inv
2829; GFX1164-NEXT:  .LBB11_2:
2830; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
2831; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2832; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2833; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2834; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2835; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2836; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2837; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2838; GFX1164-NEXT:    s_mov_b32 s2, -1
2839; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2840; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2841; GFX1164-NEXT:    s_endpgm
2842;
2843; GFX1132-LABEL: sub_i64_constant:
2844; GFX1132:       ; %bb.0: ; %entry
2845; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2846; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
2847; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2848; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2849; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2850; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2851; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
2852; GFX1132-NEXT:  ; %bb.1:
2853; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
2854; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2855; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
2856; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
2857; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2858; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2859; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2860; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2861; GFX1132-NEXT:    buffer_gl0_inv
2862; GFX1132-NEXT:  .LBB11_2:
2863; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2864; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2865; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2866; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2867; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2868; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2869; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2870; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2871; GFX1132-NEXT:    s_mov_b32 s2, -1
2872; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2874; GFX1132-NEXT:    s_endpgm
2875entry:
2876  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2877  store i64 %old, i64 addrspace(1)* %out
2878  ret void
2879}
2880
; Test case: 64-bit atomic subtract on the addrspace(3) global @local_var64 whose
; operand is the kernel argument %subitive. The kernel performs the atomicrmw
; with acq_rel ordering and stores the value returned by the atomic to %out.
;
; In the expected output below, the lane whose mbcnt result is 0 multiplies the
; operand by the s_bcnt1 count of the saved exec mask and issues a single
; ds_sub_rtn_u64. After exec is restored, the returned value is read with
; v_readfirstlane and each lane subtracts (operand * its mbcnt index) from it
; before the buffer store.
;
; The check lines are autogenerated (see the note on the first line of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
2881define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2882;
2883;
2884; GFX7LESS-LABEL: sub_i64_uniform:
2885; GFX7LESS:       ; %bb.0: ; %entry
2886; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2887; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2888; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2889; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2890; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2891; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2892; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2893; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
2894; GFX7LESS-NEXT:  ; %bb.1:
2895; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2896; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2897; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2898; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2899; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2900; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
2901; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2902; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
2903; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2904; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2905; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2906; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2907; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2908; GFX7LESS-NEXT:  .LBB12_2:
2909; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2910; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2911; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2912; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2913; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2914; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2915; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
2916; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
2917; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
2918; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
2919; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
2920; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
2921; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
2922; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
2923; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2924; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2925; GFX7LESS-NEXT:    s_endpgm
2926;
2927; GFX8-LABEL: sub_i64_uniform:
2928; GFX8:       ; %bb.0: ; %entry
2929; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2930; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2931; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2932; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2933; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2934; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2935; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2936; GFX8-NEXT:    s_cbranch_execz .LBB12_2
2937; GFX8-NEXT:  ; %bb.1:
2938; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
2939; GFX8-NEXT:    v_mov_b32_e32 v0, s8
2940; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2941; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
2942; GFX8-NEXT:    s_mul_i32 s6, s3, s8
2943; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2944; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
2945; GFX8-NEXT:    s_mov_b32 m0, -1
2946; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2947; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2949; GFX8-NEXT:  .LBB12_2:
2950; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2951; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2952; GFX8-NEXT:    s_mov_b32 s4, s0
2953; GFX8-NEXT:    s_mov_b32 s5, s1
2954; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
2955; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2956; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2957; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2958; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
2959; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2960; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
2961; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2962; GFX8-NEXT:    s_mov_b32 s6, -1
2963; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2964; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2965; GFX8-NEXT:    s_endpgm
2966;
2967; GFX9-LABEL: sub_i64_uniform:
2968; GFX9:       ; %bb.0: ; %entry
2969; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2970; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2971; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2972; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2973; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2974; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2975; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2976; GFX9-NEXT:    s_cbranch_execz .LBB12_2
2977; GFX9-NEXT:  ; %bb.1:
2978; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2981; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2982; GFX9-NEXT:    s_add_i32 s8, s8, s7
2983; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2984; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2985; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2986; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2987; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2988; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2990; GFX9-NEXT:  .LBB12_2:
2991; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2993; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
2994; GFX9-NEXT:    s_mov_b32 s4, s0
2995; GFX9-NEXT:    s_mov_b32 s5, s1
2996; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2997; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2998; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2999; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3000; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3001; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
3002; GFX9-NEXT:    s_mov_b32 s7, 0xf000
3003; GFX9-NEXT:    s_mov_b32 s6, -1
3004; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
3005; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3006; GFX9-NEXT:    s_endpgm
3007;
3008; GFX1064-LABEL: sub_i64_uniform:
3009; GFX1064:       ; %bb.0: ; %entry
3010; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3011; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3012; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3013; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3014; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3015; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3016; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3017; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
3018; GFX1064-NEXT:  ; %bb.1:
3019; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3020; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3021; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3022; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
3023; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
3024; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
3025; GFX1064-NEXT:    s_add_i32 s8, s8, s7
3026; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
3027; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
3028; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3029; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3030; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3031; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3032; GFX1064-NEXT:    buffer_gl0_inv
3033; GFX1064-NEXT:  .LBB12_2:
3034; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3035; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3036; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3037; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3038; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3039; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
3040; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5]
3041; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3042; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3043; GFX1064-NEXT:    s_mov_b32 s2, -1
3044; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
3045; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3046; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3047; GFX1064-NEXT:    s_endpgm
3048;
3049; GFX1032-LABEL: sub_i64_uniform:
3050; GFX1032:       ; %bb.0: ; %entry
3051; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3052; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
3053; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3054; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3055; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
3056; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3057; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
3058; GFX1032-NEXT:  ; %bb.1:
3059; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
3060; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3061; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3062; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
3063; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
3064; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
3065; GFX1032-NEXT:    s_add_i32 s7, s7, s6
3066; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
3067; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
3068; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3069; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3070; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3071; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3072; GFX1032-NEXT:    buffer_gl0_inv
3073; GFX1032-NEXT:  .LBB12_2:
3074; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3075; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3076; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3077; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3078; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3079; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
3080; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5]
3081; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3082; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3083; GFX1032-NEXT:    s_mov_b32 s2, -1
3084; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
3085; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3086; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3087; GFX1032-NEXT:    s_endpgm
3088;
3089; GFX1164-LABEL: sub_i64_uniform:
3090; GFX1164:       ; %bb.0: ; %entry
3091; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3092; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3093; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3094; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3095; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3096; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
3097; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
3098; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
3099; GFX1164-NEXT:  ; %bb.1:
3100; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3101; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3102; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3103; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
3104; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
3105; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
3106; GFX1164-NEXT:    s_add_i32 s8, s8, s7
3107; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
3108; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
3109; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3110; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3111; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3112; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3113; GFX1164-NEXT:    buffer_gl0_inv
3114; GFX1164-NEXT:  .LBB12_2:
3115; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3116; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3118; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
3119; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
3120; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3121; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3122; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3123; GFX1164-NEXT:    s_mov_b32 s2, -1
3124; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3125; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3126; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3127; GFX1164-NEXT:    s_endpgm
3128;
3129; GFX1132-LABEL: sub_i64_uniform:
3130; GFX1132:       ; %bb.0: ; %entry
3131; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3132; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
3133; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3134; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3135; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3136; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3137; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
3138; GFX1132-NEXT:  ; %bb.1:
3139; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
3140; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3141; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
3143; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
3144; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
3145; GFX1132-NEXT:    s_add_i32 s7, s7, s6
3146; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
3147; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
3148; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3149; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3150; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3151; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3152; GFX1132-NEXT:    buffer_gl0_inv
3153; GFX1132-NEXT:  .LBB12_2:
3154; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3155; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3156; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3157; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
3158; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
3159; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3160; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3161; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3162; GFX1132-NEXT:    s_mov_b32 s2, -1
3163; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3164; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3165; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3166; GFX1132-NEXT:    s_endpgm
3167entry:
  ; The operand is the i64 kernel argument; %old is the value the atomicrmw
  ; returns, and it is the only thing written to %out.
3168  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
3169  store i64 %old, i64 addrspace(1)* %out
3170  ret void
3171}
3172
; Test case: 64-bit atomic subtract on the addrspace(3) global @local_var64 whose
; operand is derived from the work-item id (llvm.amdgcn.workitem.id.x,
; zero-extended to i64). The value returned by the atomic is stored to %out.
;
; For every run configuration the expected output below is a single
; ds_sub_rtn_u64 issued directly on the per-lane operand in v[0:1]; there is no
; mbcnt / s_and_saveexec / v_readfirstlane sequence around it.
; NOTE(review): presumably the atomic optimizer leaves 64-bit per-lane operands
; untouched - confirm against the pass before relying on this.
;
; In each variant v1 is written with 0 and then used both as the DS address
; operand and as the upper register of the v[0:1] data pair.
;
; The check lines are autogenerated (see the note on the first line of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
3173define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
3174;
3175;
3176; GFX7LESS-LABEL: sub_i64_varying:
3177; GFX7LESS:       ; %bb.0: ; %entry
3178; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3179; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3180; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3181; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3182; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3183; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3184; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3185; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3186; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3187; GFX7LESS-NEXT:    s_endpgm
3188;
3189; GFX8-LABEL: sub_i64_varying:
3190; GFX8:       ; %bb.0: ; %entry
3191; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3192; GFX8-NEXT:    s_mov_b32 m0, -1
3193; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3194; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3195; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3196; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3197; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3198; GFX8-NEXT:    s_mov_b32 s2, -1
3199; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3200; GFX8-NEXT:    s_endpgm
3201;
3202; GFX9-LABEL: sub_i64_varying:
3203; GFX9:       ; %bb.0: ; %entry
3204; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3205; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3206; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3207; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3208; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3209; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3210; GFX9-NEXT:    s_mov_b32 s2, -1
3211; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3212; GFX9-NEXT:    s_endpgm
3213;
3214; GFX10-LABEL: sub_i64_varying:
3215; GFX10:       ; %bb.0: ; %entry
3216; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3217; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3218; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
3219; GFX10-NEXT:    s_mov_b32 s2, -1
3220; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3221; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3222; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3223; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3224; GFX10-NEXT:    buffer_gl0_inv
3225; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3226; GFX10-NEXT:    s_endpgm
3227;
3228; GFX11-LABEL: sub_i64_varying:
3229; GFX11:       ; %bb.0: ; %entry
3230; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3231; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3232; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
3233; GFX11-NEXT:    s_mov_b32 s2, -1
3234; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3235; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3236; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3237; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3238; GFX11-NEXT:    buffer_gl0_inv
3239; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3240; GFX11-NEXT:    s_endpgm
3241entry:
  ; Build the i64 operand from the work-item id, subtract it atomically from
  ; @local_var64, and write the value the atomicrmw returns to %out.
3242  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3243  %zext = zext i32 %lane to i64
3244  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
3245  store i64 %old, i64 addrspace(1)* %out
3246  ret void
3247}
3248
3249define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3250;
3251;
3252; GFX7LESS-LABEL: and_i32_varying:
3253; GFX7LESS:       ; %bb.0: ; %entry
3254; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3255; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3256; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3257; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3258; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3259; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3260; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3261; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3262; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3263; GFX7LESS-NEXT:    s_endpgm
3264;
3265; GFX8-LABEL: and_i32_varying:
3266; GFX8:       ; %bb.0: ; %entry
3267; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3268; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3269; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3270; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3271; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3272; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3273; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3274; GFX8-NEXT:    s_not_b64 exec, exec
3275; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3276; GFX8-NEXT:    s_not_b64 exec, exec
3277; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3278; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3279; GFX8-NEXT:    s_nop 1
3280; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3281; GFX8-NEXT:    s_nop 1
3282; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3283; GFX8-NEXT:    s_nop 1
3284; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3285; GFX8-NEXT:    s_nop 1
3286; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3287; GFX8-NEXT:    s_nop 1
3288; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3289; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3290; GFX8-NEXT:    s_nop 0
3291; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3292; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3293; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3294; GFX8-NEXT:    ; implicit-def: $vgpr0
3295; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3296; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3297; GFX8-NEXT:  ; %bb.1:
3298; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3299; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3300; GFX8-NEXT:    s_mov_b32 m0, -1
3301; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3302; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3303; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3304; GFX8-NEXT:  .LBB14_2:
3305; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3306; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3307; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3308; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3309; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3310; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3311; GFX8-NEXT:    s_mov_b32 s2, -1
3312; GFX8-NEXT:    s_nop 0
3313; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3314; GFX8-NEXT:    s_endpgm
3315;
3316; GFX9-LABEL: and_i32_varying:
3317; GFX9:       ; %bb.0: ; %entry
3318; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3319; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3320; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3321; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3322; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3323; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3324; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3325; GFX9-NEXT:    s_not_b64 exec, exec
3326; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3327; GFX9-NEXT:    s_not_b64 exec, exec
3328; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3329; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3330; GFX9-NEXT:    s_nop 1
3331; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3332; GFX9-NEXT:    s_nop 1
3333; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3334; GFX9-NEXT:    s_nop 1
3335; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3336; GFX9-NEXT:    s_nop 1
3337; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3338; GFX9-NEXT:    s_nop 1
3339; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3340; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3341; GFX9-NEXT:    s_nop 0
3342; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3343; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3344; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3345; GFX9-NEXT:    ; implicit-def: $vgpr0
3346; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3347; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3348; GFX9-NEXT:  ; %bb.1:
3349; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3350; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3351; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3352; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3353; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3354; GFX9-NEXT:  .LBB14_2:
3355; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3356; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3357; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3358; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3359; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3360; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3361; GFX9-NEXT:    s_mov_b32 s2, -1
3362; GFX9-NEXT:    s_nop 0
3363; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3364; GFX9-NEXT:    s_endpgm
3365;
3366; GFX1064-LABEL: and_i32_varying:
3367; GFX1064:       ; %bb.0: ; %entry
3368; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3369; GFX1064-NEXT:    s_not_b64 exec, exec
3370; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3371; GFX1064-NEXT:    s_not_b64 exec, exec
3372; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3373; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3374; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3375; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3376; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3377; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3378; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3379; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3380; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3381; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3382; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3383; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3384; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3385; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3386; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3387; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3388; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3389; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3390; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3391; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3392; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3393; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3394; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3395; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3396; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3397; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3398; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3399; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3400; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3401; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3402; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3403; GFX1064-NEXT:    s_mov_b32 s2, -1
3404; GFX1064-NEXT:    ; implicit-def: $vgpr0
3405; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3406; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3407; GFX1064-NEXT:  ; %bb.1:
3408; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3409; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3410; GFX1064-NEXT:    s_mov_b32 s3, s7
3411; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3412; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3413; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3414; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3415; GFX1064-NEXT:    buffer_gl0_inv
3416; GFX1064-NEXT:  .LBB14_2:
3417; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3418; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3419; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3420; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3421; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3422; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3423; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3424; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3425; GFX1064-NEXT:    s_endpgm
3426;
3427; GFX1032-LABEL: and_i32_varying:
3428; GFX1032:       ; %bb.0: ; %entry
3429; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3430; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3431; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3432; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3433; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3434; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3435; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3436; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3437; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3438; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3439; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3440; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3441; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3442; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3443; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3444; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3445; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3446; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3447; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3448; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3449; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3450; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3451; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3452; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3453; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3454; GFX1032-NEXT:    s_mov_b32 s2, -1
3455; GFX1032-NEXT:    ; implicit-def: $vgpr0
3456; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3457; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3458; GFX1032-NEXT:  ; %bb.1:
3459; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3460; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3461; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3462; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3463; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3464; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3465; GFX1032-NEXT:    buffer_gl0_inv
3466; GFX1032-NEXT:  .LBB14_2:
3467; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3468; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3469; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3470; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3471; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3472; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3473; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3474; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3475; GFX1032-NEXT:    s_endpgm
3476;
3477; GFX1164-LABEL: and_i32_varying:
3478; GFX1164:       ; %bb.0: ; %entry
3479; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3480; GFX1164-NEXT:    s_not_b64 exec, exec
3481; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3482; GFX1164-NEXT:    s_not_b64 exec, exec
3483; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3484; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3485; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3486; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3487; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3488; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3489; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3490; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3491; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3492; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3493; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3494; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3495; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3496; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3497; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3498; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3499; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3500; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3501; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3502; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3503; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3504; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3505; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3506; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3507; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3508; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3509; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3510; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3511; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3512; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3513; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3514; GFX1164-NEXT:    s_mov_b32 s2, -1
3515; GFX1164-NEXT:    ; implicit-def: $vgpr0
3516; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3517; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3518; GFX1164-NEXT:  ; %bb.1:
3519; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3520; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3521; GFX1164-NEXT:    s_mov_b32 s3, s7
3522; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3523; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3524; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3525; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3526; GFX1164-NEXT:    buffer_gl0_inv
3527; GFX1164-NEXT:  .LBB14_2:
3528; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3529; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3530; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3531; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3532; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3533; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3534; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3535; GFX1164-NEXT:    s_endpgm
3536;
3537; GFX1132-LABEL: and_i32_varying:
3538; GFX1132:       ; %bb.0: ; %entry
3539; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3540; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3541; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3542; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3543; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3544; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3545; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3546; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3547; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3548; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3549; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3550; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3551; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3552; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3553; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3554; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3555; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3556; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3557; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3558; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3559; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3560; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3561; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3562; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3563; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3564; GFX1132-NEXT:    s_mov_b32 s2, -1
3565; GFX1132-NEXT:    ; implicit-def: $vgpr0
3566; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3567; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3568; GFX1132-NEXT:  ; %bb.1:
3569; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3570; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3571; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3572; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3573; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3574; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3575; GFX1132-NEXT:    buffer_gl0_inv
3576; GFX1132-NEXT:  .LBB14_2:
3577; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3578; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3579; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3580; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3581; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3582; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3583; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3584; GFX1132-NEXT:    s_endpgm
3585entry:
3586  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3587  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3588  store i32 %old, i32 addrspace(1)* %out
3589  ret void
3590}
3591
; or_i32_varying - `atomicrmw or` on the LDS global @local_var32 where the
; operand is the result of llvm.amdgcn.workitem.id.x, followed by a store of
; the returned old value to %out.
;
; What the expected output below shows, per target family
;   * GFX7LESS - no wave-level rewrite; a single ds_or_rtn_b32 is issued
;     directly with the incoming v0 as the data operand.
;   * GFX8 and newer - inactive lanes are seeded with 0 (written while exec is
;     inverted), the per-lane values are combined with a chain of
;     v_or_b32_dpp steps, only the lane whose mbcnt result equals 0 executes
;     one ds_or_rtn_b32 with the combined value, and afterwards every lane
;     ORs the v_readfirstlane'd result with its own shifted scan value
;     before the buffer store.
3592define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3593;
3594;
3595; GFX7LESS-LABEL: or_i32_varying:
3596; GFX7LESS:       ; %bb.0: ; %entry
3597; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3598; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3599; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3600; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3601; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3602; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3603; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3604; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3605; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3606; GFX7LESS-NEXT:    s_endpgm
3607;
3608; GFX8-LABEL: or_i32_varying:
3609; GFX8:       ; %bb.0: ; %entry
3610; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3611; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3612; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3613; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3614; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3615; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3616; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3617; GFX8-NEXT:    s_not_b64 exec, exec
3618; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3619; GFX8-NEXT:    s_not_b64 exec, exec
3620; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3621; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3622; GFX8-NEXT:    s_nop 1
3623; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3624; GFX8-NEXT:    s_nop 1
3625; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3626; GFX8-NEXT:    s_nop 1
3627; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3628; GFX8-NEXT:    s_nop 1
3629; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3630; GFX8-NEXT:    s_nop 1
3631; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3632; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3633; GFX8-NEXT:    s_nop 0
3634; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3635; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3636; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3637; GFX8-NEXT:    ; implicit-def: $vgpr0
3638; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3639; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3640; GFX8-NEXT:  ; %bb.1:
3641; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3642; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3643; GFX8-NEXT:    s_mov_b32 m0, -1
3644; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3645; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3646; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3647; GFX8-NEXT:  .LBB15_2:
3648; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3649; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3650; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3651; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3652; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3653; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3654; GFX8-NEXT:    s_mov_b32 s2, -1
3655; GFX8-NEXT:    s_nop 0
3656; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3657; GFX8-NEXT:    s_endpgm
3658;
3659; GFX9-LABEL: or_i32_varying:
3660; GFX9:       ; %bb.0: ; %entry
3661; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3662; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3663; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3664; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3665; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3666; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3667; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3668; GFX9-NEXT:    s_not_b64 exec, exec
3669; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3670; GFX9-NEXT:    s_not_b64 exec, exec
3671; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3672; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3673; GFX9-NEXT:    s_nop 1
3674; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3675; GFX9-NEXT:    s_nop 1
3676; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3677; GFX9-NEXT:    s_nop 1
3678; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3679; GFX9-NEXT:    s_nop 1
3680; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3681; GFX9-NEXT:    s_nop 1
3682; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3683; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3684; GFX9-NEXT:    s_nop 0
3685; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3686; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3687; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3688; GFX9-NEXT:    ; implicit-def: $vgpr0
3689; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3690; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3691; GFX9-NEXT:  ; %bb.1:
3692; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3693; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3695; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3697; GFX9-NEXT:  .LBB15_2:
3698; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3699; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3700; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3701; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3702; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3703; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3704; GFX9-NEXT:    s_mov_b32 s2, -1
3705; GFX9-NEXT:    s_nop 0
3706; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3707; GFX9-NEXT:    s_endpgm
3708;
3709; GFX1064-LABEL: or_i32_varying:
3710; GFX1064:       ; %bb.0: ; %entry
3711; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3712; GFX1064-NEXT:    s_not_b64 exec, exec
3713; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3714; GFX1064-NEXT:    s_not_b64 exec, exec
3715; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3716; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3717; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3718; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3719; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3720; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3721; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3722; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3723; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3724; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3725; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3726; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3727; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3728; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3729; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3730; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3731; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3732; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3733; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3734; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3735; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3736; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3737; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3738; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3739; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3740; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3741; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3742; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3743; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3744; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3745; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3746; GFX1064-NEXT:    s_mov_b32 s2, -1
3747; GFX1064-NEXT:    ; implicit-def: $vgpr0
3748; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3749; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3750; GFX1064-NEXT:  ; %bb.1:
3751; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3752; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3753; GFX1064-NEXT:    s_mov_b32 s3, s7
3754; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3755; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3756; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3757; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3758; GFX1064-NEXT:    buffer_gl0_inv
3759; GFX1064-NEXT:  .LBB15_2:
3760; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3761; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3762; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3763; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3764; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3765; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3766; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3767; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3768; GFX1064-NEXT:    s_endpgm
3769;
3770; GFX1032-LABEL: or_i32_varying:
3771; GFX1032:       ; %bb.0: ; %entry
3772; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3773; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3774; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3775; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3776; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3777; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3778; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3779; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3780; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3781; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3782; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3783; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3784; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3785; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3786; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3787; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3788; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3789; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3790; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3791; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3792; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3793; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3794; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3795; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3796; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3797; GFX1032-NEXT:    s_mov_b32 s2, -1
3798; GFX1032-NEXT:    ; implicit-def: $vgpr0
3799; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3800; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3801; GFX1032-NEXT:  ; %bb.1:
3802; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3803; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3804; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3805; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3806; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3807; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3808; GFX1032-NEXT:    buffer_gl0_inv
3809; GFX1032-NEXT:  .LBB15_2:
3810; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3811; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3812; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3813; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3814; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3815; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3816; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3817; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3818; GFX1032-NEXT:    s_endpgm
3819;
3820; GFX1164-LABEL: or_i32_varying:
3821; GFX1164:       ; %bb.0: ; %entry
3822; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3823; GFX1164-NEXT:    s_not_b64 exec, exec
3824; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3825; GFX1164-NEXT:    s_not_b64 exec, exec
3826; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3827; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3828; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3829; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3830; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3831; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3832; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3833; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3834; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3835; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3836; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3837; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3838; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3839; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3840; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3841; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3842; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3843; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3844; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3845; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3846; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3847; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3848; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3849; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3850; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3851; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3852; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3853; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3854; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3855; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3856; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3857; GFX1164-NEXT:    s_mov_b32 s2, -1
3858; GFX1164-NEXT:    ; implicit-def: $vgpr0
3859; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3860; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
3861; GFX1164-NEXT:  ; %bb.1:
3862; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3863; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3864; GFX1164-NEXT:    s_mov_b32 s3, s7
3865; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3866; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3867; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
3868; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3869; GFX1164-NEXT:    buffer_gl0_inv
3870; GFX1164-NEXT:  .LBB15_2:
3871; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3872; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3873; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3874; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
3875; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3876; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3877; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3878; GFX1164-NEXT:    s_endpgm
3879;
3880; GFX1132-LABEL: or_i32_varying:
3881; GFX1132:       ; %bb.0: ; %entry
3882; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3883; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3884; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
3885; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3886; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3887; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3888; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3889; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3890; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3891; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3892; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3893; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3894; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3895; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3896; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3897; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3898; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3899; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3900; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3901; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3902; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3903; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3904; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3905; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3906; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3907; GFX1132-NEXT:    s_mov_b32 s2, -1
3908; GFX1132-NEXT:    ; implicit-def: $vgpr0
3909; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3910; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
3911; GFX1132-NEXT:  ; %bb.1:
3912; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3913; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3914; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3915; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3916; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
3917; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3918; GFX1132-NEXT:    buffer_gl0_inv
3919; GFX1132-NEXT:  .LBB15_2:
3920; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3921; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3922; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3923; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
3924; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3925; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3926; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3927; GFX1132-NEXT:    s_endpgm
3928entry:
  ; The atomic operand is the workitem id rather than a constant, which is
  ; what makes this the "varying" variant of the test.
3929  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3930  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the pre-operation value returned by the atomic so the result is used.
3931  store i32 %old, i32 addrspace(1)* %out
3932  ret void
3933}
3934
3935define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3936;
3937;
3938; GFX7LESS-LABEL: xor_i32_varying:
3939; GFX7LESS:       ; %bb.0: ; %entry
3940; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3941; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3942; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3943; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3944; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3945; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3946; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3947; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3948; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3949; GFX7LESS-NEXT:    s_endpgm
3950;
3951; GFX8-LABEL: xor_i32_varying:
3952; GFX8:       ; %bb.0: ; %entry
3953; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3954; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3955; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3956; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3957; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3958; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3959; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3960; GFX8-NEXT:    s_not_b64 exec, exec
3961; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3962; GFX8-NEXT:    s_not_b64 exec, exec
3963; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3964; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3965; GFX8-NEXT:    s_nop 1
3966; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3967; GFX8-NEXT:    s_nop 1
3968; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3969; GFX8-NEXT:    s_nop 1
3970; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3971; GFX8-NEXT:    s_nop 1
3972; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3973; GFX8-NEXT:    s_nop 1
3974; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3975; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3976; GFX8-NEXT:    s_nop 0
3977; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3978; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3979; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3980; GFX8-NEXT:    ; implicit-def: $vgpr0
3981; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3982; GFX8-NEXT:    s_cbranch_execz .LBB16_2
3983; GFX8-NEXT:  ; %bb.1:
3984; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3985; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3986; GFX8-NEXT:    s_mov_b32 m0, -1
3987; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3988; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3990; GFX8-NEXT:  .LBB16_2:
3991; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3992; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3993; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3994; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3995; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3996; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3997; GFX8-NEXT:    s_mov_b32 s2, -1
3998; GFX8-NEXT:    s_nop 0
3999; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4000; GFX8-NEXT:    s_endpgm
4001;
4002; GFX9-LABEL: xor_i32_varying:
4003; GFX9:       ; %bb.0: ; %entry
4004; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4005; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4006; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4007; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4008; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4009; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4010; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4011; GFX9-NEXT:    s_not_b64 exec, exec
4012; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4013; GFX9-NEXT:    s_not_b64 exec, exec
4014; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4015; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4016; GFX9-NEXT:    s_nop 1
4017; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4018; GFX9-NEXT:    s_nop 1
4019; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4020; GFX9-NEXT:    s_nop 1
4021; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4022; GFX9-NEXT:    s_nop 1
4023; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4024; GFX9-NEXT:    s_nop 1
4025; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4026; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4027; GFX9-NEXT:    s_nop 0
4028; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4029; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4030; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4031; GFX9-NEXT:    ; implicit-def: $vgpr0
4032; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4033; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4034; GFX9-NEXT:  ; %bb.1:
4035; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4036; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4038; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4039; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4040; GFX9-NEXT:  .LBB16_2:
4041; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4042; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4043; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4044; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4045; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4046; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4047; GFX9-NEXT:    s_mov_b32 s2, -1
4048; GFX9-NEXT:    s_nop 0
4049; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4050; GFX9-NEXT:    s_endpgm
4051;
4052; GFX1064-LABEL: xor_i32_varying:
4053; GFX1064:       ; %bb.0: ; %entry
4054; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4055; GFX1064-NEXT:    s_not_b64 exec, exec
4056; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4057; GFX1064-NEXT:    s_not_b64 exec, exec
4058; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4059; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4060; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4061; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4062; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4063; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4064; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4065; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4066; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4067; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4068; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4069; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4070; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4071; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4072; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4073; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4074; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4075; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4076; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4077; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4078; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4079; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4080; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4081; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4082; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4083; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4084; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4085; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4086; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4087; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4088; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4089; GFX1064-NEXT:    s_mov_b32 s2, -1
4090; GFX1064-NEXT:    ; implicit-def: $vgpr0
4091; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4092; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4093; GFX1064-NEXT:  ; %bb.1:
4094; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4095; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4096; GFX1064-NEXT:    s_mov_b32 s3, s7
4097; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4098; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4099; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4100; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4101; GFX1064-NEXT:    buffer_gl0_inv
4102; GFX1064-NEXT:  .LBB16_2:
4103; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4104; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4105; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4106; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4107; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4108; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4109; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4110; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4111; GFX1064-NEXT:    s_endpgm
4112;
4113; GFX1032-LABEL: xor_i32_varying:
4114; GFX1032:       ; %bb.0: ; %entry
4115; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4116; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4117; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4118; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4119; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4120; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4121; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4122; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4123; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4124; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4125; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4126; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4127; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4128; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4129; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4130; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4131; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4132; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4133; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4134; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4135; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4136; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4137; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4138; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4139; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4140; GFX1032-NEXT:    s_mov_b32 s2, -1
4141; GFX1032-NEXT:    ; implicit-def: $vgpr0
4142; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4143; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4144; GFX1032-NEXT:  ; %bb.1:
4145; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4146; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4147; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4148; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4149; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4150; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4151; GFX1032-NEXT:    buffer_gl0_inv
4152; GFX1032-NEXT:  .LBB16_2:
4153; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4154; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4155; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4156; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4157; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4158; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4159; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4160; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4161; GFX1032-NEXT:    s_endpgm
4162;
4163; GFX1164-LABEL: xor_i32_varying:
4164; GFX1164:       ; %bb.0: ; %entry
4165; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4166; GFX1164-NEXT:    s_not_b64 exec, exec
4167; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4168; GFX1164-NEXT:    s_not_b64 exec, exec
4169; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4170; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4171; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4172; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4173; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4174; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4175; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4176; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4177; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4178; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4179; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4180; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4181; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4182; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4183; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4184; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4185; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4186; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4187; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4188; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4189; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4190; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4191; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4192; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4193; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4194; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4195; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4196; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4197; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4198; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4199; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4200; GFX1164-NEXT:    s_mov_b32 s2, -1
4201; GFX1164-NEXT:    ; implicit-def: $vgpr0
4202; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4203; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4204; GFX1164-NEXT:  ; %bb.1:
4205; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4206; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4207; GFX1164-NEXT:    s_mov_b32 s3, s7
4208; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4209; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4210; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4211; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4212; GFX1164-NEXT:    buffer_gl0_inv
4213; GFX1164-NEXT:  .LBB16_2:
4214; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4215; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4216; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4217; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4218; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4219; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4220; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4221; GFX1164-NEXT:    s_endpgm
4222;
4223; GFX1132-LABEL: xor_i32_varying:
4224; GFX1132:       ; %bb.0: ; %entry
4225; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4226; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4227; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4228; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4229; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4230; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4231; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4232; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4233; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4234; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4235; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4236; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4237; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4238; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4239; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4240; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4241; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4242; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4243; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4244; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4245; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4246; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4247; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4248; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4249; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4250; GFX1132-NEXT:    s_mov_b32 s2, -1
4251; GFX1132-NEXT:    ; implicit-def: $vgpr0
4252; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4253; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4254; GFX1132-NEXT:  ; %bb.1:
4255; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4256; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4257; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4258; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4259; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4260; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4261; GFX1132-NEXT:    buffer_gl0_inv
4262; GFX1132-NEXT:  .LBB16_2:
4263; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4264; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4265; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4266; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4267; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4268; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4269; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4270; GFX1132-NEXT:    s_endpgm
4271entry:
4272  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4273  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4274  store i32 %old, i32 addrspace(1)* %out
4275  ret void
4276}
4277
4278define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
4279; Signed-max atomicrmw on the addrspace(3) global @local_var32 whose operand is the x workitem id; the value returned by the atomic is stored to %out.
4280; The expected output below shows a bare ds_max_rtn_i32 for the first RUN line, and a DPP scan feeding a single ds_max_rtn_i32 (guarded by an mbcnt == 0 compare) for the other targets.
4281; GFX7LESS-LABEL: max_i32_varying:
4282; GFX7LESS:       ; %bb.0: ; %entry
4283; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4284; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4285; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4286; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4287; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
4288; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4289; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4290; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4291; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4292; GFX7LESS-NEXT:    s_endpgm
4293;
4294; GFX8-LABEL: max_i32_varying:
4295; GFX8:       ; %bb.0: ; %entry
4296; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4297; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4298; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4299; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4300; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4301; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4302; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4303; GFX8-NEXT:    s_not_b64 exec, exec
4304; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
4305; GFX8-NEXT:    s_not_b64 exec, exec
4306; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4307; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4308; GFX8-NEXT:    s_nop 1
4309; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4310; GFX8-NEXT:    s_nop 1
4311; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4312; GFX8-NEXT:    s_nop 1
4313; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4314; GFX8-NEXT:    s_nop 1
4315; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4316; GFX8-NEXT:    s_nop 1
4317; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4318; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4319; GFX8-NEXT:    s_nop 0
4320; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4321; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4322; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4323; GFX8-NEXT:    ; implicit-def: $vgpr0
4324; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4325; GFX8-NEXT:    s_cbranch_execz .LBB17_2
4326; GFX8-NEXT:  ; %bb.1:
4327; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4328; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4329; GFX8-NEXT:    s_mov_b32 m0, -1
4330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4331; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
4332; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX8-NEXT:  .LBB17_2:
4334; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4336; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4337; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4338; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
4339; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4340; GFX8-NEXT:    s_mov_b32 s2, -1
4341; GFX8-NEXT:    s_nop 0
4342; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4343; GFX8-NEXT:    s_endpgm
4344;
4345; GFX9-LABEL: max_i32_varying:
4346; GFX9:       ; %bb.0: ; %entry
4347; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4348; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4349; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4350; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4351; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4352; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4353; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4354; GFX9-NEXT:    s_not_b64 exec, exec
4355; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
4356; GFX9-NEXT:    s_not_b64 exec, exec
4357; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4358; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4359; GFX9-NEXT:    s_nop 1
4360; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4361; GFX9-NEXT:    s_nop 1
4362; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4363; GFX9-NEXT:    s_nop 1
4364; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4365; GFX9-NEXT:    s_nop 1
4366; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4367; GFX9-NEXT:    s_nop 1
4368; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4369; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4370; GFX9-NEXT:    s_nop 0
4371; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4372; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4373; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4374; GFX9-NEXT:    ; implicit-def: $vgpr0
4375; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4376; GFX9-NEXT:    s_cbranch_execz .LBB17_2
4377; GFX9-NEXT:  ; %bb.1:
4378; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4379; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4381; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
4382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4383; GFX9-NEXT:  .LBB17_2:
4384; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4386; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4387; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4388; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
4389; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4390; GFX9-NEXT:    s_mov_b32 s2, -1
4391; GFX9-NEXT:    s_nop 0
4392; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4393; GFX9-NEXT:    s_endpgm
4394;
4395; GFX1064-LABEL: max_i32_varying:
4396; GFX1064:       ; %bb.0: ; %entry
4397; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4398; GFX1064-NEXT:    s_not_b64 exec, exec
4399; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
4400; GFX1064-NEXT:    s_not_b64 exec, exec
4401; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4402; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4403; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
4404; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4405; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4406; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4407; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4408; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4409; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4410; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4411; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4412; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4413; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4414; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4415; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4416; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4417; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4418; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4419; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4420; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4421; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4422; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4423; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4424; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4425; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4426; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4427; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4428; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4429; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4430; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4431; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4432; GFX1064-NEXT:    s_mov_b32 s2, -1
4433; GFX1064-NEXT:    ; implicit-def: $vgpr0
4434; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4435; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
4436; GFX1064-NEXT:  ; %bb.1:
4437; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4438; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4439; GFX1064-NEXT:    s_mov_b32 s3, s7
4440; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4441; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4442; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
4443; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX1064-NEXT:    buffer_gl0_inv
4445; GFX1064-NEXT:  .LBB17_2:
4446; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4447; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4448; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4449; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4450; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
4451; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4452; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4453; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4454; GFX1064-NEXT:    s_endpgm
4455;
4456; GFX1032-LABEL: max_i32_varying:
4457; GFX1032:       ; %bb.0: ; %entry
4458; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4459; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4460; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
4461; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4462; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4463; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4464; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4465; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4466; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4467; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4468; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4469; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4470; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4471; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4472; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4473; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
4474; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4475; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4476; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4477; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4478; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4479; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4480; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4481; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4482; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4483; GFX1032-NEXT:    s_mov_b32 s2, -1
4484; GFX1032-NEXT:    ; implicit-def: $vgpr0
4485; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4486; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
4487; GFX1032-NEXT:  ; %bb.1:
4488; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4489; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4490; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4491; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4492; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
4493; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4494; GFX1032-NEXT:    buffer_gl0_inv
4495; GFX1032-NEXT:  .LBB17_2:
4496; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4497; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4498; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4499; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4500; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
4501; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4502; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4503; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4504; GFX1032-NEXT:    s_endpgm
4505;
4506; GFX1164-LABEL: max_i32_varying:
4507; GFX1164:       ; %bb.0: ; %entry
4508; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4509; GFX1164-NEXT:    s_not_b64 exec, exec
4510; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
4511; GFX1164-NEXT:    s_not_b64 exec, exec
4512; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4513; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4514; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
4515; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4516; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4517; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4518; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4519; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4520; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4521; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4522; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4523; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4524; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4525; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4526; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4527; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4528; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4529; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4530; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4531; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4532; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4533; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4534; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4535; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4536; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4537; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4538; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4539; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4540; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4541; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4542; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4543; GFX1164-NEXT:    s_mov_b32 s2, -1
4544; GFX1164-NEXT:    ; implicit-def: $vgpr0
4545; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4546; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
4547; GFX1164-NEXT:  ; %bb.1:
4548; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4549; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4550; GFX1164-NEXT:    s_mov_b32 s3, s7
4551; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4552; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4553; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
4554; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4555; GFX1164-NEXT:    buffer_gl0_inv
4556; GFX1164-NEXT:  .LBB17_2:
4557; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4558; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4559; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4560; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
4561; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4562; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4563; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4564; GFX1164-NEXT:    s_endpgm
4565;
4566; GFX1132-LABEL: max_i32_varying:
4567; GFX1132:       ; %bb.0: ; %entry
4568; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4569; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4570; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
4571; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4572; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4573; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4574; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4575; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4576; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4577; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4578; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4579; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4580; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4581; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4582; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4583; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
4584; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4585; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4586; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4587; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4588; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4589; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4590; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4591; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4592; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4593; GFX1132-NEXT:    s_mov_b32 s2, -1
4594; GFX1132-NEXT:    ; implicit-def: $vgpr0
4595; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4596; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
4597; GFX1132-NEXT:  ; %bb.1:
4598; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4599; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4600; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4601; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4602; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
4603; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4604; GFX1132-NEXT:    buffer_gl0_inv
4605; GFX1132-NEXT:  .LBB17_2:
4606; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4607; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4608; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4609; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
4610; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4611; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4612; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4613; GFX1132-NEXT:    s_endpgm
4614entry:
4615  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; x workitem id, used as the atomic's operand
4616  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; signed max into @local_var32; %old is the value the atomicrmw returns
4617  store i32 %old, i32 addrspace(1)* %out ; write the returned value through the kernel's %out pointer
4618  ret void
4619}
4620
4621define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4622;
4623;
4624; GFX7LESS-LABEL: max_i64_constant:
4625; GFX7LESS:       ; %bb.0: ; %entry
4626; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4627; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4628; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4629; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4630; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4631; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4632; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4633; GFX7LESS-NEXT:  ; %bb.1:
4634; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4635; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4636; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4637; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4638; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4639; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4640; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4641; GFX7LESS-NEXT:  .LBB18_2:
4642; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4643; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4644; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4645; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4646; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4647; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4648; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4649; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4650; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4651; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4652; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4653; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4654; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4655; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4656; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4657; GFX7LESS-NEXT:    s_endpgm
4658;
4659; GFX8-LABEL: max_i64_constant:
4660; GFX8:       ; %bb.0: ; %entry
4661; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4662; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4663; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4664; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4665; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4666; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4667; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4668; GFX8-NEXT:  ; %bb.1:
4669; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4670; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4671; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4672; GFX8-NEXT:    s_mov_b32 m0, -1
4673; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4674; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4675; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4676; GFX8-NEXT:  .LBB18_2:
4677; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4678; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4679; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4680; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4681; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4682; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4683; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4684; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4685; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4686; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4687; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4688; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4689; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4690; GFX8-NEXT:    s_mov_b32 s2, -1
4691; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4692; GFX8-NEXT:    s_endpgm
4693;
4694; GFX9-LABEL: max_i64_constant:
4695; GFX9:       ; %bb.0: ; %entry
4696; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4697; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4698; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4699; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4700; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4701; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4702; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4703; GFX9-NEXT:  ; %bb.1:
4704; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4705; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4706; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4707; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4708; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4709; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4710; GFX9-NEXT:  .LBB18_2:
4711; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4712; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4713; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4714; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4715; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4716; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4717; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4718; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4719; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4720; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4721; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4722; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4723; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4724; GFX9-NEXT:    s_mov_b32 s2, -1
4725; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4726; GFX9-NEXT:    s_endpgm
4727;
4728; GFX1064-LABEL: max_i64_constant:
4729; GFX1064:       ; %bb.0: ; %entry
4730; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4731; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4732; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4733; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4734; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4735; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4736; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4737; GFX1064-NEXT:  ; %bb.1:
4738; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4739; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4740; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4741; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4742; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4743; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4744; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4745; GFX1064-NEXT:    buffer_gl0_inv
4746; GFX1064-NEXT:  .LBB18_2:
4747; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4748; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4749; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4750; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4751; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4752; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4753; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4754; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4755; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4756; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4757; GFX1064-NEXT:    s_mov_b32 s2, -1
4758; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4759; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4760; GFX1064-NEXT:    s_endpgm
4761;
4762; GFX1032-LABEL: max_i64_constant:
4763; GFX1032:       ; %bb.0: ; %entry
4764; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4765; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4766; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4767; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4768; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4769; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4770; GFX1032-NEXT:  ; %bb.1:
4771; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4772; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4773; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4774; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4775; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4776; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4777; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4778; GFX1032-NEXT:    buffer_gl0_inv
4779; GFX1032-NEXT:  .LBB18_2:
4780; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4781; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4782; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4783; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4784; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4785; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4786; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4787; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4788; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4789; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4790; GFX1032-NEXT:    s_mov_b32 s2, -1
4791; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4792; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4793; GFX1032-NEXT:    s_endpgm
4794;
4795; GFX1164-LABEL: max_i64_constant:
4796; GFX1164:       ; %bb.0: ; %entry
4797; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4798; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4799; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4800; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4801; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
4802; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4803; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
4804; GFX1164-NEXT:  ; %bb.1:
4805; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
4806; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4807; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
4808; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4809; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4810; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4811; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4812; GFX1164-NEXT:    buffer_gl0_inv
4813; GFX1164-NEXT:  .LBB18_2:
4814; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
4815; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
4816; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
4817; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4818; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4819; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4820; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4821; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4822; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4823; GFX1164-NEXT:    s_mov_b32 s2, -1
4824; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4825; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4826; GFX1164-NEXT:    s_endpgm
4827;
4828; GFX1132-LABEL: max_i64_constant:
4829; GFX1132:       ; %bb.0: ; %entry
4830; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4831; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4832; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4833; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
4834; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4835; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
4836; GFX1132-NEXT:  ; %bb.1:
4837; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
4838; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4839; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
4840; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4841; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4842; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4843; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4844; GFX1132-NEXT:    buffer_gl0_inv
4845; GFX1132-NEXT:  .LBB18_2:
4846; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4847; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
4848; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
4849; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4850; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4851; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4852; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4853; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4854; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4855; GFX1132-NEXT:    s_mov_b32 s2, -1
4856; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4857; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4858; GFX1132-NEXT:    s_endpgm
4859entry:
4860  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
4861  store i64 %old, i64 addrspace(1)* %out
4862  ret void
4863}
4864
4865define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
4866; Kernel under test: every work-item passes its own workitem.id.x to a signed "atomicrmw min" on the local (addrspace(3)) global @local_var32 and stores the returned old value to %out.
4867; The gfx7less checks keep one plain ds_min_rtn_i32; the other run lines check a v_min_i32 DPP scan, a single ds_min_rtn_i32 guarded by the mbcnt == 0 compare, then v_readfirstlane + v_min_i32.
4868; GFX7LESS-LABEL: min_i32_varying:
4869; GFX7LESS:       ; %bb.0: ; %entry
4870; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4871; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4872; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4873; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4874; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
4875; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4876; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4877; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4878; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4879; GFX7LESS-NEXT:    s_endpgm
4880;
4881; GFX8-LABEL: min_i32_varying:
4882; GFX8:       ; %bb.0: ; %entry
4883; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4884; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4885; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4886; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4887; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
4888; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4889; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4890; GFX8-NEXT:    s_not_b64 exec, exec
4891; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
4892; GFX8-NEXT:    s_not_b64 exec, exec
4893; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4894; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4895; GFX8-NEXT:    s_nop 1
4896; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4897; GFX8-NEXT:    s_nop 1
4898; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4899; GFX8-NEXT:    s_nop 1
4900; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4901; GFX8-NEXT:    s_nop 1
4902; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4903; GFX8-NEXT:    s_nop 1
4904; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4905; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4906; GFX8-NEXT:    s_nop 0
4907; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4908; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4909; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4910; GFX8-NEXT:    ; implicit-def: $vgpr0
4911; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4912; GFX8-NEXT:    s_cbranch_execz .LBB19_2
4913; GFX8-NEXT:  ; %bb.1:
4914; GFX8-NEXT:    v_mov_b32_e32 v0, 0
4915; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4916; GFX8-NEXT:    s_mov_b32 m0, -1
4917; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4918; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
4919; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4920; GFX8-NEXT:  .LBB19_2:
4921; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4922; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4923; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4924; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4925; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
4926; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4927; GFX8-NEXT:    s_mov_b32 s2, -1
4928; GFX8-NEXT:    s_nop 0
4929; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4930; GFX8-NEXT:    s_endpgm
4931;
4932; GFX9-LABEL: min_i32_varying:
4933; GFX9:       ; %bb.0: ; %entry
4934; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4935; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4936; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4937; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4938; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
4939; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4940; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4941; GFX9-NEXT:    s_not_b64 exec, exec
4942; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
4943; GFX9-NEXT:    s_not_b64 exec, exec
4944; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4945; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4946; GFX9-NEXT:    s_nop 1
4947; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4948; GFX9-NEXT:    s_nop 1
4949; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4950; GFX9-NEXT:    s_nop 1
4951; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4952; GFX9-NEXT:    s_nop 1
4953; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4954; GFX9-NEXT:    s_nop 1
4955; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4956; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4957; GFX9-NEXT:    s_nop 0
4958; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4959; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4960; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4961; GFX9-NEXT:    ; implicit-def: $vgpr0
4962; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4963; GFX9-NEXT:    s_cbranch_execz .LBB19_2
4964; GFX9-NEXT:  ; %bb.1:
4965; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4966; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4967; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4968; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
4969; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4970; GFX9-NEXT:  .LBB19_2:
4971; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4972; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4973; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4974; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4975; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
4976; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4977; GFX9-NEXT:    s_mov_b32 s2, -1
4978; GFX9-NEXT:    s_nop 0
4979; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4980; GFX9-NEXT:    s_endpgm
4981;
4982; GFX1064-LABEL: min_i32_varying:
4983; GFX1064:       ; %bb.0: ; %entry
4984; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4985; GFX1064-NEXT:    s_not_b64 exec, exec
4986; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
4987; GFX1064-NEXT:    s_not_b64 exec, exec
4988; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4989; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4990; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
4991; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4992; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4993; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4994; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4995; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4996; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4997; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4998; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4999; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5000; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5001; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5002; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5003; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5004; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5005; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5006; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5007; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5008; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5009; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5010; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5011; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5012; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5013; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5014; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5015; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5016; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5017; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5018; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5019; GFX1064-NEXT:    s_mov_b32 s2, -1
5020; GFX1064-NEXT:    ; implicit-def: $vgpr0
5021; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5022; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
5023; GFX1064-NEXT:  ; %bb.1:
5024; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5025; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5026; GFX1064-NEXT:    s_mov_b32 s3, s7
5027; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5028; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5029; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
5030; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5031; GFX1064-NEXT:    buffer_gl0_inv
5032; GFX1064-NEXT:  .LBB19_2:
5033; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5034; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5035; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5036; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5037; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
5038; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5039; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5040; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5041; GFX1064-NEXT:    s_endpgm
5042;
5043; GFX1032-LABEL: min_i32_varying:
5044; GFX1032:       ; %bb.0: ; %entry
5045; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5046; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5047; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
5048; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5049; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5050; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5051; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5052; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5053; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5054; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5055; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5056; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5057; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5058; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5059; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5060; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
5061; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5062; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5063; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5064; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5065; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5066; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5067; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5068; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5069; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5070; GFX1032-NEXT:    s_mov_b32 s2, -1
5071; GFX1032-NEXT:    ; implicit-def: $vgpr0
5072; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5073; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
5074; GFX1032-NEXT:  ; %bb.1:
5075; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5076; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5077; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5078; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5079; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
5080; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5081; GFX1032-NEXT:    buffer_gl0_inv
5082; GFX1032-NEXT:  .LBB19_2:
5083; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5084; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5085; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5086; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5087; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
5088; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5089; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5090; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5091; GFX1032-NEXT:    s_endpgm
5092;
5093; GFX1164-LABEL: min_i32_varying:
5094; GFX1164:       ; %bb.0: ; %entry
5095; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5096; GFX1164-NEXT:    s_not_b64 exec, exec
5097; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
5098; GFX1164-NEXT:    s_not_b64 exec, exec
5099; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5100; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5101; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
5102; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5103; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5104; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5105; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5106; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5107; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5108; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5109; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5110; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5111; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5112; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5113; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5114; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5115; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5116; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5117; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5118; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5119; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5120; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5121; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5122; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5123; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5124; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5125; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5126; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5127; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5128; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5129; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5130; GFX1164-NEXT:    s_mov_b32 s2, -1
5131; GFX1164-NEXT:    ; implicit-def: $vgpr0
5132; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5133; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
5134; GFX1164-NEXT:  ; %bb.1:
5135; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5136; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5137; GFX1164-NEXT:    s_mov_b32 s3, s7
5138; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5139; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5140; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
5141; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5142; GFX1164-NEXT:    buffer_gl0_inv
5143; GFX1164-NEXT:  .LBB19_2:
5144; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5145; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5146; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5147; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
5148; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5149; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5150; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5151; GFX1164-NEXT:    s_endpgm
5152;
5153; GFX1132-LABEL: min_i32_varying:
5154; GFX1132:       ; %bb.0: ; %entry
5155; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5156; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5157; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
5158; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5159; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5160; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5161; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5162; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5163; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5164; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5165; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5166; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5167; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5168; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5169; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5170; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
5171; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5172; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5173; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5174; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5175; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5176; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5177; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5178; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5179; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5180; GFX1132-NEXT:    s_mov_b32 s2, -1
5181; GFX1132-NEXT:    ; implicit-def: $vgpr0
5182; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5183; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
5184; GFX1132-NEXT:  ; %bb.1:
5185; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5186; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5187; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5188; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5189; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
5190; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5191; GFX1132-NEXT:    buffer_gl0_inv
5192; GFX1132-NEXT:  .LBB19_2:
5193; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5194; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5195; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5196; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
5197; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5198; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5199; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5200; GFX1132-NEXT:    s_endpgm
5201entry:
5202  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; operand differs per work-item, hence "varying" in the kernel name
5203  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; signed min; %old receives the value held before the update
5204  store i32 %old, i32 addrspace(1)* %out ; the atomic's result is consumed here, matching the _rtn DS opcodes in the checks
5205  ret void
5206}
5207
5208define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
5209; Kernel under test: a signed 64-bit "atomicrmw min" of the constant 5 on the local (addrspace(3)) global @local_var64, with the returned old value stored to %out.
5210; The operand is a constant and no DPP scan appears in the checks: each run line expects one ds_min_rtn_i64 guarded by the mbcnt == 0 compare, then two v_readfirstlane and a v_cmp_lt_i64 / v_cndmask select.
5211; GFX7LESS-LABEL: min_i64_constant:
5212; GFX7LESS:       ; %bb.0: ; %entry
5213; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5214; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5215; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5216; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5217; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5218; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5219; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
5220; GFX7LESS-NEXT:  ; %bb.1:
5221; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5222; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5223; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5224; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5225; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5226; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5227; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5228; GFX7LESS-NEXT:  .LBB20_2:
5229; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5230; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5231; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5232; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5233; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
5234; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5235; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5236; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5237; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
5238; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
5239; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5240; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5241; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5242; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5243; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5244; GFX7LESS-NEXT:    s_endpgm
5245;
5246; GFX8-LABEL: min_i64_constant:
5247; GFX8:       ; %bb.0: ; %entry
5248; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5249; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5250; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5251; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5252; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5253; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5254; GFX8-NEXT:    s_cbranch_execz .LBB20_2
5255; GFX8-NEXT:  ; %bb.1:
5256; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5257; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5258; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5259; GFX8-NEXT:    s_mov_b32 m0, -1
5260; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5261; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5262; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5263; GFX8-NEXT:  .LBB20_2:
5264; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5265; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5266; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5267; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
5268; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5269; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5270; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5271; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5272; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5273; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5274; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5275; GFX8-NEXT:    s_mov_b32 s2, -1
5276; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5277; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5278; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5279; GFX8-NEXT:    s_endpgm
5280;
5281; GFX9-LABEL: min_i64_constant:
5282; GFX9:       ; %bb.0: ; %entry
5283; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5284; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5285; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5286; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5287; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5288; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5289; GFX9-NEXT:    s_cbranch_execz .LBB20_2
5290; GFX9-NEXT:  ; %bb.1:
5291; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5292; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5293; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5294; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5295; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5297; GFX9-NEXT:  .LBB20_2:
5298; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5299; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5300; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5301; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
5302; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5303; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
5304; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5305; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
5306; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5307; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5308; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5309; GFX9-NEXT:    s_mov_b32 s2, -1
5310; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5311; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5312; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5313; GFX9-NEXT:    s_endpgm
5314;
5315; GFX1064-LABEL: min_i64_constant:
5316; GFX1064:       ; %bb.0: ; %entry
5317; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5318; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5319; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5320; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5321; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5322; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5323; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
5324; GFX1064-NEXT:  ; %bb.1:
5325; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5326; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5327; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5328; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5329; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5330; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5331; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5332; GFX1064-NEXT:    buffer_gl0_inv
5333; GFX1064-NEXT:  .LBB20_2:
5334; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5335; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5336; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5337; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5338; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5339; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5340; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5341; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5342; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5343; GFX1064-NEXT:    s_mov_b32 s2, -1
5344; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5345; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5346; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5347; GFX1064-NEXT:    s_endpgm
5348;
5349; GFX1032-LABEL: min_i64_constant:
5350; GFX1032:       ; %bb.0: ; %entry
5351; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5352; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5353; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5354; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5355; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5356; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
5357; GFX1032-NEXT:  ; %bb.1:
5358; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5359; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5360; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5361; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5362; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5363; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5364; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5365; GFX1032-NEXT:    buffer_gl0_inv
5366; GFX1032-NEXT:  .LBB20_2:
5367; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5368; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5369; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5370; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5371; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5372; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5373; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5374; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5375; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5376; GFX1032-NEXT:    s_mov_b32 s2, -1
5377; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5378; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5379; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5380; GFX1032-NEXT:    s_endpgm
5381;
5382; GFX1164-LABEL: min_i64_constant:
5383; GFX1164:       ; %bb.0: ; %entry
5384; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5385; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5386; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5387; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5388; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5389; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5390; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
5391; GFX1164-NEXT:  ; %bb.1:
5392; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5393; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5394; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5395; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5396; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5397; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5398; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5399; GFX1164-NEXT:    buffer_gl0_inv
5400; GFX1164-NEXT:  .LBB20_2:
5401; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5402; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5403; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5404; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
5405; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
5406; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5407; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
5408; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5409; GFX1164-NEXT:    s_mov_b32 s2, -1
5410; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5411; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5412; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5413; GFX1164-NEXT:    s_endpgm
5414;
5415; GFX1132-LABEL: min_i64_constant:
5416; GFX1132:       ; %bb.0: ; %entry
5417; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5418; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5419; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5420; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5421; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5422; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
5423; GFX1132-NEXT:  ; %bb.1:
5424; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
5425; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5426; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
5427; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5428; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5429; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
5430; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5431; GFX1132-NEXT:    buffer_gl0_inv
5432; GFX1132-NEXT:  .LBB20_2:
5433; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5434; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5435; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5436; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
5437; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
5438; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
5439; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
5440; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5441; GFX1132-NEXT:    s_mov_b32 s2, -1
5442; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5443; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5444; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5445; GFX1132-NEXT:    s_endpgm
5446entry:
5447  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel ; signed min against the constant 5; %old receives the value held before the update
5448  store i64 %old, i64 addrspace(1)* %out ; the atomic's result is consumed here, matching the _rtn DS opcodes in the checks
5449  ret void
5450}
5451
5452define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5453; Unsigned-max atomicrmw (acq_rel) on the addrspace(3) global @local_var32 whose operand is the result of llvm.amdgcn.workitem.id.x; the returned old value is stored to %out.
5454; The GFX7LESS assertions keep a plain ds_max_rtn_u32; the remaining runs expect a v_max_u32_dpp scan, a single ds_max_rtn_u32 guarded by an mbcnt == 0 compare, then v_readfirstlane_b32 and v_max_u32 to form the stored value.
5455; GFX7LESS-LABEL: umax_i32_varying:
5456; GFX7LESS:       ; %bb.0: ; %entry
5457; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5458; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5459; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5460; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5461; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5462; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5463; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5464; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5465; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5466; GFX7LESS-NEXT:    s_endpgm
5467;
5468; GFX8-LABEL: umax_i32_varying:
5469; GFX8:       ; %bb.0: ; %entry
5470; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5471; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5472; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5473; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5474; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5475; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5476; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5477; GFX8-NEXT:    s_not_b64 exec, exec
5478; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5479; GFX8-NEXT:    s_not_b64 exec, exec
5480; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5481; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5482; GFX8-NEXT:    s_nop 1
5483; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5484; GFX8-NEXT:    s_nop 1
5485; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5486; GFX8-NEXT:    s_nop 1
5487; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5488; GFX8-NEXT:    s_nop 1
5489; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5490; GFX8-NEXT:    s_nop 1
5491; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5492; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5493; GFX8-NEXT:    s_nop 0
5494; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5495; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5496; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5497; GFX8-NEXT:    ; implicit-def: $vgpr0
5498; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5499; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5500; GFX8-NEXT:  ; %bb.1:
5501; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5502; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5503; GFX8-NEXT:    s_mov_b32 m0, -1
5504; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5505; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5506; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5507; GFX8-NEXT:  .LBB21_2:
5508; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5509; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5510; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5511; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5512; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5513; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5514; GFX8-NEXT:    s_mov_b32 s2, -1
5515; GFX8-NEXT:    s_nop 0
5516; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5517; GFX8-NEXT:    s_endpgm
5518;
5519; GFX9-LABEL: umax_i32_varying:
5520; GFX9:       ; %bb.0: ; %entry
5521; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5522; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5523; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5524; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5525; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5526; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5527; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5528; GFX9-NEXT:    s_not_b64 exec, exec
5529; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5530; GFX9-NEXT:    s_not_b64 exec, exec
5531; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5532; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5533; GFX9-NEXT:    s_nop 1
5534; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5535; GFX9-NEXT:    s_nop 1
5536; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5537; GFX9-NEXT:    s_nop 1
5538; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5539; GFX9-NEXT:    s_nop 1
5540; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5541; GFX9-NEXT:    s_nop 1
5542; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5543; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5544; GFX9-NEXT:    s_nop 0
5545; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5546; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5547; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5548; GFX9-NEXT:    ; implicit-def: $vgpr0
5549; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5550; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5551; GFX9-NEXT:  ; %bb.1:
5552; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5553; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5554; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5555; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5557; GFX9-NEXT:  .LBB21_2:
5558; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5559; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5560; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5561; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5562; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5563; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5564; GFX9-NEXT:    s_mov_b32 s2, -1
5565; GFX9-NEXT:    s_nop 0
5566; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5567; GFX9-NEXT:    s_endpgm
5568;
5569; GFX1064-LABEL: umax_i32_varying:
5570; GFX1064:       ; %bb.0: ; %entry
5571; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5572; GFX1064-NEXT:    s_not_b64 exec, exec
5573; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5574; GFX1064-NEXT:    s_not_b64 exec, exec
5575; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5576; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5577; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5578; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5579; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5580; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5581; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5582; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5583; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5584; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5585; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5586; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5587; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5588; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5589; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5590; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5591; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5592; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5593; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5594; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5595; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5596; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5597; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5598; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5599; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5600; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5601; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5602; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5603; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5604; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5605; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5606; GFX1064-NEXT:    s_mov_b32 s2, -1
5607; GFX1064-NEXT:    ; implicit-def: $vgpr0
5608; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5609; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5610; GFX1064-NEXT:  ; %bb.1:
5611; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5612; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5613; GFX1064-NEXT:    s_mov_b32 s3, s7
5614; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5615; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5616; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5617; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5618; GFX1064-NEXT:    buffer_gl0_inv
5619; GFX1064-NEXT:  .LBB21_2:
5620; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5621; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5622; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5623; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5624; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5625; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5626; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5627; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5628; GFX1064-NEXT:    s_endpgm
5629;
5630; GFX1032-LABEL: umax_i32_varying:
5631; GFX1032:       ; %bb.0: ; %entry
5632; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5633; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5634; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5635; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5636; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5637; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5638; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5639; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5640; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5641; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5642; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5643; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5644; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5645; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5646; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5647; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5648; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5649; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5650; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5651; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5652; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5653; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5654; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5655; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5656; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5657; GFX1032-NEXT:    s_mov_b32 s2, -1
5658; GFX1032-NEXT:    ; implicit-def: $vgpr0
5659; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5660; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5661; GFX1032-NEXT:  ; %bb.1:
5662; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5663; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5664; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5665; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5666; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5667; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5668; GFX1032-NEXT:    buffer_gl0_inv
5669; GFX1032-NEXT:  .LBB21_2:
5670; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5671; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5672; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5673; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5674; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5675; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5676; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5677; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5678; GFX1032-NEXT:    s_endpgm
5679;
5680; GFX1164-LABEL: umax_i32_varying:
5681; GFX1164:       ; %bb.0: ; %entry
5682; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5683; GFX1164-NEXT:    s_not_b64 exec, exec
5684; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5685; GFX1164-NEXT:    s_not_b64 exec, exec
5686; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5687; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5688; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5689; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5690; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5691; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5692; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5693; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5694; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5695; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5696; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5697; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5698; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5699; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5700; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5701; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5702; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5703; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5704; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5705; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5706; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5707; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5708; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5709; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5710; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5711; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5712; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5713; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5714; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5715; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5716; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5717; GFX1164-NEXT:    s_mov_b32 s2, -1
5718; GFX1164-NEXT:    ; implicit-def: $vgpr0
5719; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5720; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5721; GFX1164-NEXT:  ; %bb.1:
5722; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5723; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5724; GFX1164-NEXT:    s_mov_b32 s3, s7
5725; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5726; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5727; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5728; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5729; GFX1164-NEXT:    buffer_gl0_inv
5730; GFX1164-NEXT:  .LBB21_2:
5731; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5732; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5733; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5734; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5735; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5736; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5737; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5738; GFX1164-NEXT:    s_endpgm
5739;
5740; GFX1132-LABEL: umax_i32_varying:
5741; GFX1132:       ; %bb.0: ; %entry
5742; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5743; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5744; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5745; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5746; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5747; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5748; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5749; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5750; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5751; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5752; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5753; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5754; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5755; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5756; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5757; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5758; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5759; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5760; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5761; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5762; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5763; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5764; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5765; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5766; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5767; GFX1132-NEXT:    s_mov_b32 s2, -1
5768; GFX1132-NEXT:    ; implicit-def: $vgpr0
5769; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5770; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
5771; GFX1132-NEXT:  ; %bb.1:
5772; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5773; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5774; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5775; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5776; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
5777; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5778; GFX1132-NEXT:    buffer_gl0_inv
5779; GFX1132-NEXT:  .LBB21_2:
5780; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5781; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5782; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5783; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
5784; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5785; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5786; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5787; GFX1132-NEXT:    s_endpgm
5788entry:
5789  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5790  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5791  store i32 %old, i32 addrspace(1)* %out
5792  ret void
5793}
5794
5795define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
5796; Unsigned-max atomicrmw (acq_rel) of the constant 5 on the addrspace(3) global @local_var64; the returned old value is stored to %out.
5797; Every run expects a single ds_max_rtn_u64 guarded by an mbcnt == 0 compare, then v_readfirstlane_b32 of both result halves and a v_cmp_gt_u64 / v_cndmask_b32 select to form the stored value.
5798; GFX7LESS-LABEL: umax_i64_constant:
5799; GFX7LESS:       ; %bb.0: ; %entry
5800; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5801; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5802; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5803; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5804; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5805; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5806; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
5807; GFX7LESS-NEXT:  ; %bb.1:
5808; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5809; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5810; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5811; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5812; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5813; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5814; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5815; GFX7LESS-NEXT:  .LBB22_2:
5816; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5817; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5818; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5819; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5820; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5821; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5822; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5823; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
5824; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
5825; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5826; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
5827; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5828; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5829; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5830; GFX7LESS-NEXT:    s_endpgm
5831;
5832; GFX8-LABEL: umax_i64_constant:
5833; GFX8:       ; %bb.0: ; %entry
5834; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5835; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5836; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5837; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5838; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5839; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5840; GFX8-NEXT:    s_cbranch_execz .LBB22_2
5841; GFX8-NEXT:  ; %bb.1:
5842; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5843; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5844; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5845; GFX8-NEXT:    s_mov_b32 m0, -1
5846; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5847; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5848; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5849; GFX8-NEXT:  .LBB22_2:
5850; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5851; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5852; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5853; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
5854; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5855; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5856; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5857; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5858; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5859; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5860; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5861; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5862; GFX8-NEXT:    s_mov_b32 s2, -1
5863; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5864; GFX8-NEXT:    s_endpgm
5865;
5866; GFX9-LABEL: umax_i64_constant:
5867; GFX9:       ; %bb.0: ; %entry
5868; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5869; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5870; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5871; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5872; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5873; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5874; GFX9-NEXT:    s_cbranch_execz .LBB22_2
5875; GFX9-NEXT:  ; %bb.1:
5876; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5877; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5878; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5879; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5880; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5881; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5882; GFX9-NEXT:  .LBB22_2:
5883; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5884; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5885; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5886; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
5887; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5888; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5889; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5890; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5891; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5892; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5893; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5894; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5895; GFX9-NEXT:    s_mov_b32 s2, -1
5896; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5897; GFX9-NEXT:    s_endpgm
5898;
5899; GFX1064-LABEL: umax_i64_constant:
5900; GFX1064:       ; %bb.0: ; %entry
5901; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5902; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5903; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5904; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5905; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5906; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5907; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
5908; GFX1064-NEXT:  ; %bb.1:
5909; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5910; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5911; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5912; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5913; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5914; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5915; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5916; GFX1064-NEXT:    buffer_gl0_inv
5917; GFX1064-NEXT:  .LBB22_2:
5918; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5919; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5920; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5921; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5922; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5923; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5924; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5925; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5926; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5927; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5928; GFX1064-NEXT:    s_mov_b32 s2, -1
5929; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5930; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5931; GFX1064-NEXT:    s_endpgm
5932;
5933; GFX1032-LABEL: umax_i64_constant:
5934; GFX1032:       ; %bb.0: ; %entry
5935; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5936; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5937; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5938; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5939; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5940; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
5941; GFX1032-NEXT:  ; %bb.1:
5942; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5943; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5944; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5945; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5946; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5947; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5948; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5949; GFX1032-NEXT:    buffer_gl0_inv
5950; GFX1032-NEXT:  .LBB22_2:
5951; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5952; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5953; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5954; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5955; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5956; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5957; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
5958; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5959; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
5960; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5961; GFX1032-NEXT:    s_mov_b32 s2, -1
5962; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5963; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5964; GFX1032-NEXT:    s_endpgm
5965;
5966; GFX1164-LABEL: umax_i64_constant:
5967; GFX1164:       ; %bb.0: ; %entry
5968; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5969; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5970; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5971; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5972; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5973; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5974; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
5975; GFX1164-NEXT:  ; %bb.1:
5976; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5977; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5978; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5979; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5980; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5981; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5982; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5983; GFX1164-NEXT:    buffer_gl0_inv
5984; GFX1164-NEXT:  .LBB22_2:
5985; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5986; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5987; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5988; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5989; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5990; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5991; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5992; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5993; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5994; GFX1164-NEXT:    s_mov_b32 s2, -1
5995; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5996; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5997; GFX1164-NEXT:    s_endpgm
5998;
5999; GFX1132-LABEL: umax_i64_constant:
6000; GFX1132:       ; %bb.0: ; %entry
6001; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6002; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6003; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6004; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6005; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6006; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6007; GFX1132-NEXT:  ; %bb.1:
6008; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6009; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6010; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6011; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6012; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6013; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6014; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6015; GFX1132-NEXT:    buffer_gl0_inv
6016; GFX1132-NEXT:  .LBB22_2:
6017; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6018; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6019; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6020; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6021; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6022; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6023; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6024; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6025; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6026; GFX1132-NEXT:    s_mov_b32 s2, -1
6027; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6028; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6029; GFX1132-NEXT:    s_endpgm
6030entry:
6031  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
6032  store i64 %old, i64 addrspace(1)* %out
6033  ret void
6034}
6035
6036define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6037;
6038;
6039; GFX7LESS-LABEL: umin_i32_varying:
6040; GFX7LESS:       ; %bb.0: ; %entry
6041; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6042; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6043; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6044; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6045; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6046; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6047; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6048; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6049; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6050; GFX7LESS-NEXT:    s_endpgm
6051;
6052; GFX8-LABEL: umin_i32_varying:
6053; GFX8:       ; %bb.0: ; %entry
6054; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6055; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6056; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6057; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6058; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6059; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6060; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6061; GFX8-NEXT:    s_not_b64 exec, exec
6062; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6063; GFX8-NEXT:    s_not_b64 exec, exec
6064; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6065; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6066; GFX8-NEXT:    s_nop 1
6067; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6068; GFX8-NEXT:    s_nop 1
6069; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6070; GFX8-NEXT:    s_nop 1
6071; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6072; GFX8-NEXT:    s_nop 1
6073; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6074; GFX8-NEXT:    s_nop 1
6075; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6076; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6077; GFX8-NEXT:    s_nop 0
6078; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6079; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6080; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6081; GFX8-NEXT:    ; implicit-def: $vgpr0
6082; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6083; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6084; GFX8-NEXT:  ; %bb.1:
6085; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6086; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6087; GFX8-NEXT:    s_mov_b32 m0, -1
6088; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6089; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6090; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6091; GFX8-NEXT:  .LBB23_2:
6092; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6093; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6094; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6095; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6096; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6097; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6098; GFX8-NEXT:    s_mov_b32 s2, -1
6099; GFX8-NEXT:    s_nop 0
6100; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6101; GFX8-NEXT:    s_endpgm
6102;
6103; GFX9-LABEL: umin_i32_varying:
6104; GFX9:       ; %bb.0: ; %entry
6105; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6106; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6107; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6108; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6109; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6110; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6111; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6112; GFX9-NEXT:    s_not_b64 exec, exec
6113; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6114; GFX9-NEXT:    s_not_b64 exec, exec
6115; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6116; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6117; GFX9-NEXT:    s_nop 1
6118; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6119; GFX9-NEXT:    s_nop 1
6120; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6121; GFX9-NEXT:    s_nop 1
6122; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6123; GFX9-NEXT:    s_nop 1
6124; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6125; GFX9-NEXT:    s_nop 1
6126; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6127; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6128; GFX9-NEXT:    s_nop 0
6129; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6130; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6131; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6132; GFX9-NEXT:    ; implicit-def: $vgpr0
6133; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6134; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6135; GFX9-NEXT:  ; %bb.1:
6136; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6137; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6138; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6139; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6140; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6141; GFX9-NEXT:  .LBB23_2:
6142; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6143; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6144; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6145; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6146; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6147; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6148; GFX9-NEXT:    s_mov_b32 s2, -1
6149; GFX9-NEXT:    s_nop 0
6150; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6151; GFX9-NEXT:    s_endpgm
6152;
6153; GFX1064-LABEL: umin_i32_varying:
6154; GFX1064:       ; %bb.0: ; %entry
6155; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6156; GFX1064-NEXT:    s_not_b64 exec, exec
6157; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6158; GFX1064-NEXT:    s_not_b64 exec, exec
6159; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6160; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6161; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6162; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6163; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6164; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6165; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6166; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6167; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6168; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6169; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6170; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6171; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6172; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6173; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6174; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6175; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6176; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6177; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6178; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6179; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6180; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6181; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6182; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6183; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6184; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6185; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6186; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6187; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6188; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6189; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6190; GFX1064-NEXT:    s_mov_b32 s2, -1
6191; GFX1064-NEXT:    ; implicit-def: $vgpr0
6192; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6193; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6194; GFX1064-NEXT:  ; %bb.1:
6195; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6196; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6197; GFX1064-NEXT:    s_mov_b32 s3, s7
6198; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6199; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6200; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6201; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6202; GFX1064-NEXT:    buffer_gl0_inv
6203; GFX1064-NEXT:  .LBB23_2:
6204; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6205; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6206; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6207; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6208; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6209; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6210; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6211; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6212; GFX1064-NEXT:    s_endpgm
6213;
6214; GFX1032-LABEL: umin_i32_varying:
6215; GFX1032:       ; %bb.0: ; %entry
6216; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6217; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6218; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6219; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6220; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6221; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6222; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6223; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6224; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6225; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6226; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6227; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6228; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6229; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6230; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6231; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6232; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6233; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6234; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6235; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6236; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6237; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6238; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6239; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6240; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6241; GFX1032-NEXT:    s_mov_b32 s2, -1
6242; GFX1032-NEXT:    ; implicit-def: $vgpr0
6243; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6244; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6245; GFX1032-NEXT:  ; %bb.1:
6246; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6247; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6248; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6249; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6250; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6251; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6252; GFX1032-NEXT:    buffer_gl0_inv
6253; GFX1032-NEXT:  .LBB23_2:
6254; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6255; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6256; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6257; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6258; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6259; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6260; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6261; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6262; GFX1032-NEXT:    s_endpgm
6263;
6264; GFX1164-LABEL: umin_i32_varying:
6265; GFX1164:       ; %bb.0: ; %entry
6266; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6267; GFX1164-NEXT:    s_not_b64 exec, exec
6268; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6269; GFX1164-NEXT:    s_not_b64 exec, exec
6270; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6271; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6272; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6273; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6274; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6275; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6276; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6277; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6278; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6279; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6280; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6281; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6282; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6283; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6284; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6285; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6286; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6287; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6288; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6289; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6290; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6291; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6292; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6293; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6294; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6295; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6296; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6297; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6298; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6299; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6300; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6301; GFX1164-NEXT:    s_mov_b32 s2, -1
6302; GFX1164-NEXT:    ; implicit-def: $vgpr0
6303; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6304; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6305; GFX1164-NEXT:  ; %bb.1:
6306; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6307; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6308; GFX1164-NEXT:    s_mov_b32 s3, s7
6309; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6310; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6311; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6312; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6313; GFX1164-NEXT:    buffer_gl0_inv
6314; GFX1164-NEXT:  .LBB23_2:
6315; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6316; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6317; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6318; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6319; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6320; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6321; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6322; GFX1164-NEXT:    s_endpgm
6323;
6324; GFX1132-LABEL: umin_i32_varying:
6325; GFX1132:       ; %bb.0: ; %entry
6326; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6327; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6328; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6329; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6330; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6331; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6332; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6333; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6334; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6335; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6336; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6337; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6338; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6339; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6340; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6341; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6342; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6343; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6344; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6345; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6346; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6347; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6348; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6349; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6350; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6351; GFX1132-NEXT:    s_mov_b32 s2, -1
6352; GFX1132-NEXT:    ; implicit-def: $vgpr0
6353; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6354; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6355; GFX1132-NEXT:  ; %bb.1:
6356; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6357; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6358; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6359; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6360; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6361; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6362; GFX1132-NEXT:    buffer_gl0_inv
6363; GFX1132-NEXT:  .LBB23_2:
6364; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6365; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6366; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6367; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6368; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6369; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6370; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6371; GFX1132-NEXT:    s_endpgm
6372entry:
6373  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6374  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6375  store i32 %old, i32 addrspace(1)* %out
6376  ret void
6377}
6378
; umin_i64_constant: 64-bit unsigned-min atomic on a local (addrspace(3)) variable
; with a constant operand.
;
; The kernel issues `atomicrmw umin` on @local_var64 with the constant 5 and
; acq_rel ordering, then stores the returned (previous) value to %out.
;
; The check lines below are autogenerated (see the NOTE on the first line of this
; file) - one group per FileCheck prefix used by the RUN lines. Regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
;
; As recorded in the checks, every target is expected to:
;   * compute v_mbcnt over exec and compare it with 0, so that only the lane
;     whose count is 0 enters %bb.1 (wave32 prefixes use v_mbcnt_lo only),
;   * issue a single ds_min_rtn_u64 with the operand 5 inside that block,
;   * read the returned value back with v_readfirstlane_b32 after exec is
;     restored,
;   * pick a per-lane 64-bit value with v_cndmask (0:5 or -1:-1 depending on the
;     mbcnt compare result), combine it with the value read back using
;     v_cmp_lt_u64 + v_cndmask, and store the result to %out.
6379define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6380;
6381;
6382; GFX7LESS-LABEL: umin_i64_constant:
6383; GFX7LESS:       ; %bb.0: ; %entry
6384; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6385; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6386; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6387; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6388; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6389; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6390; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6391; GFX7LESS-NEXT:  ; %bb.1:
6392; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6393; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6394; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6395; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6396; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6397; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6398; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6399; GFX7LESS-NEXT:  .LBB24_2:
6400; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6401; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6402; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6403; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6404; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6405; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6406; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6407; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6408; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6409; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6410; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6411; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6412; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6413; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6414; GFX7LESS-NEXT:    s_endpgm
6415;
6416; GFX8-LABEL: umin_i64_constant:
6417; GFX8:       ; %bb.0: ; %entry
6418; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6419; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6420; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6421; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6422; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6423; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6424; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6425; GFX8-NEXT:  ; %bb.1:
6426; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6427; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6428; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6429; GFX8-NEXT:    s_mov_b32 m0, -1
6430; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6431; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6432; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6433; GFX8-NEXT:  .LBB24_2:
6434; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6435; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6436; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6437; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6438; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6439; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6440; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6441; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6442; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6443; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6444; GFX8-NEXT:    s_mov_b32 s2, -1
6445; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6446; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6447; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6448; GFX8-NEXT:    s_endpgm
6449;
6450; GFX9-LABEL: umin_i64_constant:
6451; GFX9:       ; %bb.0: ; %entry
6452; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6453; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6454; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6455; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6456; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6457; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6458; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6459; GFX9-NEXT:  ; %bb.1:
6460; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6461; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6462; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6463; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6464; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6465; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6466; GFX9-NEXT:  .LBB24_2:
6467; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6468; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6469; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6470; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6471; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6472; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6473; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6474; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6475; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6476; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6477; GFX9-NEXT:    s_mov_b32 s2, -1
6478; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6479; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6480; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6481; GFX9-NEXT:    s_endpgm
6482;
6483; GFX1064-LABEL: umin_i64_constant:
6484; GFX1064:       ; %bb.0: ; %entry
6485; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6486; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6487; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6488; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6489; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6490; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6491; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6492; GFX1064-NEXT:  ; %bb.1:
6493; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6494; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6495; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6496; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6497; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6498; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6499; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6500; GFX1064-NEXT:    buffer_gl0_inv
6501; GFX1064-NEXT:  .LBB24_2:
6502; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6503; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6504; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6505; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6506; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6507; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6508; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6509; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6510; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6511; GFX1064-NEXT:    s_mov_b32 s2, -1
6512; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6513; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6514; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6515; GFX1064-NEXT:    s_endpgm
6516;
6517; GFX1032-LABEL: umin_i64_constant:
6518; GFX1032:       ; %bb.0: ; %entry
6519; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6520; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6521; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6522; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6523; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6524; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6525; GFX1032-NEXT:  ; %bb.1:
6526; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6527; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6528; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6529; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6530; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6531; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6532; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6533; GFX1032-NEXT:    buffer_gl0_inv
6534; GFX1032-NEXT:  .LBB24_2:
6535; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6536; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6537; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6538; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6539; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6540; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6541; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6542; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6543; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6544; GFX1032-NEXT:    s_mov_b32 s2, -1
6545; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6546; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6547; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6548; GFX1032-NEXT:    s_endpgm
6549;
6550; GFX1164-LABEL: umin_i64_constant:
6551; GFX1164:       ; %bb.0: ; %entry
6552; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6553; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6554; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6555; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6556; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6557; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6558; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6559; GFX1164-NEXT:  ; %bb.1:
6560; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6561; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6562; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6563; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6564; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6565; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6566; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6567; GFX1164-NEXT:    buffer_gl0_inv
6568; GFX1164-NEXT:  .LBB24_2:
6569; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6570; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6571; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6572; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6573; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6574; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6575; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6576; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6577; GFX1164-NEXT:    s_mov_b32 s2, -1
6578; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6579; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6580; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6581; GFX1164-NEXT:    s_endpgm
6582;
6583; GFX1132-LABEL: umin_i64_constant:
6584; GFX1132:       ; %bb.0: ; %entry
6585; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6586; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6587; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6588; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6589; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6590; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6591; GFX1132-NEXT:  ; %bb.1:
6592; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6593; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6594; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6595; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6596; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6597; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6598; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6599; GFX1132-NEXT:    buffer_gl0_inv
6600; GFX1132-NEXT:  .LBB24_2:
6601; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6602; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6603; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6604; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6605; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6606; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6607; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6608; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6609; GFX1132-NEXT:    s_mov_b32 s2, -1
6610; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6611; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6612; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6613; GFX1132-NEXT:    s_endpgm
6614entry:
6615  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
6616  store i64 %old, i64 addrspace(1)* %out
6617  ret void
6618}
6619