1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
10declare i32 @llvm.amdgcn.workitem.id.x()
11
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; Kernel under test: atomically adds the constant 5 to @local_var32 (a global
; in addrspace(3)) with acq_rel ordering, then stores the value produced by
; the atomicrmw to %out.
;
; Shape shown by the assertions below for this function: the lane whose mbcnt
; index compares equal to 0 issues one ds_add_rtn_u32 whose operand is
; (s_bcnt1 of the saved exec mask) * 5. After exec is restored, the returned
; value is read with v_readfirstlane_b32 and combined with
; (mbcnt index * 5) by v_mad_u32_u24 before the buffer store.
;
; NOTE(review): the assertion lines are tool-generated (see the NOTE on the
; first line of the file); regenerate them rather than editing by hand.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
177; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
178; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
179; GFX1164-NEXT:  ; %bb.1:
180; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
181; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
182; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
183; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
184; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
185; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
186; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
187; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX1164-NEXT:    buffer_gl0_inv
189; GFX1164-NEXT:  .LBB0_2:
190; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
191; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
192; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
193; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
194; GFX1164-NEXT:    s_mov_b32 s2, -1
195; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
197; GFX1164-NEXT:    s_endpgm
198;
199; GFX1132-LABEL: add_i32_constant:
200; GFX1132:       ; %bb.0: ; %entry
201; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
202; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
203; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
204; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
205; GFX1132-NEXT:    ; implicit-def: $vgpr1
206; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
207; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
208; GFX1132-NEXT:  ; %bb.1:
209; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
210; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
211; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
212; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
213; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
214; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
215; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
216; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX1132-NEXT:    buffer_gl0_inv
218; GFX1132-NEXT:  .LBB0_2:
219; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
220; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
221; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
222; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
223; GFX1132-NEXT:    s_mov_b32 s2, -1
224; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
226; GFX1132-NEXT:    s_endpgm
227entry:
  ; atomicrmw yields the value the location held before the add; that prior
  ; value (%old) is what gets written to %out.
228  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
229  store i32 %old, i32 addrspace(1)* %out
230  ret void
231}
232
; Kernel under test: atomically adds the kernel argument %additive to
; @local_var32 (a global in addrspace(3)) with acq_rel ordering, then stores
; the value produced by the atomicrmw to %out.
;
; Shape shown by the assertions below for this function: the lane whose mbcnt
; index compares equal to 0 issues one ds_add_rtn_u32 whose operand is the
; loaded additive multiplied (s_mul_i32) by s_bcnt1 of the saved exec mask.
; After exec is restored, the returned value is read with
; v_readfirstlane_b32 and combined with (additive * mbcnt index): as a
; v_mul_lo_u32 followed by an add for the first three run configurations, and
; as a single v_mad_u64_u32 for the gfx1010 / gfx1100 configurations.
;
; NOTE(review): the assertion lines are tool-generated (see the NOTE on the
; first line of the file); regenerate them rather than editing by hand.
233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
234;
235;
236; GFX7LESS-LABEL: add_i32_uniform:
237; GFX7LESS:       ; %bb.0: ; %entry
238; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
239; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
240; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
241; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
242; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
243; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
244; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
245; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
246; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
247; GFX7LESS-NEXT:  ; %bb.1:
248; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
249; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
251; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
252; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
253; GFX7LESS-NEXT:    s_mov_b32 m0, -1
254; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7LESS-NEXT:  .LBB1_2:
258; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
259; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
261; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
262; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
263; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
264; GFX7LESS-NEXT:    s_mov_b32 s6, -1
265; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
266; GFX7LESS-NEXT:    s_endpgm
267;
268; GFX8-LABEL: add_i32_uniform:
269; GFX8:       ; %bb.0: ; %entry
270; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
271; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
272; GFX8-NEXT:    s_mov_b64 s[2:3], exec
273; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
274; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
275; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
276; GFX8-NEXT:    ; implicit-def: $vgpr1
277; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
278; GFX8-NEXT:    s_cbranch_execz .LBB1_2
279; GFX8-NEXT:  ; %bb.1:
280; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
281; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX8-NEXT:    s_mul_i32 s2, s6, s2
283; GFX8-NEXT:    v_mov_b32_e32 v1, 0
284; GFX8-NEXT:    v_mov_b32_e32 v2, s2
285; GFX8-NEXT:    s_mov_b32 m0, -1
286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
288; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX8-NEXT:  .LBB1_2:
290; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
293; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
294; GFX8-NEXT:    s_mov_b32 s7, 0xf000
295; GFX8-NEXT:    s_mov_b32 s6, -1
296; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
297; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; GFX8-NEXT:    s_endpgm
299;
300; GFX9-LABEL: add_i32_uniform:
301; GFX9:       ; %bb.0: ; %entry
302; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
303; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
304; GFX9-NEXT:    s_mov_b64 s[2:3], exec
305; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
306; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
307; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
308; GFX9-NEXT:    ; implicit-def: $vgpr1
309; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
310; GFX9-NEXT:    s_cbranch_execz .LBB1_2
311; GFX9-NEXT:  ; %bb.1:
312; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    s_mul_i32 s2, s6, s2
315; GFX9-NEXT:    v_mov_b32_e32 v1, 0
316; GFX9-NEXT:    v_mov_b32_e32 v2, s2
317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-NEXT:  .LBB1_2:
321; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
324; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
325; GFX9-NEXT:    s_mov_b32 s7, 0xf000
326; GFX9-NEXT:    s_mov_b32 s6, -1
327; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
328; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
329; GFX9-NEXT:    s_endpgm
330;
331; GFX1064-LABEL: add_i32_uniform:
332; GFX1064:       ; %bb.0: ; %entry
333; GFX1064-NEXT:    s_clause 0x1
334; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
335; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
336; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
337; GFX1064-NEXT:    ; implicit-def: $vgpr1
338; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
339; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
340; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
341; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
342; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
343; GFX1064-NEXT:  ; %bb.1:
344; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
345; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
346; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
348; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
349; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
350; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
351; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
352; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX1064-NEXT:    buffer_gl0_inv
354; GFX1064-NEXT:  .LBB1_2:
355; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
356; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
357; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
358; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
359; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
361; GFX1064-NEXT:    s_mov_b32 s6, -1
362; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
363; GFX1064-NEXT:    s_endpgm
364;
365; GFX1032-LABEL: add_i32_uniform:
366; GFX1032:       ; %bb.0: ; %entry
367; GFX1032-NEXT:    s_clause 0x1
368; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
369; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
370; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
371; GFX1032-NEXT:    ; implicit-def: $vgpr1
372; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
373; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
374; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
375; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
376; GFX1032-NEXT:  ; %bb.1:
377; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
378; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
379; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
381; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
382; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
383; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
384; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
385; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX1032-NEXT:    buffer_gl0_inv
387; GFX1032-NEXT:  .LBB1_2:
388; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
389; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
390; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
391; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
392; GFX1032-NEXT:    s_mov_b32 s6, -1
393; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
395; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
396; GFX1032-NEXT:    s_endpgm
397;
398; GFX1164-LABEL: add_i32_uniform:
399; GFX1164:       ; %bb.0: ; %entry
400; GFX1164-NEXT:    s_clause 0x1
401; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
402; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
403; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
404; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
405; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
406; GFX1164-NEXT:    ; implicit-def: $vgpr1
407; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
408; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
409; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
410; GFX1164-NEXT:  ; %bb.1:
411; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
412; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
413; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
415; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
416; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
417; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
418; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
419; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX1164-NEXT:    buffer_gl0_inv
421; GFX1164-NEXT:  .LBB1_2:
422; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
423; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
424; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
425; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v0, s[0:1]
427; GFX1164-NEXT:    s_mov_b32 s6, -1
428; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
429; GFX1164-NEXT:    s_endpgm
430;
431; GFX1132-LABEL: add_i32_uniform:
432; GFX1132:       ; %bb.0: ; %entry
433; GFX1132-NEXT:    s_clause 0x1
434; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
435; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
436; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
437; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
438; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
439; GFX1132-NEXT:    ; implicit-def: $vgpr1
440; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
441; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
442; GFX1132-NEXT:  ; %bb.1:
443; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
444; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
445; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
447; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
448; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
449; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
450; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
451; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX1132-NEXT:    buffer_gl0_inv
453; GFX1132-NEXT:  .LBB1_2:
454; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
455; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
456; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
457; GFX1132-NEXT:    s_mov_b32 s6, -1
458; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], s0, s0, v0, s[2:3]
460; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
461; GFX1132-NEXT:    s_endpgm
462entry:
  ; The addend is the same kernel argument for every lane; atomicrmw yields
  ; the value the location held before the add, and that prior value (%old)
  ; is what gets written to %out.
463  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
464  store i32 %old, i32 addrspace(1)* %out
465  ret void
466}
467
468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
469;
470;
471; GFX7LESS-LABEL: add_i32_varying:
472; GFX7LESS:       ; %bb.0: ; %entry
473; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
474; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
475; GFX7LESS-NEXT:    s_mov_b32 m0, -1
476; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
478; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
480; GFX7LESS-NEXT:    s_mov_b32 s2, -1
481; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
482; GFX7LESS-NEXT:    s_endpgm
483;
484; GFX8-LABEL: add_i32_varying:
485; GFX8:       ; %bb.0: ; %entry
486; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
487; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
488; GFX8-NEXT:    v_mov_b32_e32 v1, 0
489; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
490; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
491; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
492; GFX8-NEXT:    v_mov_b32_e32 v2, v0
493; GFX8-NEXT:    s_not_b64 exec, exec
494; GFX8-NEXT:    v_mov_b32_e32 v2, 0
495; GFX8-NEXT:    s_not_b64 exec, exec
496; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
497; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
498; GFX8-NEXT:    s_nop 1
499; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
500; GFX8-NEXT:    s_nop 1
501; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
502; GFX8-NEXT:    s_nop 1
503; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX8-NEXT:    s_nop 1
505; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
506; GFX8-NEXT:    s_nop 1
507; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
508; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
509; GFX8-NEXT:    s_nop 0
510; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
511; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
512; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
513; GFX8-NEXT:    ; implicit-def: $vgpr0
514; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
515; GFX8-NEXT:    s_cbranch_execz .LBB2_2
516; GFX8-NEXT:  ; %bb.1:
517; GFX8-NEXT:    v_mov_b32_e32 v0, 0
518; GFX8-NEXT:    v_mov_b32_e32 v3, s4
519; GFX8-NEXT:    s_mov_b32 m0, -1
520; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
522; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX8-NEXT:  .LBB2_2:
524; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
525; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
527; GFX8-NEXT:    v_mov_b32_e32 v0, v1
528; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
529; GFX8-NEXT:    s_mov_b32 s3, 0xf000
530; GFX8-NEXT:    s_mov_b32 s2, -1
531; GFX8-NEXT:    s_nop 0
532; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
533; GFX8-NEXT:    s_endpgm
534;
535; GFX9-LABEL: add_i32_varying:
536; GFX9:       ; %bb.0: ; %entry
537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
538; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
539; GFX9-NEXT:    v_mov_b32_e32 v1, 0
540; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
541; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
542; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
543; GFX9-NEXT:    v_mov_b32_e32 v2, v0
544; GFX9-NEXT:    s_not_b64 exec, exec
545; GFX9-NEXT:    v_mov_b32_e32 v2, 0
546; GFX9-NEXT:    s_not_b64 exec, exec
547; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
548; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
549; GFX9-NEXT:    s_nop 1
550; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
551; GFX9-NEXT:    s_nop 1
552; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
553; GFX9-NEXT:    s_nop 1
554; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
555; GFX9-NEXT:    s_nop 1
556; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
557; GFX9-NEXT:    s_nop 1
558; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
559; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
560; GFX9-NEXT:    s_nop 0
561; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
562; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
563; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
564; GFX9-NEXT:    ; implicit-def: $vgpr0
565; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
566; GFX9-NEXT:    s_cbranch_execz .LBB2_2
567; GFX9-NEXT:  ; %bb.1:
568; GFX9-NEXT:    v_mov_b32_e32 v0, 0
569; GFX9-NEXT:    v_mov_b32_e32 v3, s4
570; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX9-NEXT:  .LBB2_2:
574; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
575; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
577; GFX9-NEXT:    v_mov_b32_e32 v0, v1
578; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
579; GFX9-NEXT:    s_mov_b32 s3, 0xf000
580; GFX9-NEXT:    s_mov_b32 s2, -1
581; GFX9-NEXT:    s_nop 0
582; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
583; GFX9-NEXT:    s_endpgm
584;
585; GFX1064-LABEL: add_i32_varying:
586; GFX1064:       ; %bb.0: ; %entry
587; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
588; GFX1064-NEXT:    s_not_b64 exec, exec
589; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
590; GFX1064-NEXT:    s_not_b64 exec, exec
591; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
592; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
593; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
594; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
595; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
596; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
597; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
598; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
599; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
600; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
601; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
602; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
603; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
604; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
605; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
606; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
607; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
608; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
609; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
610; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
611; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
612; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
613; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
614; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
615; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
616; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
617; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
618; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
619; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
620; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
621; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
622; GFX1064-NEXT:    s_mov_b32 s2, -1
623; GFX1064-NEXT:    ; implicit-def: $vgpr0
624; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
625; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
626; GFX1064-NEXT:  ; %bb.1:
627; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
628; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
629; GFX1064-NEXT:    s_mov_b32 s3, s7
630; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
631; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
632; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
633; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX1064-NEXT:    buffer_gl0_inv
635; GFX1064-NEXT:  .LBB2_2:
636; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
637; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
638; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
639; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
640; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
641; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
642; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
644; GFX1064-NEXT:    s_endpgm
645;
646; GFX1032-LABEL: add_i32_varying:
647; GFX1032:       ; %bb.0: ; %entry
648; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
649; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
650; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
651; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
652; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
653; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
654; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
655; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
656; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
657; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
658; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
659; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
660; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
661; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
662; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
663; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
664; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
665; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
666; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
667; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
668; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
669; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
670; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
671; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
672; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
673; GFX1032-NEXT:    s_mov_b32 s2, -1
674; GFX1032-NEXT:    ; implicit-def: $vgpr0
675; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
676; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
677; GFX1032-NEXT:  ; %bb.1:
678; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
679; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
680; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
682; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
683; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX1032-NEXT:    buffer_gl0_inv
685; GFX1032-NEXT:  .LBB2_2:
686; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
687; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
688; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
689; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
690; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
691; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
692; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
694; GFX1032-NEXT:    s_endpgm
695;
696; GFX1164-LABEL: add_i32_varying:
697; GFX1164:       ; %bb.0: ; %entry
698; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
699; GFX1164-NEXT:    s_not_b64 exec, exec
700; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
701; GFX1164-NEXT:    s_not_b64 exec, exec
702; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
703; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
705; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
707; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
708; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
709; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
710; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
711; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
712; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
713; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
714; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
715; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
716; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
717; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
718; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
719; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
720; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
721; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
722; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
723; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
724; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
725; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
726; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
727; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
728; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
729; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
730; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
731; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
732; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
733; GFX1164-NEXT:    s_mov_b32 s2, -1
734; GFX1164-NEXT:    ; implicit-def: $vgpr0
735; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
736; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
737; GFX1164-NEXT:  ; %bb.1:
738; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
739; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
740; GFX1164-NEXT:    s_mov_b32 s3, s7
741; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
742; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
743; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
744; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX1164-NEXT:    buffer_gl0_inv
746; GFX1164-NEXT:  .LBB2_2:
747; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
748; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
749; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
750; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
751; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
752; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
754; GFX1164-NEXT:    s_endpgm
755;
756; GFX1132-LABEL: add_i32_varying:
757; GFX1132:       ; %bb.0: ; %entry
758; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
759; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
760; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
761; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
762; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
763; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
764; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
765; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
766; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
767; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
768; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
769; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
770; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
771; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
772; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
773; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
774; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
775; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
776; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
777; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
778; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
779; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
780; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
781; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
782; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
783; GFX1132-NEXT:    s_mov_b32 s2, -1
784; GFX1132-NEXT:    ; implicit-def: $vgpr0
785; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
786; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
787; GFX1132-NEXT:  ; %bb.1:
788; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
789; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
790; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
791; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
792; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
793; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX1132-NEXT:    buffer_gl0_inv
795; GFX1132-NEXT:  .LBB2_2:
796; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
797; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
798; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
799; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
800; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
801; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
803; GFX1132-NEXT:    s_endpgm
804entry:
805  %lane = call i32 @llvm.amdgcn.workitem.id.x()
806  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
807  store i32 %old, i32 addrspace(1)* %out
808  ret void
809}
810
; Atomic add of a per-lane value whose result is never read.
; Each work-item adds its llvm.amdgcn.workitem.id.x value to the addrspace(3)
; global @local_var32 with acq_rel ordering; %old is left unused.
; What the assertions below record:
;  - the GFX7LESS output is a single ds_add_u32 with no cross-lane
;    instructions in front of it;
;  - the GFX8 and GFX9 outputs first combine the lanes with DPP adds
;    (row_shr / row_bcast), read lane 63, and then only the lane whose
;    v_mbcnt result is 0 issues the ds_add_u32;
;  - the GFX10 and GFX11 outputs combine the lanes with row_xmask DPP adds plus
;    v_permlanex16_b32; the wave64 variants additionally sum lanes 0 and 32
;    with s_add_i32 before the single-lane ds_add_u32.
; Every target uses the non-returning ds_add_u32 here, not ds_add_rtn_u32.
811define amdgpu_kernel void @add_i32_varying_nouse() {
812; GFX7LESS-LABEL: add_i32_varying_nouse:
813; GFX7LESS:       ; %bb.0: ; %entry
814; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
815; GFX7LESS-NEXT:    s_mov_b32 m0, -1
816; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX7LESS-NEXT:    ds_add_u32 v1, v0
818; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX7LESS-NEXT:    s_endpgm
820;
821; GFX8-LABEL: add_i32_varying_nouse:
822; GFX8:       ; %bb.0: ; %entry
823; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
824; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
825; GFX8-NEXT:    v_mov_b32_e32 v1, v0
826; GFX8-NEXT:    s_not_b64 exec, exec
827; GFX8-NEXT:    v_mov_b32_e32 v1, 0
828; GFX8-NEXT:    s_not_b64 exec, exec
829; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
830; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
831; GFX8-NEXT:    s_nop 1
832; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
833; GFX8-NEXT:    s_nop 1
834; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
835; GFX8-NEXT:    s_nop 1
836; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
837; GFX8-NEXT:    s_nop 1
838; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
839; GFX8-NEXT:    s_nop 1
840; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
841; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
842; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
843; GFX8-NEXT:    s_mov_b32 s0, s2
844; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
845; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
846; GFX8-NEXT:    s_cbranch_execz .LBB3_2
847; GFX8-NEXT:  ; %bb.1:
848; GFX8-NEXT:    v_mov_b32_e32 v0, 0
849; GFX8-NEXT:    v_mov_b32_e32 v2, s0
850; GFX8-NEXT:    s_mov_b32 m0, -1
851; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX8-NEXT:    ds_add_u32 v0, v2
853; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX8-NEXT:  .LBB3_2:
855; GFX8-NEXT:    s_endpgm
856;
857; GFX9-LABEL: add_i32_varying_nouse:
858; GFX9:       ; %bb.0: ; %entry
859; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
860; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
861; GFX9-NEXT:    v_mov_b32_e32 v1, v0
862; GFX9-NEXT:    s_not_b64 exec, exec
863; GFX9-NEXT:    v_mov_b32_e32 v1, 0
864; GFX9-NEXT:    s_not_b64 exec, exec
865; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
866; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX9-NEXT:    s_nop 1
868; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX9-NEXT:    s_nop 1
870; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
871; GFX9-NEXT:    s_nop 1
872; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
873; GFX9-NEXT:    s_nop 1
874; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
875; GFX9-NEXT:    s_nop 1
876; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
877; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
878; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
879; GFX9-NEXT:    s_mov_b32 s0, s2
880; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
881; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
882; GFX9-NEXT:    s_cbranch_execz .LBB3_2
883; GFX9-NEXT:  ; %bb.1:
884; GFX9-NEXT:    v_mov_b32_e32 v0, 0
885; GFX9-NEXT:    v_mov_b32_e32 v2, s0
886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX9-NEXT:    ds_add_u32 v0, v2
888; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX9-NEXT:  .LBB3_2:
890; GFX9-NEXT:    s_endpgm
891;
892; GFX1064-LABEL: add_i32_varying_nouse:
893; GFX1064:       ; %bb.0: ; %entry
894; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
895; GFX1064-NEXT:    s_not_b64 exec, exec
896; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
897; GFX1064-NEXT:    s_not_b64 exec, exec
898; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
899; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
900; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
902; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
904; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
905; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
906; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
907; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
908; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
909; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
910; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
911; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
912; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
913; GFX1064-NEXT:    s_add_i32 s0, s2, s3
914; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
915; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
916; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
917; GFX1064-NEXT:  ; %bb.1:
918; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
919; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
920; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
921; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
922; GFX1064-NEXT:    ds_add_u32 v0, v3
923; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX1064-NEXT:    buffer_gl0_inv
925; GFX1064-NEXT:  .LBB3_2:
926; GFX1064-NEXT:    s_endpgm
927;
928; GFX1032-LABEL: add_i32_varying_nouse:
929; GFX1032:       ; %bb.0: ; %entry
930; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
931; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
932; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
933; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
934; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
935; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
936; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
937; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
938; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
939; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
940; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
941; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
942; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
943; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
944; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
945; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
946; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
947; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
948; GFX1032-NEXT:  ; %bb.1:
949; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
950; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
951; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
952; GFX1032-NEXT:    ds_add_u32 v3, v0
953; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX1032-NEXT:    buffer_gl0_inv
955; GFX1032-NEXT:  .LBB3_2:
956; GFX1032-NEXT:    s_endpgm
957;
958; GFX1164-LABEL: add_i32_varying_nouse:
959; GFX1164:       ; %bb.0: ; %entry
960; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
961; GFX1164-NEXT:    s_not_b64 exec, exec
962; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
963; GFX1164-NEXT:    s_not_b64 exec, exec
964; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
965; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
966; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
967; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
968; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
969; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
970; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
971; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
972; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
973; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
974; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
975; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
976; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
977; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
978; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
979; GFX1164-NEXT:    s_add_i32 s0, s2, s3
980; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
981; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
982; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
983; GFX1164-NEXT:  ; %bb.1:
984; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
985; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
986; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
987; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
988; GFX1164-NEXT:    ds_add_u32 v0, v3
989; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX1164-NEXT:    buffer_gl0_inv
991; GFX1164-NEXT:  .LBB3_2:
992; GFX1164-NEXT:    s_endpgm
993;
994; GFX1132-LABEL: add_i32_varying_nouse:
995; GFX1132:       ; %bb.0: ; %entry
996; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
997; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
998; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
999; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
1000; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1001; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1002; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1003; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1004; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1005; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1006; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1007; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1008; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1009; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1010; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1011; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1012; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1013; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1014; GFX1132-NEXT:  ; %bb.1:
1015; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1016; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1017; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1018; GFX1132-NEXT:    ds_add_u32 v3, v0
1019; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX1132-NEXT:    buffer_gl0_inv
1021; GFX1132-NEXT:  .LBB3_2:
1022; GFX1132-NEXT:    s_endpgm
1023entry:
  ; The added value differs per work-item, so it is not a compile-time constant.
1024  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is deliberately never used; only the memory update matters here.
1025  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1026  ret void
1027}
1028
; Atomic add of the constant 5 to the 64-bit addrspace(3) global @local_var64
; with acq_rel ordering; the value returned by the atomicrmw is stored to %out.
;   %out - addrspace(1) pointer that receives the returned i64.
; Pattern recorded by the assertions below on every target:
;  - v_mbcnt_* gives each lane a count, and only the lane whose count is 0
;    enters the branch that performs the atomic;
;  - that lane adds (s_bcnt1 of the saved exec mask) * 5 with one
;    ds_add_rtn_u64;
;  - after the branch, v_readfirstlane_b32 fetches the returned value and
;    5 * (the lane's mbcnt count) is added to it before the store
;    (v_mul_*_u24 plus add/addc on GFX7LESS, v_mad_u64_u32 elsewhere).
1029define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1030;
1031;
1032; GFX7LESS-LABEL: add_i64_constant:
1033; GFX7LESS:       ; %bb.0: ; %entry
1034; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1035; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1036; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1037; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1038; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1039; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1040; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1041; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1042; GFX7LESS-NEXT:  ; %bb.1:
1043; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1044; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1045; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1046; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1047; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1048; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1050; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1051; GFX7LESS-NEXT:  .LBB4_2:
1052; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1053; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1055; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1056; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1057; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1058; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1059; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1060; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1061; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1062; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1063; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1064; GFX7LESS-NEXT:    s_endpgm
1065;
1066; GFX8-LABEL: add_i64_constant:
1067; GFX8:       ; %bb.0: ; %entry
1068; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1069; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1070; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1071; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1072; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1073; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1074; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1075; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1076; GFX8-NEXT:  ; %bb.1:
1077; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1078; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1079; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1080; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1081; GFX8-NEXT:    s_mov_b32 m0, -1
1082; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1084; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1085; GFX8-NEXT:  .LBB4_2:
1086; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1087; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1089; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1090; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1091; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1092; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1093; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1094; GFX8-NEXT:    s_mov_b32 s2, -1
1095; GFX8-NEXT:    s_nop 2
1096; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1097; GFX8-NEXT:    s_endpgm
1098;
1099; GFX9-LABEL: add_i64_constant:
1100; GFX9:       ; %bb.0: ; %entry
1101; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1102; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1103; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1104; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1105; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1106; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1107; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1108; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1109; GFX9-NEXT:  ; %bb.1:
1110; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1111; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1112; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1113; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1114; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1116; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1117; GFX9-NEXT:  .LBB4_2:
1118; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1119; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1120; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1121; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1122; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1123; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1124; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1125; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1126; GFX9-NEXT:    s_mov_b32 s2, -1
1127; GFX9-NEXT:    s_nop 2
1128; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1129; GFX9-NEXT:    s_endpgm
1130;
1131; GFX1064-LABEL: add_i64_constant:
1132; GFX1064:       ; %bb.0: ; %entry
1133; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1134; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1135; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1136; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1137; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1138; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1139; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1140; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1141; GFX1064-NEXT:  ; %bb.1:
1142; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1143; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1144; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1145; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1146; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1147; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1148; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1149; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX1064-NEXT:    buffer_gl0_inv
1151; GFX1064-NEXT:  .LBB4_2:
1152; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1153; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1154; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1155; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1156; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1157; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1158; GFX1064-NEXT:    s_mov_b32 s2, -1
1159; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1161; GFX1064-NEXT:    s_endpgm
1162;
1163; GFX1032-LABEL: add_i64_constant:
1164; GFX1032:       ; %bb.0: ; %entry
1165; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1166; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1167; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1168; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1169; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1170; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1171; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1172; GFX1032-NEXT:  ; %bb.1:
1173; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1174; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1175; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1176; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1177; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1178; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1179; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1180; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1181; GFX1032-NEXT:    buffer_gl0_inv
1182; GFX1032-NEXT:  .LBB4_2:
1183; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1184; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1185; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1186; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1187; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1188; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1189; GFX1032-NEXT:    s_mov_b32 s2, -1
1190; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1191; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1192; GFX1032-NEXT:    s_endpgm
1193;
1194; GFX1164-LABEL: add_i64_constant:
1195; GFX1164:       ; %bb.0: ; %entry
1196; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1197; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1198; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1199; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1200; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1201; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1202; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1203; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1204; GFX1164-NEXT:  ; %bb.1:
1205; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1206; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1207; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1208; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1209; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1210; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1211; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1212; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1213; GFX1164-NEXT:    buffer_gl0_inv
1214; GFX1164-NEXT:  .LBB4_2:
1215; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1216; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1217; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1218; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1219; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1220; GFX1164-NEXT:    s_mov_b32 s2, -1
1221; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1223; GFX1164-NEXT:    s_endpgm
1224;
1225; GFX1132-LABEL: add_i64_constant:
1226; GFX1132:       ; %bb.0: ; %entry
1227; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1228; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1229; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1230; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1231; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1232; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1233; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1234; GFX1132-NEXT:  ; %bb.1:
1235; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1236; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1237; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1238; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1239; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1240; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1241; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1242; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1243; GFX1132-NEXT:    buffer_gl0_inv
1244; GFX1132-NEXT:  .LBB4_2:
1245; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1246; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1247; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1248; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1249; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1250; GFX1132-NEXT:    s_mov_b32 s2, -1
1251; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1252; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1253; GFX1132-NEXT:    s_endpgm
1254entry:
  ; The operand is the literal 5, identical for every work-item.
1255  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Storing %old keeps the atomic's return value live.
1256  store i64 %old, i64 addrspace(1)* %out
1257  ret void
1258}
1259
1260define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1261;
1262;
1263; GFX7LESS-LABEL: add_i64_uniform:
1264; GFX7LESS:       ; %bb.0: ; %entry
1265; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1266; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1267; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1268; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1269; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1270; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1271; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1272; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1273; GFX7LESS-NEXT:  ; %bb.1:
1274; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1275; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1276; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1278; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1279; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1280; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1281; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1282; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1283; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1284; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1286; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX7LESS-NEXT:  .LBB5_2:
1288; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1289; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1290; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1291; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1292; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1293; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1294; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1295; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1296; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1297; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1298; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1299; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1300; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1301; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1302; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1303; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1304; GFX7LESS-NEXT:    s_endpgm
1305;
1306; GFX8-LABEL: add_i64_uniform:
1307; GFX8:       ; %bb.0: ; %entry
1308; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1309; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1310; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1311; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1312; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1313; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1314; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1315; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1316; GFX8-NEXT:  ; %bb.1:
1317; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1318; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1319; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1320; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1321; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1322; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1323; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1324; GFX8-NEXT:    s_mov_b32 m0, -1
1325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1327; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1328; GFX8-NEXT:  .LBB5_2:
1329; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1332; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1333; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1334; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1335; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1336; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1337; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1338; GFX8-NEXT:    s_mov_b32 s6, -1
1339; GFX8-NEXT:    s_mov_b32 s4, s0
1340; GFX8-NEXT:    s_mov_b32 s5, s1
1341; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1342; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1343; GFX8-NEXT:    s_endpgm
1344;
1345; GFX9-LABEL: add_i64_uniform:
1346; GFX9:       ; %bb.0: ; %entry
1347; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1348; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1349; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1350; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1351; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1352; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1353; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1354; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1355; GFX9-NEXT:  ; %bb.1:
1356; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1357; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1359; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1360; GFX9-NEXT:    s_add_i32 s8, s8, s7
1361; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1362; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1363; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1364; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1365; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1366; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX9-NEXT:  .LBB5_2:
1369; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1370; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1371; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1372; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1373; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1374; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1375; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1376; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1377; GFX9-NEXT:    s_mov_b32 s6, -1
1378; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1379; GFX9-NEXT:    s_mov_b32 s4, s0
1380; GFX9-NEXT:    s_mov_b32 s5, s1
1381; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1382; GFX9-NEXT:    s_endpgm
1383;
1384; GFX1064-LABEL: add_i64_uniform:
1385; GFX1064:       ; %bb.0: ; %entry
1386; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1387; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1388; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1389; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1390; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1391; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1392; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1393; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1394; GFX1064-NEXT:  ; %bb.1:
1395; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1396; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1397; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1398; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1399; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1400; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1401; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1402; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1403; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1404; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1405; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1406; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1407; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1408; GFX1064-NEXT:    buffer_gl0_inv
1409; GFX1064-NEXT:  .LBB5_2:
1410; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1411; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1412; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1413; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1414; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1415; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1416; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1417; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1418; GFX1064-NEXT:    s_mov_b32 s2, -1
1419; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1420; GFX1064-NEXT:    s_endpgm
1421;
1422; GFX1032-LABEL: add_i64_uniform:
1423; GFX1032:       ; %bb.0: ; %entry
1424; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1425; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1426; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1427; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1428; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1429; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1430; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1431; GFX1032-NEXT:  ; %bb.1:
1432; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1433; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1434; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1435; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1436; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1437; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1438; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1439; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1440; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1441; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1442; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1443; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1444; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX1032-NEXT:    buffer_gl0_inv
1446; GFX1032-NEXT:  .LBB5_2:
1447; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1448; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1449; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1450; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1451; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1452; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1453; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1454; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1455; GFX1032-NEXT:    s_mov_b32 s2, -1
1456; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1457; GFX1032-NEXT:    s_endpgm
1458;
1459; GFX1164-LABEL: add_i64_uniform:
1460; GFX1164:       ; %bb.0: ; %entry
1461; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1462; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1463; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1464; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1465; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1466; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1467; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1468; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1469; GFX1164-NEXT:  ; %bb.1:
1470; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1471; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1472; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1474; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1475; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1476; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1477; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1478; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1479; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1480; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1481; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1482; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1483; GFX1164-NEXT:    buffer_gl0_inv
1484; GFX1164-NEXT:  .LBB5_2:
1485; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1486; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1487; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1488; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1490; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], s[2:3], s3, v2, v[1:2]
1491; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1492; GFX1164-NEXT:    s_mov_b32 s2, -1
1493; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1494; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1495; GFX1164-NEXT:    s_endpgm
1496;
1497; GFX1132-LABEL: add_i64_uniform:
1498; GFX1132:       ; %bb.0: ; %entry
1499; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1500; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1501; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1502; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1503; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1504; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1505; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1506; GFX1132-NEXT:  ; %bb.1:
1507; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1508; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1509; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1510; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1511; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1512; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1513; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1514; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
1515; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
1516; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1517; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1518; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1519; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1520; GFX1132-NEXT:    buffer_gl0_inv
1521; GFX1132-NEXT:  .LBB5_2:
1522; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1523; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1524; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1525; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1526; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1527; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], s2, s3, v2, v[1:2]
1528; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1529; GFX1132-NEXT:    s_mov_b32 s2, -1
1530; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1531; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1532; GFX1132-NEXT:    s_endpgm
1533entry:
1534  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1535  store i64 %old, i64 addrspace(1)* %out
1536  ret void
1537}
1538
; add_i64_varying: 64-bit atomic add on @local_var64 whose operand is the
; workitem id x zero-extended to i64, i.e. a per-workitem value.
; The expected output below contains a single ds_add_rtn_u64 on every target and
; none of the v_mbcnt / exec-masking / v_readfirstlane sequence that the checks
; for add_i64_uniform contain.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
1539define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1540;
1541;
1542; GFX7LESS-LABEL: add_i64_varying:
1543; GFX7LESS:       ; %bb.0: ; %entry
1544; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1545; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1546; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1547; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1549; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1550; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1551; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1552; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1553; GFX7LESS-NEXT:    s_endpgm
1554;
1555; GFX8-LABEL: add_i64_varying:
1556; GFX8:       ; %bb.0: ; %entry
1557; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1558; GFX8-NEXT:    s_mov_b32 m0, -1
1559; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1560; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1562; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1563; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1564; GFX8-NEXT:    s_mov_b32 s2, -1
1565; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1566; GFX8-NEXT:    s_endpgm
1567;
1568; GFX9-LABEL: add_i64_varying:
1569; GFX9:       ; %bb.0: ; %entry
1570; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1571; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1574; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1576; GFX9-NEXT:    s_mov_b32 s2, -1
1577; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1578; GFX9-NEXT:    s_endpgm
1579;
1580; GFX10-LABEL: add_i64_varying:
1581; GFX10:       ; %bb.0: ; %entry
1582; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1583; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1584; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1585; GFX10-NEXT:    s_mov_b32 s2, -1
1586; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1587; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1588; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1589; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1590; GFX10-NEXT:    buffer_gl0_inv
1591; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1592; GFX10-NEXT:    s_endpgm
1593;
1594; GFX11-LABEL: add_i64_varying:
1595; GFX11:       ; %bb.0: ; %entry
1596; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1597; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1598; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1599; GFX11-NEXT:    s_mov_b32 s2, -1
1600; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1601; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1602; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1603; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1604; GFX11-NEXT:    buffer_gl0_inv
1605; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1606; GFX11-NEXT:    s_endpgm
1607entry:
  ; Build the per-workitem operand: workitem id x, widened to i64.
1608  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1609  %zext = zext i32 %lane to i64
  ; acq_rel atomic add on the addrspace(3) global; %old is the value the
  ; atomicrmw returns.
1610  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Store the returned value to %out so the returning (rtn) form is required.
1611  store i64 %old, i64 addrspace(1)* %out
1612  ret void
1613}
1614
; sub_i32_constant: 32-bit atomic sub of the constant 5 on @local_var32.
; Expected output on every target, as shown by the checks below:
;   * v_mbcnt_* counts the active lanes below each lane, and only the lane whose
;     count is 0 executes the DS atomic (v_cmp_eq + s_and_saveexec, or
;     v_cmpx_eq on GFX11);
;   * that lane issues one ds_sub_rtn_u32 with operand s_bcnt1(exec) * 5;
;   * after exec is restored, each lane computes
;     v_readfirstlane(returned value) - 5 * mbcnt and stores it to %out.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
1615define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1616;
1617;
1618; GFX7LESS-LABEL: sub_i32_constant:
1619; GFX7LESS:       ; %bb.0: ; %entry
1620; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1621; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1622; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1623; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1624; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1625; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1626; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1627; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1628; GFX7LESS-NEXT:  ; %bb.1:
1629; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1630; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1631; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1632; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1633; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1634; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1635; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1636; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1637; GFX7LESS-NEXT:  .LBB7_2:
1638; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1639; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1640; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1641; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1642; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1643; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1644; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1645; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1646; GFX7LESS-NEXT:    s_endpgm
1647;
1648; GFX8-LABEL: sub_i32_constant:
1649; GFX8:       ; %bb.0: ; %entry
1650; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1651; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1652; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1653; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1654; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1655; GFX8-NEXT:    ; implicit-def: $vgpr1
1656; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1657; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1658; GFX8-NEXT:  ; %bb.1:
1659; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1660; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1661; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1662; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1663; GFX8-NEXT:    s_mov_b32 m0, -1
1664; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1666; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1667; GFX8-NEXT:  .LBB7_2:
1668; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1669; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1671; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1672; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1673; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1674; GFX8-NEXT:    s_mov_b32 s2, -1
1675; GFX8-NEXT:    s_nop 0
1676; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1677; GFX8-NEXT:    s_endpgm
1678;
1679; GFX9-LABEL: sub_i32_constant:
1680; GFX9:       ; %bb.0: ; %entry
1681; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1682; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1683; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1684; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1685; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1686; GFX9-NEXT:    ; implicit-def: $vgpr1
1687; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1688; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1689; GFX9-NEXT:  ; %bb.1:
1690; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1691; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1692; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1693; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX9-NEXT:  .LBB7_2:
1698; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1699; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1700; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1701; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1702; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1703; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1704; GFX9-NEXT:    s_mov_b32 s2, -1
1705; GFX9-NEXT:    s_nop 0
1706; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1707; GFX9-NEXT:    s_endpgm
1708;
1709; GFX1064-LABEL: sub_i32_constant:
1710; GFX1064:       ; %bb.0: ; %entry
1711; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1712; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1713; GFX1064-NEXT:    ; implicit-def: $vgpr1
1714; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1715; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1716; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1717; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1718; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1719; GFX1064-NEXT:  ; %bb.1:
1720; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1721; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1722; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1723; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1724; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1725; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1726; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1727; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX1064-NEXT:    buffer_gl0_inv
1729; GFX1064-NEXT:  .LBB7_2:
1730; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1731; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1732; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1733; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1734; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1735; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1736; GFX1064-NEXT:    s_mov_b32 s2, -1
1737; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1739; GFX1064-NEXT:    s_endpgm
1740;
1741; GFX1032-LABEL: sub_i32_constant:
1742; GFX1032:       ; %bb.0: ; %entry
1743; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1744; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1745; GFX1032-NEXT:    ; implicit-def: $vgpr1
1746; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1747; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1748; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1749; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1750; GFX1032-NEXT:  ; %bb.1:
1751; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1752; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1753; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1754; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1755; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1756; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1757; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1758; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX1032-NEXT:    buffer_gl0_inv
1760; GFX1032-NEXT:  .LBB7_2:
1761; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1762; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1763; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1764; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1765; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1766; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1767; GFX1032-NEXT:    s_mov_b32 s2, -1
1768; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1769; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1770; GFX1032-NEXT:    s_endpgm
1771;
1772; GFX1164-LABEL: sub_i32_constant:
1773; GFX1164:       ; %bb.0: ; %entry
1774; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1775; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1776; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1777; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1778; GFX1164-NEXT:    ; implicit-def: $vgpr1
1779; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1780; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1781; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1782; GFX1164-NEXT:  ; %bb.1:
1783; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1784; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1785; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1786; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1787; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1788; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1789; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1790; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX1164-NEXT:    buffer_gl0_inv
1792; GFX1164-NEXT:  .LBB7_2:
1793; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1794; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1795; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1796; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1797; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1798; GFX1164-NEXT:    s_mov_b32 s2, -1
1799; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1800; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1801; GFX1164-NEXT:    s_endpgm
1802;
1803; GFX1132-LABEL: sub_i32_constant:
1804; GFX1132:       ; %bb.0: ; %entry
1805; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1806; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1807; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1808; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1809; GFX1132-NEXT:    ; implicit-def: $vgpr1
1810; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1811; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1812; GFX1132-NEXT:  ; %bb.1:
1813; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1814; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1815; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1816; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
1817; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1818; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1819; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1820; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX1132-NEXT:    buffer_gl0_inv
1822; GFX1132-NEXT:  .LBB7_2:
1823; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1824; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1825; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1826; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1827; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1828; GFX1132-NEXT:    s_mov_b32 s2, -1
1829; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1830; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1831; GFX1132-NEXT:    s_endpgm
1832entry:
  ; acq_rel atomic sub of the constant 5 on the addrspace(3) global; %old is
  ; the value the atomicrmw returns.
1833  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Store the returned value to %out so the returning (rtn) form is required.
1834  store i32 %old, i32 addrspace(1)* %out
1835  ret void
1836}
1837
; sub_i32_uniform: 32-bit atomic sub on @local_var32 whose operand is the
; kernel argument %subitive.
; Expected output on every target, as shown by the checks below:
;   * v_mbcnt_* counts the active lanes below each lane, and only the lane whose
;     count is 0 executes the DS atomic (v_cmp_eq + s_and_saveexec, or
;     v_cmpx_eq on GFX11);
;   * that lane issues one ds_sub_rtn_u32 with operand
;     (loaded %subitive) * s_bcnt1(exec), computed with s_mul_i32;
;   * after exec is restored, each lane computes
;     v_readfirstlane(returned value) - (loaded %subitive) * mbcnt using
;     v_mul_lo_u32 and stores it to %out.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
1838define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1839;
1840;
1841; GFX7LESS-LABEL: sub_i32_uniform:
1842; GFX7LESS:       ; %bb.0: ; %entry
1843; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1844; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1845; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1846; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1847; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1848; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1849; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1850; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1851; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1852; GFX7LESS-NEXT:  ; %bb.1:
1853; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1854; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1856; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1857; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1858; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1859; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1860; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1861; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX7LESS-NEXT:  .LBB8_2:
1863; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1864; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1866; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1867; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1868; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1869; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1870; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1871; GFX7LESS-NEXT:    s_endpgm
1872;
1873; GFX8-LABEL: sub_i32_uniform:
1874; GFX8:       ; %bb.0: ; %entry
1875; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1876; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1877; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1878; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1879; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1880; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1881; GFX8-NEXT:    ; implicit-def: $vgpr1
1882; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1883; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1884; GFX8-NEXT:  ; %bb.1:
1885; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1886; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1888; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1889; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1890; GFX8-NEXT:    s_mov_b32 m0, -1
1891; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1893; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1894; GFX8-NEXT:  .LBB8_2:
1895; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1896; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1898; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1899; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1900; GFX8-NEXT:    s_mov_b32 s6, -1
1901; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1902; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1903; GFX8-NEXT:    s_endpgm
1904;
1905; GFX9-LABEL: sub_i32_uniform:
1906; GFX9:       ; %bb.0: ; %entry
1907; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1908; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1909; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1910; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1911; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1912; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1913; GFX9-NEXT:    ; implicit-def: $vgpr1
1914; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1915; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1916; GFX9-NEXT:  ; %bb.1:
1917; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1918; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1920; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1921; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1923; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1924; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX9-NEXT:  .LBB8_2:
1926; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1928; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1929; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1930; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1931; GFX9-NEXT:    s_mov_b32 s6, -1
1932; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1933; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1934; GFX9-NEXT:    s_endpgm
1935;
1936; GFX1064-LABEL: sub_i32_uniform:
1937; GFX1064:       ; %bb.0: ; %entry
1938; GFX1064-NEXT:    s_clause 0x1
1939; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1940; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
1941; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1942; GFX1064-NEXT:    ; implicit-def: $vgpr1
1943; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1944; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1945; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1946; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1947; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
1948; GFX1064-NEXT:  ; %bb.1:
1949; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1950; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1951; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1952; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
1953; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1954; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1955; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1956; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1957; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1958; GFX1064-NEXT:    buffer_gl0_inv
1959; GFX1064-NEXT:  .LBB8_2:
1960; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1961; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1962; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1963; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
1964; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1965; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1966; GFX1064-NEXT:    s_mov_b32 s6, -1
1967; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1968; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1969; GFX1064-NEXT:    s_endpgm
1970;
1971; GFX1032-LABEL: sub_i32_uniform:
1972; GFX1032:       ; %bb.0: ; %entry
1973; GFX1032-NEXT:    s_clause 0x1
1974; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1975; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1976; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1977; GFX1032-NEXT:    ; implicit-def: $vgpr1
1978; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1979; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1980; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1981; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
1982; GFX1032-NEXT:  ; %bb.1:
1983; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1984; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1985; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1986; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1987; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1988; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1989; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1990; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1991; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1992; GFX1032-NEXT:    buffer_gl0_inv
1993; GFX1032-NEXT:  .LBB8_2:
1994; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1995; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1996; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1997; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1998; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1999; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2000; GFX1032-NEXT:    s_mov_b32 s6, -1
2001; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2002; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2003; GFX1032-NEXT:    s_endpgm
2004;
2005; GFX1164-LABEL: sub_i32_uniform:
2006; GFX1164:       ; %bb.0: ; %entry
2007; GFX1164-NEXT:    s_clause 0x1
2008; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2009; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2010; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2011; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2012; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2013; GFX1164-NEXT:    ; implicit-def: $vgpr1
2014; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2015; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2016; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2017; GFX1164-NEXT:  ; %bb.1:
2018; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2019; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2020; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2021; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2022; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2023; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2024; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2025; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2026; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2027; GFX1164-NEXT:    buffer_gl0_inv
2028; GFX1164-NEXT:  .LBB8_2:
2029; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2030; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2031; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2032; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2033; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2034; GFX1164-NEXT:    s_mov_b32 s6, -1
2035; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2036; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2037; GFX1164-NEXT:    s_endpgm
2038;
2039; GFX1132-LABEL: sub_i32_uniform:
2040; GFX1132:       ; %bb.0: ; %entry
2041; GFX1132-NEXT:    s_clause 0x1
2042; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2043; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2044; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2045; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2046; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2047; GFX1132-NEXT:    ; implicit-def: $vgpr1
2048; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2049; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2050; GFX1132-NEXT:  ; %bb.1:
2051; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2052; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2053; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2054; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2055; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
2056; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2057; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2058; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2059; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2060; GFX1132-NEXT:    buffer_gl0_inv
2061; GFX1132-NEXT:  .LBB8_2:
2062; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2063; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2065; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2066; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2067; GFX1132-NEXT:    s_mov_b32 s6, -1
2068; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2069; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2070; GFX1132-NEXT:    s_endpgm
2071entry:
  ; acq_rel atomic sub of the kernel argument %subitive on the addrspace(3)
  ; global; %old is the value the atomicrmw returns.
2072  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  ; Store the returned value to %out so the returning (rtn) form is required.
2073  store i32 %old, i32 addrspace(1)* %out
2074  ret void
2075}
2076
2077define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2078;
2079;
2080; GFX7LESS-LABEL: sub_i32_varying:
2081; GFX7LESS:       ; %bb.0: ; %entry
2082; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2083; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2084; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2085; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2086; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2087; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2089; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2090; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2091; GFX7LESS-NEXT:    s_endpgm
2092;
2093; GFX8-LABEL: sub_i32_varying:
2094; GFX8:       ; %bb.0: ; %entry
2095; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2096; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2097; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2098; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2099; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2100; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2101; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2102; GFX8-NEXT:    s_not_b64 exec, exec
2103; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2104; GFX8-NEXT:    s_not_b64 exec, exec
2105; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2106; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2107; GFX8-NEXT:    s_nop 1
2108; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2109; GFX8-NEXT:    s_nop 1
2110; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2111; GFX8-NEXT:    s_nop 1
2112; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2113; GFX8-NEXT:    s_nop 1
2114; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2115; GFX8-NEXT:    s_nop 1
2116; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2117; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2118; GFX8-NEXT:    s_nop 0
2119; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2120; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2121; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2122; GFX8-NEXT:    ; implicit-def: $vgpr0
2123; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2124; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2125; GFX8-NEXT:  ; %bb.1:
2126; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2127; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2128; GFX8-NEXT:    s_mov_b32 m0, -1
2129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2130; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2131; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX8-NEXT:  .LBB9_2:
2133; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2134; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2135; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2136; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2137; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2138; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2139; GFX8-NEXT:    s_mov_b32 s2, -1
2140; GFX8-NEXT:    s_nop 0
2141; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2142; GFX8-NEXT:    s_endpgm
2143;
2144; GFX9-LABEL: sub_i32_varying:
2145; GFX9:       ; %bb.0: ; %entry
2146; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2147; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2148; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2149; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2150; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2151; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2152; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2153; GFX9-NEXT:    s_not_b64 exec, exec
2154; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2155; GFX9-NEXT:    s_not_b64 exec, exec
2156; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2157; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2158; GFX9-NEXT:    s_nop 1
2159; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2160; GFX9-NEXT:    s_nop 1
2161; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2162; GFX9-NEXT:    s_nop 1
2163; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2164; GFX9-NEXT:    s_nop 1
2165; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2166; GFX9-NEXT:    s_nop 1
2167; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2168; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2169; GFX9-NEXT:    s_nop 0
2170; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2171; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2172; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2173; GFX9-NEXT:    ; implicit-def: $vgpr0
2174; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2175; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2176; GFX9-NEXT:  ; %bb.1:
2177; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2178; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2179; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2180; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2181; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2182; GFX9-NEXT:  .LBB9_2:
2183; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2184; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2185; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2186; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2187; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2188; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2189; GFX9-NEXT:    s_mov_b32 s2, -1
2190; GFX9-NEXT:    s_nop 0
2191; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2192; GFX9-NEXT:    s_endpgm
2193;
2194; GFX1064-LABEL: sub_i32_varying:
2195; GFX1064:       ; %bb.0: ; %entry
2196; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2197; GFX1064-NEXT:    s_not_b64 exec, exec
2198; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2199; GFX1064-NEXT:    s_not_b64 exec, exec
2200; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2201; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2202; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2203; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2204; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2205; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2206; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2207; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2208; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2209; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2210; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2211; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2212; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2213; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2214; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2215; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2216; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2217; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2218; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2219; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2220; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2221; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2222; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2223; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2224; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2225; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2226; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2227; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2228; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2229; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2230; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2231; GFX1064-NEXT:    s_mov_b32 s2, -1
2232; GFX1064-NEXT:    ; implicit-def: $vgpr0
2233; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2234; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2235; GFX1064-NEXT:  ; %bb.1:
2236; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2237; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2238; GFX1064-NEXT:    s_mov_b32 s3, s7
2239; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2240; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2241; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2242; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX1064-NEXT:    buffer_gl0_inv
2244; GFX1064-NEXT:  .LBB9_2:
2245; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2246; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2247; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2248; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2249; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2250; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2251; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2252; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2253; GFX1064-NEXT:    s_endpgm
2254;
2255; GFX1032-LABEL: sub_i32_varying:
2256; GFX1032:       ; %bb.0: ; %entry
2257; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2258; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2259; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2260; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2261; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2262; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2263; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2264; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2265; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2266; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2267; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2268; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2269; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2270; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2271; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2272; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2273; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2274; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2275; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2276; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2277; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2278; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2279; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2280; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2281; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2282; GFX1032-NEXT:    s_mov_b32 s2, -1
2283; GFX1032-NEXT:    ; implicit-def: $vgpr0
2284; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2285; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2286; GFX1032-NEXT:  ; %bb.1:
2287; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2288; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2289; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2290; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2291; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2292; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2293; GFX1032-NEXT:    buffer_gl0_inv
2294; GFX1032-NEXT:  .LBB9_2:
2295; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2296; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2297; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2298; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2299; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2300; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2301; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2302; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2303; GFX1032-NEXT:    s_endpgm
2304;
2305; GFX1164-LABEL: sub_i32_varying:
2306; GFX1164:       ; %bb.0: ; %entry
2307; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2308; GFX1164-NEXT:    s_not_b64 exec, exec
2309; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2310; GFX1164-NEXT:    s_not_b64 exec, exec
2311; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2312; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2313; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2314; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2315; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2316; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2317; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2318; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2319; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2320; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2321; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2322; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2323; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2324; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2325; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2326; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2327; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2328; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2329; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2330; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2331; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2332; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2333; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2334; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2335; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2336; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2337; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2338; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2339; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2340; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2341; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2342; GFX1164-NEXT:    s_mov_b32 s2, -1
2343; GFX1164-NEXT:    ; implicit-def: $vgpr0
2344; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2345; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2346; GFX1164-NEXT:  ; %bb.1:
2347; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2348; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2349; GFX1164-NEXT:    s_mov_b32 s3, s7
2350; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2351; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2352; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2353; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2354; GFX1164-NEXT:    buffer_gl0_inv
2355; GFX1164-NEXT:  .LBB9_2:
2356; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2357; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2358; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2359; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2360; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2361; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2362; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2363; GFX1164-NEXT:    s_endpgm
2364;
2365; GFX1132-LABEL: sub_i32_varying:
2366; GFX1132:       ; %bb.0: ; %entry
2367; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2368; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2369; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2370; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2371; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2372; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2373; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2374; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2375; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2376; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2377; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2378; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2379; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2380; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2381; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2382; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2383; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2384; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2385; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2386; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2387; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2388; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2389; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2390; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2391; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2392; GFX1132-NEXT:    s_mov_b32 s2, -1
2393; GFX1132-NEXT:    ; implicit-def: $vgpr0
2394; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2395; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2396; GFX1132-NEXT:  ; %bb.1:
2397; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2398; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2399; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2400; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2401; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2402; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2403; GFX1132-NEXT:    buffer_gl0_inv
2404; GFX1132-NEXT:  .LBB9_2:
2405; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2406; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2407; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2408; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2409; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2410; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2411; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2412; GFX1132-NEXT:    s_endpgm
2413entry:
2414  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2415  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2416  store i32 %old, i32 addrspace(1)* %out
2417  ret void
2418}
2419
; Variant of the varying-operand sub test in which the value returned by the
; atomicrmw has no users. Every lane subtracts its workitem id from
; @local_var32 with acq_rel ordering. All expectations below use the
; non-returning ds_sub_u32 form: the GFX7LESS output issues it directly with
; the per-lane value, while the remaining configurations first combine the
; lane values with a DPP-based add reduction and then issue a single
; ds_sub_u32 guarded by a compare of the mbcnt result against zero.
define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX7LESS-LABEL: sub_i32_varying_nouse:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: sub_i32_varying_nouse:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX8-NEXT:    v_mov_b32_e32 v1, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
; GFX8-NEXT:    s_mov_b32 s0, s2
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB10_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_sub_u32 v0, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB10_2:
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sub_i32_varying_nouse:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-NEXT:    s_mov_b32 s0, s2
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB10_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_sub_u32 v0, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB10_2:
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: sub_i32_varying_nouse:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_add_i32 s0, s2, s3
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_sub_u32 v0, v3
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB10_2:
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i32_varying_nouse:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_sub_u32 v3, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB10_2:
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: sub_i32_varying_nouse:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_add_i32 s0, s2, s3
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_sub_u32 v0, v3
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB10_2:
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: sub_i32_varying_nouse:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_sub_u32 v3, v0
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB10_2:
; GFX1132-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old has no users in this function (hence the _nouse suffix); only the
  ; memory side effect of the atomic remains.
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ret void
}
2637
2638define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2639;
2640;
2641; GFX7LESS-LABEL: sub_i64_constant:
2642; GFX7LESS:       ; %bb.0: ; %entry
2643; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2644; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2645; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2646; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
2647; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2648; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2649; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2650; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
2651; GFX7LESS-NEXT:  ; %bb.1:
2652; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2653; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
2654; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2655; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
2656; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2657; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2659; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2660; GFX7LESS-NEXT:  .LBB11_2:
2661; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2662; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2663; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2664; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
2665; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2666; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2667; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2668; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2669; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2670; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2671; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2672; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2673; GFX7LESS-NEXT:    s_endpgm
2674;
2675; GFX8-LABEL: sub_i64_constant:
2676; GFX8:       ; %bb.0: ; %entry
2677; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2678; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2679; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2680; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2681; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2682; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2683; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2684; GFX8-NEXT:    s_cbranch_execz .LBB11_2
2685; GFX8-NEXT:  ; %bb.1:
2686; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2687; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2688; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2689; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2690; GFX8-NEXT:    s_mov_b32 m0, -1
2691; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2692; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2693; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2694; GFX8-NEXT:  .LBB11_2:
2695; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2696; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2698; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2699; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2700; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2701; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2702; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2703; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2704; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2705; GFX8-NEXT:    s_mov_b32 s2, -1
2706; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2707; GFX8-NEXT:    s_endpgm
2708;
2709; GFX9-LABEL: sub_i64_constant:
2710; GFX9:       ; %bb.0: ; %entry
2711; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2712; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2713; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2714; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2715; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2716; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2717; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2718; GFX9-NEXT:    s_cbranch_execz .LBB11_2
2719; GFX9-NEXT:  ; %bb.1:
2720; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2721; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2722; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2723; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2724; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2725; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2726; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2727; GFX9-NEXT:  .LBB11_2:
2728; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2729; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2731; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2732; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2733; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2734; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2735; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2736; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2737; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2738; GFX9-NEXT:    s_mov_b32 s2, -1
2739; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2740; GFX9-NEXT:    s_endpgm
2741;
2742; GFX1064-LABEL: sub_i64_constant:
2743; GFX1064:       ; %bb.0: ; %entry
2744; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2745; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2746; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2747; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2748; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2749; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2750; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2751; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
2752; GFX1064-NEXT:  ; %bb.1:
2753; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2754; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2755; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2756; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2757; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2758; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2759; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2760; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2761; GFX1064-NEXT:    buffer_gl0_inv
2762; GFX1064-NEXT:  .LBB11_2:
2763; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2764; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2765; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2766; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2767; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2768; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2769; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2770; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2771; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2772; GFX1064-NEXT:    s_mov_b32 s2, -1
2773; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2774; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2775; GFX1064-NEXT:    s_endpgm
2776;
2777; GFX1032-LABEL: sub_i64_constant:
2778; GFX1032:       ; %bb.0: ; %entry
2779; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2780; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2781; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2782; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2783; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2784; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2785; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2786; GFX1032-NEXT:  ; %bb.1:
2787; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2788; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2789; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2790; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2791; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2792; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2793; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2794; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2795; GFX1032-NEXT:    buffer_gl0_inv
2796; GFX1032-NEXT:  .LBB11_2:
2797; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2798; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2799; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2800; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2801; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2802; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2803; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2804; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2805; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2806; GFX1032-NEXT:    s_mov_b32 s2, -1
2807; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2808; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2809; GFX1032-NEXT:    s_endpgm
2810;
2811; GFX1164-LABEL: sub_i64_constant:
2812; GFX1164:       ; %bb.0: ; %entry
2813; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2814; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2815; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2816; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2817; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2818; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2819; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2820; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
2821; GFX1164-NEXT:  ; %bb.1:
2822; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2823; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2824; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
2825; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2826; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2827; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2828; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2829; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2830; GFX1164-NEXT:    buffer_gl0_inv
2831; GFX1164-NEXT:  .LBB11_2:
2832; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
2833; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2834; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2835; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2836; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2837; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2838; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2839; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2840; GFX1164-NEXT:    s_mov_b32 s2, -1
2841; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2842; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2843; GFX1164-NEXT:    s_endpgm
2844;
2845; GFX1132-LABEL: sub_i64_constant:
2846; GFX1132:       ; %bb.0: ; %entry
2847; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2848; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
2849; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2850; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2851; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2852; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2853; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
2854; GFX1132-NEXT:  ; %bb.1:
2855; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
2856; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2857; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
2858; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
2859; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2860; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2861; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2862; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX1132-NEXT:    buffer_gl0_inv
2864; GFX1132-NEXT:  .LBB11_2:
2865; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2866; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2867; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2868; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2869; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2870; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2871; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2872; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2873; GFX1132-NEXT:    s_mov_b32 s2, -1
2874; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2875; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2876; GFX1132-NEXT:    s_endpgm
2877entry:
2878  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2879  store i64 %old, i64 addrspace(1)* %out
2880  ret void
2881}
2882
2883define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2884; Uniform 64-bit operand (the %subitive kernel argument): where the checks show the optimized form, only the lane whose
2885; mbcnt result is 0 issues ds_sub_rtn_u64, using %subitive * popcount(exec); every lane then computes old - %subitive * mbcnt.
2886; GFX7LESS-LABEL: sub_i64_uniform:
2887; GFX7LESS:       ; %bb.0: ; %entry
2888; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2889; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2890; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2891; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2892; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2893; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2894; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2895; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
2896; GFX7LESS-NEXT:  ; %bb.1:
2897; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2898; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2899; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2900; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2901; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2902; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
2903; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2904; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
2905; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2906; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2907; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2908; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2909; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX7LESS-NEXT:  .LBB12_2:
2911; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2912; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2913; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2914; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2915; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2916; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2917; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
2918; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
2919; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
2920; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
2921; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
2922; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
2923; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
2924; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
2925; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2926; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2927; GFX7LESS-NEXT:    s_endpgm
2928;
2929; GFX8-LABEL: sub_i64_uniform:
2930; GFX8:       ; %bb.0: ; %entry
2931; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2932; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2933; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2934; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2935; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2936; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2937; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2938; GFX8-NEXT:    s_cbranch_execz .LBB12_2
2939; GFX8-NEXT:  ; %bb.1:
2940; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
2941; GFX8-NEXT:    v_mov_b32_e32 v0, s8
2942; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2943; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
2944; GFX8-NEXT:    s_mul_i32 s6, s3, s8
2945; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2946; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
2947; GFX8-NEXT:    s_mov_b32 m0, -1
2948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2949; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2950; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2951; GFX8-NEXT:  .LBB12_2:
2952; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2954; GFX8-NEXT:    s_mov_b32 s4, s0
2955; GFX8-NEXT:    s_mov_b32 s5, s1
2956; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
2957; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2958; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2959; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2960; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
2961; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2962; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
2963; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2964; GFX8-NEXT:    s_mov_b32 s6, -1
2965; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2966; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2967; GFX8-NEXT:    s_endpgm
2968;
2969; GFX9-LABEL: sub_i64_uniform:
2970; GFX9:       ; %bb.0: ; %entry
2971; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2972; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2973; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2974; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2975; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2976; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2977; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2978; GFX9-NEXT:    s_cbranch_execz .LBB12_2
2979; GFX9-NEXT:  ; %bb.1:
2980; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2982; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2983; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2984; GFX9-NEXT:    s_add_i32 s8, s8, s7
2985; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2986; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2987; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2988; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2990; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2991; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2992; GFX9-NEXT:  .LBB12_2:
2993; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2995; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
2996; GFX9-NEXT:    s_mov_b32 s4, s0
2997; GFX9-NEXT:    s_mov_b32 s5, s1
2998; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2999; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3000; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3001; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3002; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3003; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
3004; GFX9-NEXT:    s_mov_b32 s7, 0xf000
3005; GFX9-NEXT:    s_mov_b32 s6, -1
3006; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
3007; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3008; GFX9-NEXT:    s_endpgm
3009;
3010; GFX1064-LABEL: sub_i64_uniform:
3011; GFX1064:       ; %bb.0: ; %entry
3012; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3013; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3014; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3015; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3016; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3017; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3018; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3019; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
3020; GFX1064-NEXT:  ; %bb.1:
3021; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3022; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3023; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3024; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
3025; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
3026; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
3027; GFX1064-NEXT:    s_add_i32 s8, s8, s7
3028; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
3029; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
3030; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3031; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3032; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3033; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX1064-NEXT:    buffer_gl0_inv
3035; GFX1064-NEXT:  .LBB12_2:
3036; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3037; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3038; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3039; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3040; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
3041; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3042; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3043; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3044; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3045; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
3046; GFX1064-NEXT:    s_mov_b32 s2, -1
3047; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3048; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3049; GFX1064-NEXT:    s_endpgm
3050;
3051; GFX1032-LABEL: sub_i64_uniform:
3052; GFX1032:       ; %bb.0: ; %entry
3053; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3054; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
3055; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3056; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3057; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
3058; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3059; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
3060; GFX1032-NEXT:  ; %bb.1:
3061; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
3062; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3063; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3064; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
3065; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
3066; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
3067; GFX1032-NEXT:    s_add_i32 s7, s7, s6
3068; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
3069; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
3070; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3071; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3072; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3073; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3074; GFX1032-NEXT:    buffer_gl0_inv
3075; GFX1032-NEXT:  .LBB12_2:
3076; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3077; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3078; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3079; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
3080; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
3081; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
3082; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3083; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3084; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3085; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
3086; GFX1032-NEXT:    s_mov_b32 s2, -1
3087; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3088; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3089; GFX1032-NEXT:    s_endpgm
3090;
3091; GFX1164-LABEL: sub_i64_uniform:
3092; GFX1164:       ; %bb.0: ; %entry
3093; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3094; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3095; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3096; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3097; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3098; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
3099; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
3100; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
3101; GFX1164-NEXT:  ; %bb.1:
3102; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3103; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3104; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3105; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
3106; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
3107; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
3108; GFX1164-NEXT:    s_add_i32 s8, s8, s7
3109; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
3110; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
3111; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3112; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3113; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3114; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3115; GFX1164-NEXT:    buffer_gl0_inv
3116; GFX1164-NEXT:  .LBB12_2:
3117; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3118; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3119; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3120; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
3121; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s3, v2, v[4:5]
3122; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
3123; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3124; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3125; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3126; GFX1164-NEXT:    s_mov_b32 s2, -1
3127; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3128; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3129; GFX1164-NEXT:    s_endpgm
3130;
3131; GFX1132-LABEL: sub_i64_uniform:
3132; GFX1132:       ; %bb.0: ; %entry
3133; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3134; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
3135; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3136; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3137; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3138; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3139; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
3140; GFX1132-NEXT:  ; %bb.1:
3141; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
3142; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3143; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
3145; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
3146; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
3147; GFX1132-NEXT:    s_add_i32 s7, s7, s6
3148; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
3149; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
3150; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3151; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3152; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3153; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX1132-NEXT:    buffer_gl0_inv
3155; GFX1132-NEXT:  .LBB12_2:
3156; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3157; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3158; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
3159; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
3160; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], s2, s3, v2, v[4:5]
3161; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
3162; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3163; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3164; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3165; GFX1132-NEXT:    s_mov_b32 s2, -1
3166; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3167; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3168; GFX1132-NEXT:    s_endpgm
3169entry:
3170  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
3171  store i64 %old, i64 addrspace(1)* %out
3172  ret void
3173}
3174
3175define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
3176; Per-lane 64-bit operand (zext of workitem.id.x): for every target below the checks show a plain ds_sub_rtn_u64
3177; followed by the store of its result, with no mbcnt / saveexec single-lane sequence around the atomic.
3178; GFX7LESS-LABEL: sub_i64_varying:
3179; GFX7LESS:       ; %bb.0: ; %entry
3180; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3181; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3182; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3183; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3184; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3185; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3186; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3187; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3188; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3189; GFX7LESS-NEXT:    s_endpgm
3190;
3191; GFX8-LABEL: sub_i64_varying:
3192; GFX8:       ; %bb.0: ; %entry
3193; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3194; GFX8-NEXT:    s_mov_b32 m0, -1
3195; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3196; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3197; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3198; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3199; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3200; GFX8-NEXT:    s_mov_b32 s2, -1
3201; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3202; GFX8-NEXT:    s_endpgm
3203;
3204; GFX9-LABEL: sub_i64_varying:
3205; GFX9:       ; %bb.0: ; %entry
3206; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3207; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3208; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3209; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3210; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3211; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3212; GFX9-NEXT:    s_mov_b32 s2, -1
3213; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3214; GFX9-NEXT:    s_endpgm
3215;
3216; GFX10-LABEL: sub_i64_varying:
3217; GFX10:       ; %bb.0: ; %entry
3218; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3219; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3220; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
3221; GFX10-NEXT:    s_mov_b32 s2, -1
3222; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3223; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3224; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3225; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3226; GFX10-NEXT:    buffer_gl0_inv
3227; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3228; GFX10-NEXT:    s_endpgm
3229;
3230; GFX11-LABEL: sub_i64_varying:
3231; GFX11:       ; %bb.0: ; %entry
3232; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3233; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3234; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
3235; GFX11-NEXT:    s_mov_b32 s2, -1
3236; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3237; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3238; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3239; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3240; GFX11-NEXT:    buffer_gl0_inv
3241; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3242; GFX11-NEXT:    s_endpgm
3243entry:
3244  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3245  %zext = zext i32 %lane to i64
3246  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
3247  store i64 %old, i64 addrspace(1)* %out
3248  ret void
3249}
3250
3251define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3252;
3253;
3254; GFX7LESS-LABEL: and_i32_varying:
3255; GFX7LESS:       ; %bb.0: ; %entry
3256; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3257; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3258; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3259; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3260; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3261; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3262; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3263; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3264; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3265; GFX7LESS-NEXT:    s_endpgm
3266;
3267; GFX8-LABEL: and_i32_varying:
3268; GFX8:       ; %bb.0: ; %entry
3269; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3270; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3271; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3272; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3273; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3274; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3275; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3276; GFX8-NEXT:    s_not_b64 exec, exec
3277; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3278; GFX8-NEXT:    s_not_b64 exec, exec
3279; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3280; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3281; GFX8-NEXT:    s_nop 1
3282; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3283; GFX8-NEXT:    s_nop 1
3284; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3285; GFX8-NEXT:    s_nop 1
3286; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3287; GFX8-NEXT:    s_nop 1
3288; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3289; GFX8-NEXT:    s_nop 1
3290; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3291; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3292; GFX8-NEXT:    s_nop 0
3293; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3294; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3295; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3296; GFX8-NEXT:    ; implicit-def: $vgpr0
3297; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3298; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3299; GFX8-NEXT:  ; %bb.1:
3300; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3301; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3302; GFX8-NEXT:    s_mov_b32 m0, -1
3303; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3304; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3306; GFX8-NEXT:  .LBB14_2:
3307; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3308; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3310; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3311; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3312; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3313; GFX8-NEXT:    s_mov_b32 s2, -1
3314; GFX8-NEXT:    s_nop 0
3315; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3316; GFX8-NEXT:    s_endpgm
3317;
3318; GFX9-LABEL: and_i32_varying:
3319; GFX9:       ; %bb.0: ; %entry
3320; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3321; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3322; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3323; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3324; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3325; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3326; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3327; GFX9-NEXT:    s_not_b64 exec, exec
3328; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3329; GFX9-NEXT:    s_not_b64 exec, exec
3330; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3331; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3332; GFX9-NEXT:    s_nop 1
3333; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3334; GFX9-NEXT:    s_nop 1
3335; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3336; GFX9-NEXT:    s_nop 1
3337; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3338; GFX9-NEXT:    s_nop 1
3339; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3340; GFX9-NEXT:    s_nop 1
3341; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3342; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3343; GFX9-NEXT:    s_nop 0
3344; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3345; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3346; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3347; GFX9-NEXT:    ; implicit-def: $vgpr0
3348; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3349; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3350; GFX9-NEXT:  ; %bb.1:
3351; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3352; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3353; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3354; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3356; GFX9-NEXT:  .LBB14_2:
3357; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3358; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3359; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3360; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3361; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3362; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3363; GFX9-NEXT:    s_mov_b32 s2, -1
3364; GFX9-NEXT:    s_nop 0
3365; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3366; GFX9-NEXT:    s_endpgm
3367;
3368; GFX1064-LABEL: and_i32_varying:
3369; GFX1064:       ; %bb.0: ; %entry
3370; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3371; GFX1064-NEXT:    s_not_b64 exec, exec
3372; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3373; GFX1064-NEXT:    s_not_b64 exec, exec
3374; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3375; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3376; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3377; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3378; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3379; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3380; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3381; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3382; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3383; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3384; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3385; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3386; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3387; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3388; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3389; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3390; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3391; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3392; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3393; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3394; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3395; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3396; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3397; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3398; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3399; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3400; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3401; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3402; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3403; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3404; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3405; GFX1064-NEXT:    s_mov_b32 s2, -1
3406; GFX1064-NEXT:    ; implicit-def: $vgpr0
3407; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3408; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3409; GFX1064-NEXT:  ; %bb.1:
3410; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3411; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3412; GFX1064-NEXT:    s_mov_b32 s3, s7
3413; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3414; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3415; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3416; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3417; GFX1064-NEXT:    buffer_gl0_inv
3418; GFX1064-NEXT:  .LBB14_2:
3419; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3420; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3421; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3422; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3423; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3424; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3425; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3426; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3427; GFX1064-NEXT:    s_endpgm
3428;
3429; GFX1032-LABEL: and_i32_varying:
3430; GFX1032:       ; %bb.0: ; %entry
3431; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3432; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3433; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3434; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3435; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3436; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3437; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3438; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3439; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3440; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3441; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3442; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3443; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3444; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3445; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3446; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3447; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3448; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3449; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3450; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3451; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3452; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3453; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3454; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3455; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3456; GFX1032-NEXT:    s_mov_b32 s2, -1
3457; GFX1032-NEXT:    ; implicit-def: $vgpr0
3458; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3459; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3460; GFX1032-NEXT:  ; %bb.1:
3461; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3462; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3463; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3464; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3465; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3466; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3467; GFX1032-NEXT:    buffer_gl0_inv
3468; GFX1032-NEXT:  .LBB14_2:
3469; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3470; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3471; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3472; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3473; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3474; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3475; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3476; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3477; GFX1032-NEXT:    s_endpgm
3478;
3479; GFX1164-LABEL: and_i32_varying:
3480; GFX1164:       ; %bb.0: ; %entry
3481; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3482; GFX1164-NEXT:    s_not_b64 exec, exec
3483; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3484; GFX1164-NEXT:    s_not_b64 exec, exec
3485; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3486; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3487; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3488; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3489; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3490; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3491; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3492; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3493; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3494; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3495; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3496; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3497; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3498; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3499; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3500; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3501; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3502; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3503; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3504; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3505; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3506; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3507; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3508; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3509; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3510; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3511; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3512; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3513; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3514; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3515; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3516; GFX1164-NEXT:    s_mov_b32 s2, -1
3517; GFX1164-NEXT:    ; implicit-def: $vgpr0
3518; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3519; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3520; GFX1164-NEXT:  ; %bb.1:
3521; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3522; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3523; GFX1164-NEXT:    s_mov_b32 s3, s7
3524; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3525; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3526; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3527; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX1164-NEXT:    buffer_gl0_inv
3529; GFX1164-NEXT:  .LBB14_2:
3530; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3531; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3532; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3533; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3534; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3535; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3536; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3537; GFX1164-NEXT:    s_endpgm
3538;
3539; GFX1132-LABEL: and_i32_varying:
3540; GFX1132:       ; %bb.0: ; %entry
3541; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3542; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3543; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3544; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3545; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3546; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3547; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3548; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3549; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3550; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3551; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3552; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3553; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3554; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3555; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3556; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3557; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3558; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3559; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3560; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3561; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3562; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3563; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3564; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3565; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3566; GFX1132-NEXT:    s_mov_b32 s2, -1
3567; GFX1132-NEXT:    ; implicit-def: $vgpr0
3568; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3569; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3570; GFX1132-NEXT:  ; %bb.1:
3571; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3572; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3573; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3574; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3575; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3576; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3577; GFX1132-NEXT:    buffer_gl0_inv
3578; GFX1132-NEXT:  .LBB14_2:
3579; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3580; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3581; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3582; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3583; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3584; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3585; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3586; GFX1132-NEXT:    s_endpgm
3587entry:
3588  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3589  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3590  store i32 %old, i32 addrspace(1)* %out
3591  ret void
3592}
3593
3594define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3595; Each lane ORs its own workitem id into the addrspace(3) global @local_var32
; with acq_rel ordering and stores the value the atomic returned to %out.
; Expected codegen, as pinned by the assertions below: the GFX7LESS run keeps a
; single plain ds_or_rtn_b32 fed directly from v0; every other run first
; combines the per-lane operands with v_or_b32_dpp (lanes outside exec are
; seeded with 0), lets only the lane whose mbcnt result is 0 issue one
; ds_or_rtn_b32, and then rebuilds each lane's value with v_readfirstlane_b32
; followed by v_or_b32.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with the script instead of editing them by hand.
3596;
3597; GFX7LESS-LABEL: or_i32_varying:
3598; GFX7LESS:       ; %bb.0: ; %entry
3599; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3600; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3601; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3602; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3603; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3604; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3605; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3606; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3607; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3608; GFX7LESS-NEXT:    s_endpgm
3609;
3610; GFX8-LABEL: or_i32_varying:
3611; GFX8:       ; %bb.0: ; %entry
3612; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3613; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3614; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3615; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3616; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3617; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3618; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3619; GFX8-NEXT:    s_not_b64 exec, exec
3620; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3621; GFX8-NEXT:    s_not_b64 exec, exec
3622; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3623; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3624; GFX8-NEXT:    s_nop 1
3625; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3626; GFX8-NEXT:    s_nop 1
3627; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3628; GFX8-NEXT:    s_nop 1
3629; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3630; GFX8-NEXT:    s_nop 1
3631; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3632; GFX8-NEXT:    s_nop 1
3633; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3634; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3635; GFX8-NEXT:    s_nop 0
3636; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3637; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3638; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3639; GFX8-NEXT:    ; implicit-def: $vgpr0
3640; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3641; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3642; GFX8-NEXT:  ; %bb.1:
3643; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3644; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3645; GFX8-NEXT:    s_mov_b32 m0, -1
3646; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3647; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3648; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3649; GFX8-NEXT:  .LBB15_2:
3650; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3651; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3652; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3653; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3654; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3655; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3656; GFX8-NEXT:    s_mov_b32 s2, -1
3657; GFX8-NEXT:    s_nop 0
3658; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3659; GFX8-NEXT:    s_endpgm
3660;
3661; GFX9-LABEL: or_i32_varying:
3662; GFX9:       ; %bb.0: ; %entry
3663; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3664; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3665; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3666; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3667; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3668; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3669; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3670; GFX9-NEXT:    s_not_b64 exec, exec
3671; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3672; GFX9-NEXT:    s_not_b64 exec, exec
3673; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3674; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3675; GFX9-NEXT:    s_nop 1
3676; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3677; GFX9-NEXT:    s_nop 1
3678; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3679; GFX9-NEXT:    s_nop 1
3680; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3681; GFX9-NEXT:    s_nop 1
3682; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3683; GFX9-NEXT:    s_nop 1
3684; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3685; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3686; GFX9-NEXT:    s_nop 0
3687; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3688; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3689; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3690; GFX9-NEXT:    ; implicit-def: $vgpr0
3691; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3692; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3693; GFX9-NEXT:  ; %bb.1:
3694; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3695; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3697; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3699; GFX9-NEXT:  .LBB15_2:
3700; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3701; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3702; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3703; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3704; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3705; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3706; GFX9-NEXT:    s_mov_b32 s2, -1
3707; GFX9-NEXT:    s_nop 0
3708; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3709; GFX9-NEXT:    s_endpgm
3710;
3711; GFX1064-LABEL: or_i32_varying:
3712; GFX1064:       ; %bb.0: ; %entry
3713; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3714; GFX1064-NEXT:    s_not_b64 exec, exec
3715; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3716; GFX1064-NEXT:    s_not_b64 exec, exec
3717; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3718; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3719; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3720; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3721; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3722; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3723; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3724; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3725; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3726; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3727; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3728; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3729; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3730; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3731; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3732; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3733; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3734; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3735; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3736; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3737; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3738; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3739; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3740; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3741; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3742; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3743; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3744; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3745; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3746; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3747; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3748; GFX1064-NEXT:    s_mov_b32 s2, -1
3749; GFX1064-NEXT:    ; implicit-def: $vgpr0
3750; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3751; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3752; GFX1064-NEXT:  ; %bb.1:
3753; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3754; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3755; GFX1064-NEXT:    s_mov_b32 s3, s7
3756; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3757; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3758; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3759; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3760; GFX1064-NEXT:    buffer_gl0_inv
3761; GFX1064-NEXT:  .LBB15_2:
3762; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3763; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3764; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3765; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3766; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3767; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3768; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3769; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3770; GFX1064-NEXT:    s_endpgm
3771;
3772; GFX1032-LABEL: or_i32_varying:
3773; GFX1032:       ; %bb.0: ; %entry
3774; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3775; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3776; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3777; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3778; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3779; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3780; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3781; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3782; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3783; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3784; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3785; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3786; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3787; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3788; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3789; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3790; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3791; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3792; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3793; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3794; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3795; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3796; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3797; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3798; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3799; GFX1032-NEXT:    s_mov_b32 s2, -1
3800; GFX1032-NEXT:    ; implicit-def: $vgpr0
3801; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3802; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3803; GFX1032-NEXT:  ; %bb.1:
3804; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3805; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3806; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3807; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3808; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3809; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3810; GFX1032-NEXT:    buffer_gl0_inv
3811; GFX1032-NEXT:  .LBB15_2:
3812; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3813; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3814; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3815; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3816; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3817; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3818; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3819; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3820; GFX1032-NEXT:    s_endpgm
3821;
3822; GFX1164-LABEL: or_i32_varying:
3823; GFX1164:       ; %bb.0: ; %entry
3824; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3825; GFX1164-NEXT:    s_not_b64 exec, exec
3826; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3827; GFX1164-NEXT:    s_not_b64 exec, exec
3828; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3829; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3830; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3831; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3832; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3833; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3834; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3835; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3836; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3837; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3838; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3839; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3840; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3841; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3842; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3843; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3844; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3845; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3846; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3847; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3848; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3849; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3850; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3851; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3852; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3853; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3854; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3855; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3856; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3857; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3858; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3859; GFX1164-NEXT:    s_mov_b32 s2, -1
3860; GFX1164-NEXT:    ; implicit-def: $vgpr0
3861; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3862; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
3863; GFX1164-NEXT:  ; %bb.1:
3864; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3865; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3866; GFX1164-NEXT:    s_mov_b32 s3, s7
3867; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3868; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3869; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
3870; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3871; GFX1164-NEXT:    buffer_gl0_inv
3872; GFX1164-NEXT:  .LBB15_2:
3873; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3874; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3875; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3876; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
3877; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3878; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3879; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3880; GFX1164-NEXT:    s_endpgm
3881;
3882; GFX1132-LABEL: or_i32_varying:
3883; GFX1132:       ; %bb.0: ; %entry
3884; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3885; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3886; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
3887; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3888; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3889; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3890; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3891; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3892; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3893; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3894; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3895; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3896; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3897; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3898; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3899; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3900; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3901; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3902; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3903; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3904; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3905; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3906; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3907; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3908; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3909; GFX1132-NEXT:    s_mov_b32 s2, -1
3910; GFX1132-NEXT:    ; implicit-def: $vgpr0
3911; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3912; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
3913; GFX1132-NEXT:  ; %bb.1:
3914; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3915; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3916; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3917; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3918; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
3919; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3920; GFX1132-NEXT:    buffer_gl0_inv
3921; GFX1132-NEXT:  .LBB15_2:
3922; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3923; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3924; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3925; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
3926; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3927; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3928; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3929; GFX1132-NEXT:    s_endpgm
3930entry:
3931  %lane = call i32 @llvm.amdgcn.workitem.id.x() ; per-workitem value used as the atomic operand
3932  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel ; %old is the value returned by the atomic
3933  store i32 %old, i32 addrspace(1)* %out ; the atomic's result is consumed by this store
3934  ret void
3935}
3936
3937define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3938;
3939;
3940; GFX7LESS-LABEL: xor_i32_varying:
3941; GFX7LESS:       ; %bb.0: ; %entry
3942; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3943; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3944; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3945; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3946; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3947; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3948; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3949; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3950; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3951; GFX7LESS-NEXT:    s_endpgm
3952;
3953; GFX8-LABEL: xor_i32_varying:
3954; GFX8:       ; %bb.0: ; %entry
3955; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3956; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3957; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3958; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3959; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3960; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3961; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3962; GFX8-NEXT:    s_not_b64 exec, exec
3963; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3964; GFX8-NEXT:    s_not_b64 exec, exec
3965; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3966; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3967; GFX8-NEXT:    s_nop 1
3968; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3969; GFX8-NEXT:    s_nop 1
3970; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3971; GFX8-NEXT:    s_nop 1
3972; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3973; GFX8-NEXT:    s_nop 1
3974; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3975; GFX8-NEXT:    s_nop 1
3976; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3977; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3978; GFX8-NEXT:    s_nop 0
3979; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3980; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3981; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3982; GFX8-NEXT:    ; implicit-def: $vgpr0
3983; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3984; GFX8-NEXT:    s_cbranch_execz .LBB16_2
3985; GFX8-NEXT:  ; %bb.1:
3986; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3987; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3988; GFX8-NEXT:    s_mov_b32 m0, -1
3989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3990; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3991; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3992; GFX8-NEXT:  .LBB16_2:
3993; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3994; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3995; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3996; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3997; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3998; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3999; GFX8-NEXT:    s_mov_b32 s2, -1
4000; GFX8-NEXT:    s_nop 0
4001; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4002; GFX8-NEXT:    s_endpgm
4003;
4004; GFX9-LABEL: xor_i32_varying:
4005; GFX9:       ; %bb.0: ; %entry
4006; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4007; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4008; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4009; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4010; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4011; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4012; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4013; GFX9-NEXT:    s_not_b64 exec, exec
4014; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4015; GFX9-NEXT:    s_not_b64 exec, exec
4016; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4017; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4018; GFX9-NEXT:    s_nop 1
4019; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4020; GFX9-NEXT:    s_nop 1
4021; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4022; GFX9-NEXT:    s_nop 1
4023; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4024; GFX9-NEXT:    s_nop 1
4025; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4026; GFX9-NEXT:    s_nop 1
4027; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4028; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4029; GFX9-NEXT:    s_nop 0
4030; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4031; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4032; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4033; GFX9-NEXT:    ; implicit-def: $vgpr0
4034; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4035; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4036; GFX9-NEXT:  ; %bb.1:
4037; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4038; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4039; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4040; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4041; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4042; GFX9-NEXT:  .LBB16_2:
4043; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4044; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4045; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4046; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4047; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4048; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4049; GFX9-NEXT:    s_mov_b32 s2, -1
4050; GFX9-NEXT:    s_nop 0
4051; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4052; GFX9-NEXT:    s_endpgm
4053;
4054; GFX1064-LABEL: xor_i32_varying:
4055; GFX1064:       ; %bb.0: ; %entry
4056; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4057; GFX1064-NEXT:    s_not_b64 exec, exec
4058; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4059; GFX1064-NEXT:    s_not_b64 exec, exec
4060; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4061; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4062; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4063; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4064; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4065; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4066; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4067; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4068; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4069; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4070; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4071; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4072; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4073; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4074; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4075; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4076; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4077; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4078; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4079; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4080; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4081; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4082; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4083; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4084; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4085; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4086; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4087; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4088; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4089; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4090; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4091; GFX1064-NEXT:    s_mov_b32 s2, -1
4092; GFX1064-NEXT:    ; implicit-def: $vgpr0
4093; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4094; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4095; GFX1064-NEXT:  ; %bb.1:
4096; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4097; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4098; GFX1064-NEXT:    s_mov_b32 s3, s7
4099; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4100; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4101; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4102; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4103; GFX1064-NEXT:    buffer_gl0_inv
4104; GFX1064-NEXT:  .LBB16_2:
4105; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4106; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4107; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4108; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4109; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4110; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4111; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4112; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4113; GFX1064-NEXT:    s_endpgm
4114;
4115; GFX1032-LABEL: xor_i32_varying:
4116; GFX1032:       ; %bb.0: ; %entry
4117; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4118; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4119; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4120; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4121; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4122; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4123; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4124; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4125; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4126; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4127; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4128; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4129; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4130; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4131; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4132; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4133; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4134; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4135; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4136; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4137; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4138; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4139; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4140; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4141; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4142; GFX1032-NEXT:    s_mov_b32 s2, -1
4143; GFX1032-NEXT:    ; implicit-def: $vgpr0
4144; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4145; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4146; GFX1032-NEXT:  ; %bb.1:
4147; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4148; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4149; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4150; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4151; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4152; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4153; GFX1032-NEXT:    buffer_gl0_inv
4154; GFX1032-NEXT:  .LBB16_2:
4155; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4156; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4157; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4158; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4159; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4160; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4161; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4162; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4163; GFX1032-NEXT:    s_endpgm
4164;
4165; GFX1164-LABEL: xor_i32_varying:
4166; GFX1164:       ; %bb.0: ; %entry
4167; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4168; GFX1164-NEXT:    s_not_b64 exec, exec
4169; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4170; GFX1164-NEXT:    s_not_b64 exec, exec
4171; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4172; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4173; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4174; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4175; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4176; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4177; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4178; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4179; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4180; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4181; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4182; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4183; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4184; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4185; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4186; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4187; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4188; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4189; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4190; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4191; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4192; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4193; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4194; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4195; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4196; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4197; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4198; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4199; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4200; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4201; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4202; GFX1164-NEXT:    s_mov_b32 s2, -1
4203; GFX1164-NEXT:    ; implicit-def: $vgpr0
4204; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4205; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4206; GFX1164-NEXT:  ; %bb.1:
4207; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4208; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4209; GFX1164-NEXT:    s_mov_b32 s3, s7
4210; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4211; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4212; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4213; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4214; GFX1164-NEXT:    buffer_gl0_inv
4215; GFX1164-NEXT:  .LBB16_2:
4216; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4217; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4218; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4219; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4220; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4221; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4222; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4223; GFX1164-NEXT:    s_endpgm
4224;
4225; GFX1132-LABEL: xor_i32_varying:
4226; GFX1132:       ; %bb.0: ; %entry
4227; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4228; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4229; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4230; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4231; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4232; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4233; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4234; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4235; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4236; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4237; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4238; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4239; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4240; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4241; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4242; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4243; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4244; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4245; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4246; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4247; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4248; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4249; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4250; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4251; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4252; GFX1132-NEXT:    s_mov_b32 s2, -1
4253; GFX1132-NEXT:    ; implicit-def: $vgpr0
4254; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4255; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4256; GFX1132-NEXT:  ; %bb.1:
4257; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4258; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4259; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4260; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4261; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4262; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4263; GFX1132-NEXT:    buffer_gl0_inv
4264; GFX1132-NEXT:  .LBB16_2:
4265; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4266; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4267; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4268; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4269; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4270; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4271; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4272; GFX1132-NEXT:    s_endpgm
4273entry:
4274  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4275  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4276  store i32 %old, i32 addrspace(1)* %out
4277  ret void
4278}
4279
; Signed max with a per-lane (varying) operand on a local (addrspace(3))
; variable: every workitem passes its own workitem.id.x to an acq_rel
; `atomicrmw max` on @local_var32 and stores the returned value to %out.
;
; Shape of the expected code in the checks below:
;  * In the GFX7LESS run the DS atomic is emitted directly with v0 as its data
;    operand; no DPP reduction and no single-lane branch appear.
;  * In the GFX8 and GFX9 runs the operand is combined across the wavefront
;    with v_max_i32_dpp (row_shr 1/2/4/8, then row_bcast 15 and 31). Lanes that
;    are inactive on entry are first filled with v_bfrev_b32 of 1 (bit pattern
;    0x80000000, the signed minimum) between the two s_not_b64 exec flips.
;    Lane 63 of the combined value feeds a single ds_max_rtn_i32 that runs only
;    where the mbcnt result is 0. The returned value is broadcast with
;    v_readfirstlane_b32 and merged by v_max_i32 with the wave_shr:1 copy of
;    the scan.
;  * The gfx1010 and gfx1100 runs keep the same structure but cross row
;    boundaries with v_permlanex16_b32 and v_readlane/v_writelane instead of
;    row_bcast/wave_shr. Their wave32 variants take the atomic's data from
;    lane 31 and the wave64 variants from lane 63.
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB17_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB17_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB17_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB17_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB17_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB17_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB17_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: max_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB17_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; The per-lane operand is the workitem's x id, so it differs between lanes.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; atomicrmw yields the value @local_var32 held before the max was applied.
  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
4622
4623define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4624;
4625;
4626; GFX7LESS-LABEL: max_i64_constant:
4627; GFX7LESS:       ; %bb.0: ; %entry
4628; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4629; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4630; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4631; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4632; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4633; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4634; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4635; GFX7LESS-NEXT:  ; %bb.1:
4636; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4637; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4638; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4639; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4640; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4641; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4642; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4643; GFX7LESS-NEXT:  .LBB18_2:
4644; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4645; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4646; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4647; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4648; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4649; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4650; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4651; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4652; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4653; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4654; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4655; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4656; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4657; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4658; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4659; GFX7LESS-NEXT:    s_endpgm
4660;
4661; GFX8-LABEL: max_i64_constant:
4662; GFX8:       ; %bb.0: ; %entry
4663; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4664; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4665; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4666; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4667; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4668; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4669; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4670; GFX8-NEXT:  ; %bb.1:
4671; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4672; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4673; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4674; GFX8-NEXT:    s_mov_b32 m0, -1
4675; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4676; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4677; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4678; GFX8-NEXT:  .LBB18_2:
4679; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4680; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4681; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4682; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4683; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4684; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4685; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4686; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4687; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4688; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4689; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4690; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4691; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4692; GFX8-NEXT:    s_mov_b32 s2, -1
4693; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4694; GFX8-NEXT:    s_endpgm
4695;
4696; GFX9-LABEL: max_i64_constant:
4697; GFX9:       ; %bb.0: ; %entry
4698; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4699; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4700; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4701; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4702; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4703; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4704; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4705; GFX9-NEXT:  ; %bb.1:
4706; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4707; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4708; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4709; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4710; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4711; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4712; GFX9-NEXT:  .LBB18_2:
4713; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4714; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4715; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4716; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4717; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4718; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4719; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4720; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4721; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4722; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4723; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4724; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4725; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4726; GFX9-NEXT:    s_mov_b32 s2, -1
4727; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4728; GFX9-NEXT:    s_endpgm
4729;
4730; GFX1064-LABEL: max_i64_constant:
4731; GFX1064:       ; %bb.0: ; %entry
4732; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4733; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4734; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4735; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4736; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4737; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4738; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4739; GFX1064-NEXT:  ; %bb.1:
4740; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4741; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4742; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4743; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4744; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4745; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4746; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4747; GFX1064-NEXT:    buffer_gl0_inv
4748; GFX1064-NEXT:  .LBB18_2:
4749; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4750; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4751; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4752; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4753; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4754; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4755; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4756; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4757; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4758; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4759; GFX1064-NEXT:    s_mov_b32 s2, -1
4760; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4761; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4762; GFX1064-NEXT:    s_endpgm
4763;
4764; GFX1032-LABEL: max_i64_constant:
4765; GFX1032:       ; %bb.0: ; %entry
4766; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4767; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4768; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4769; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4770; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4771; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4772; GFX1032-NEXT:  ; %bb.1:
4773; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4774; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4775; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4776; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4777; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4778; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4779; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4780; GFX1032-NEXT:    buffer_gl0_inv
4781; GFX1032-NEXT:  .LBB18_2:
4782; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4783; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4784; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4785; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4786; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4787; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4788; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4789; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4790; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4791; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4792; GFX1032-NEXT:    s_mov_b32 s2, -1
4793; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4794; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4795; GFX1032-NEXT:    s_endpgm
4796;
4797; GFX1164-LABEL: max_i64_constant:
4798; GFX1164:       ; %bb.0: ; %entry
4799; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4800; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4801; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4802; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4803; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
4804; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4805; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
4806; GFX1164-NEXT:  ; %bb.1:
4807; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
4808; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4809; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
4810; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4811; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4812; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4813; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4814; GFX1164-NEXT:    buffer_gl0_inv
4815; GFX1164-NEXT:  .LBB18_2:
4816; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
4817; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
4818; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
4819; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4820; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4821; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4822; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4823; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4824; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4825; GFX1164-NEXT:    s_mov_b32 s2, -1
4826; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4827; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4828; GFX1164-NEXT:    s_endpgm
4829;
4830; GFX1132-LABEL: max_i64_constant:
4831; GFX1132:       ; %bb.0: ; %entry
4832; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4833; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4834; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4835; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
4836; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4837; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
4838; GFX1132-NEXT:  ; %bb.1:
4839; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
4840; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4841; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
4842; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4843; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4844; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4845; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4846; GFX1132-NEXT:    buffer_gl0_inv
4847; GFX1132-NEXT:  .LBB18_2:
4848; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4849; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
4850; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
4851; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4852; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4853; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4854; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4855; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4856; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4857; GFX1132-NEXT:    s_mov_b32 s2, -1
4858; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4859; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4860; GFX1132-NEXT:    s_endpgm
4861entry:
4862  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
4863  store i64 %old, i64 addrspace(1)* %out
4864  ret void
4865}
4866
; Signed 32-bit atomic min on local memory (@local_var32, addrspace(3)) where
; every lane supplies its own workitem id, i.e. the operand is divergent.
; The GFX7LESS checks show the atomic issued directly as ds_min_rtn_i32 with no
; reduction. The checks for the other targets show a DPP reduction seeded with
; 0x7fffffff (v_bfrev_b32 of -2), a single ds_min_rtn_i32 issued by the lane
; whose mbcnt is 0, and a final per-lane v_min_i32 against the readfirstlane'd
; result.
define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: min_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB19_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB19_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: min_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB19_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB19_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB19_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB19_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: min_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB19_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: min_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB19_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; Per-lane operand: the workitem id in x, so each lane passes a different value.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; The value returned by the atomic is written to %out, so the returning
  ; (_rtn) form of the DS instruction is what the checks above expect.
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
5209
; Signed 64-bit atomic min on local memory (@local_var64, addrspace(3)) with
; the constant operand 5. For every RUN configuration the checks show the lane
; whose mbcnt is 0 issuing a single ds_min_rtn_i64. Afterwards each lane uses
; vcc to pick either 0x7fffffffffffffff or 5, and replaces it with the
; readfirstlane'd atomic result only when v_cmp_lt_i64 reports the result is
; smaller.
define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB20_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: min_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB20_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB20_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: min_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB20_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB20_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB20_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB20_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB20_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: min_i64_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB20_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; The operand is the immediate 5, identical for every lane.
  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; The value returned by the atomic is written to %out, so the returning
  ; (_rtn) form of the DS instruction is what the checks above expect.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
5453
; umax_i32_varying: unsigned-max atomicrmw on the addrspace(3) global
; @local_var32 whose operand is the result of llvm.amdgcn.workitem.id.x; the
; value returned by the atomic is stored to %out.
; Shape of the expectations below: the run without -mcpu issues ds_max_rtn_u32
; directly on v0. The tonga, gfx900, gfx1010 and gfx1100 runs first write 0 into
; the inactive lanes, combine lanes with a chain of v_max_u32_dpp, let only the
; lane whose mbcnt result is 0 execute a single ds_max_rtn_u32, and finally
; merge the v_readfirstlane_b32 of that result with the per-lane scan value
; using v_max_u32 before the buffer store.
5454define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5455;
5456;
5457; GFX7LESS-LABEL: umax_i32_varying:
5458; GFX7LESS:       ; %bb.0: ; %entry
5459; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5460; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5461; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5462; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5463; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5464; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5465; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5466; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5467; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5468; GFX7LESS-NEXT:    s_endpgm
5469;
5470; GFX8-LABEL: umax_i32_varying:
5471; GFX8:       ; %bb.0: ; %entry
5472; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5473; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5474; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5475; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5476; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5477; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5478; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5479; GFX8-NEXT:    s_not_b64 exec, exec
5480; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5481; GFX8-NEXT:    s_not_b64 exec, exec
5482; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5483; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5484; GFX8-NEXT:    s_nop 1
5485; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5486; GFX8-NEXT:    s_nop 1
5487; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5488; GFX8-NEXT:    s_nop 1
5489; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5490; GFX8-NEXT:    s_nop 1
5491; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5492; GFX8-NEXT:    s_nop 1
5493; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5494; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5495; GFX8-NEXT:    s_nop 0
5496; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5497; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5498; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5499; GFX8-NEXT:    ; implicit-def: $vgpr0
5500; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5501; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5502; GFX8-NEXT:  ; %bb.1:
5503; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5504; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5505; GFX8-NEXT:    s_mov_b32 m0, -1
5506; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5507; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5508; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5509; GFX8-NEXT:  .LBB21_2:
5510; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5511; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5512; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5513; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5514; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5515; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5516; GFX8-NEXT:    s_mov_b32 s2, -1
5517; GFX8-NEXT:    s_nop 0
5518; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5519; GFX8-NEXT:    s_endpgm
5520;
5521; GFX9-LABEL: umax_i32_varying:
5522; GFX9:       ; %bb.0: ; %entry
5523; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5524; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5525; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5526; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5527; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5528; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5529; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5530; GFX9-NEXT:    s_not_b64 exec, exec
5531; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5532; GFX9-NEXT:    s_not_b64 exec, exec
5533; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5534; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5535; GFX9-NEXT:    s_nop 1
5536; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5537; GFX9-NEXT:    s_nop 1
5538; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5539; GFX9-NEXT:    s_nop 1
5540; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5541; GFX9-NEXT:    s_nop 1
5542; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5543; GFX9-NEXT:    s_nop 1
5544; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5545; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5546; GFX9-NEXT:    s_nop 0
5547; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5548; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5549; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5550; GFX9-NEXT:    ; implicit-def: $vgpr0
5551; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5552; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5553; GFX9-NEXT:  ; %bb.1:
5554; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5555; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5557; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5558; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5559; GFX9-NEXT:  .LBB21_2:
5560; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5561; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5562; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5563; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5564; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5565; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5566; GFX9-NEXT:    s_mov_b32 s2, -1
5567; GFX9-NEXT:    s_nop 0
5568; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5569; GFX9-NEXT:    s_endpgm
5570;
5571; GFX1064-LABEL: umax_i32_varying:
5572; GFX1064:       ; %bb.0: ; %entry
5573; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5574; GFX1064-NEXT:    s_not_b64 exec, exec
5575; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5576; GFX1064-NEXT:    s_not_b64 exec, exec
5577; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5578; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5579; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5580; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5581; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5582; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5583; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5584; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5585; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5586; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5587; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5588; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5589; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5590; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5591; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5592; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5593; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5594; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5595; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5596; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5597; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5598; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5599; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5600; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5601; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5602; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5603; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5604; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5605; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5606; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5607; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5608; GFX1064-NEXT:    s_mov_b32 s2, -1
5609; GFX1064-NEXT:    ; implicit-def: $vgpr0
5610; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5611; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5612; GFX1064-NEXT:  ; %bb.1:
5613; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5614; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5615; GFX1064-NEXT:    s_mov_b32 s3, s7
5616; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5617; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5618; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5619; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5620; GFX1064-NEXT:    buffer_gl0_inv
5621; GFX1064-NEXT:  .LBB21_2:
5622; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5623; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5624; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5625; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5626; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5627; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5628; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5629; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5630; GFX1064-NEXT:    s_endpgm
5631;
5632; GFX1032-LABEL: umax_i32_varying:
5633; GFX1032:       ; %bb.0: ; %entry
5634; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5635; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5636; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5637; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5638; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5639; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5640; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5641; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5642; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5643; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5644; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5645; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5646; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5647; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5648; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5649; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5650; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5651; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5652; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5653; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5654; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5655; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5656; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5657; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5658; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5659; GFX1032-NEXT:    s_mov_b32 s2, -1
5660; GFX1032-NEXT:    ; implicit-def: $vgpr0
5661; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5662; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5663; GFX1032-NEXT:  ; %bb.1:
5664; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5665; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5666; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5667; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5668; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5669; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5670; GFX1032-NEXT:    buffer_gl0_inv
5671; GFX1032-NEXT:  .LBB21_2:
5672; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5673; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5674; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5675; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5676; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5677; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5678; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5679; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5680; GFX1032-NEXT:    s_endpgm
5681;
5682; GFX1164-LABEL: umax_i32_varying:
5683; GFX1164:       ; %bb.0: ; %entry
5684; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5685; GFX1164-NEXT:    s_not_b64 exec, exec
5686; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5687; GFX1164-NEXT:    s_not_b64 exec, exec
5688; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5689; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5690; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5691; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5692; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5693; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5694; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5695; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5696; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5697; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5698; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5699; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5700; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5701; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5702; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5703; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5704; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5705; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5706; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5707; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5708; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5709; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5710; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5711; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5712; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5713; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5714; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5715; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5716; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5717; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5718; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5719; GFX1164-NEXT:    s_mov_b32 s2, -1
5720; GFX1164-NEXT:    ; implicit-def: $vgpr0
5721; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5722; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5723; GFX1164-NEXT:  ; %bb.1:
5724; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5725; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5726; GFX1164-NEXT:    s_mov_b32 s3, s7
5727; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5728; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5729; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5730; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5731; GFX1164-NEXT:    buffer_gl0_inv
5732; GFX1164-NEXT:  .LBB21_2:
5733; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5734; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5735; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5736; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5737; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5738; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5739; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5740; GFX1164-NEXT:    s_endpgm
5741;
5742; GFX1132-LABEL: umax_i32_varying:
5743; GFX1132:       ; %bb.0: ; %entry
5744; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5745; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5746; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5747; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5748; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5749; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5750; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5751; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5752; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5753; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5754; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5755; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5756; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5757; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5758; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5759; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5760; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5761; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5762; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5763; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5764; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5765; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5766; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5767; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5768; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5769; GFX1132-NEXT:    s_mov_b32 s2, -1
5770; GFX1132-NEXT:    ; implicit-def: $vgpr0
5771; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5772; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
5773; GFX1132-NEXT:  ; %bb.1:
5774; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5775; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5776; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5777; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5778; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
5779; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5780; GFX1132-NEXT:    buffer_gl0_inv
5781; GFX1132-NEXT:  .LBB21_2:
5782; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5783; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5784; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5785; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
5786; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5787; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5788; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5789; GFX1132-NEXT:    s_endpgm
5790entry:
  ; Atomic operand: the value returned by the workitem-id-x intrinsic.
5791  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel unsigned max against @local_var32; %old is the value the atomic returns.
5792  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the atomic's result through the kernel's addrspace(1) output pointer.
5793  store i32 %old, i32 addrspace(1)* %out
5794  ret void
5795}
5796
; umax_i64_constant: unsigned-max atomicrmw on the addrspace(3) global
; @local_var64 with the constant operand 5; the value returned by the atomic is
; stored to %out.
; Shape of the expectations below (the same for every run): mbcnt followed by
; v_cmp_eq_u32 against 0 picks one lane, only that lane executes
; ds_max_rtn_u64 with the constant 5, and after exec is restored the result is
; rebuilt from the v_readfirstlane_b32 values using a v_cndmask_b32 between 5
; and 0, a v_cmp_gt_u64, and two more v_cndmask_b32 selects. The wave32 runs
; use only v_mbcnt_lo_u32_b32; the wave64 runs also use v_mbcnt_hi_u32_b32.
5797define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
5798;
5799;
5800; GFX7LESS-LABEL: umax_i64_constant:
5801; GFX7LESS:       ; %bb.0: ; %entry
5802; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5803; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5804; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5805; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5806; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5807; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5808; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
5809; GFX7LESS-NEXT:  ; %bb.1:
5810; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5811; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5812; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5813; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5814; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5815; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5816; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5817; GFX7LESS-NEXT:  .LBB22_2:
5818; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5819; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5820; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5821; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5822; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5823; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5824; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5825; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
5826; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
5827; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5828; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
5829; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5830; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5831; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5832; GFX7LESS-NEXT:    s_endpgm
5833;
5834; GFX8-LABEL: umax_i64_constant:
5835; GFX8:       ; %bb.0: ; %entry
5836; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5837; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5838; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5839; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5840; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5841; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5842; GFX8-NEXT:    s_cbranch_execz .LBB22_2
5843; GFX8-NEXT:  ; %bb.1:
5844; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5845; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5846; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5847; GFX8-NEXT:    s_mov_b32 m0, -1
5848; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5849; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5850; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5851; GFX8-NEXT:  .LBB22_2:
5852; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5853; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5854; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5855; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
5856; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5857; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5858; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5859; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5860; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5861; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5862; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5863; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5864; GFX8-NEXT:    s_mov_b32 s2, -1
5865; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5866; GFX8-NEXT:    s_endpgm
5867;
5868; GFX9-LABEL: umax_i64_constant:
5869; GFX9:       ; %bb.0: ; %entry
5870; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5871; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5872; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5873; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5874; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5875; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5876; GFX9-NEXT:    s_cbranch_execz .LBB22_2
5877; GFX9-NEXT:  ; %bb.1:
5878; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5879; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5880; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5881; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5882; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5883; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5884; GFX9-NEXT:  .LBB22_2:
5885; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5887; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5888; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
5889; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5890; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5891; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5892; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5893; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5894; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5895; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5896; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5897; GFX9-NEXT:    s_mov_b32 s2, -1
5898; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5899; GFX9-NEXT:    s_endpgm
5900;
5901; GFX1064-LABEL: umax_i64_constant:
5902; GFX1064:       ; %bb.0: ; %entry
5903; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5904; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5905; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5906; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5907; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5908; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5909; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
5910; GFX1064-NEXT:  ; %bb.1:
5911; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5912; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5913; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5914; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5915; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5916; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5917; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5918; GFX1064-NEXT:    buffer_gl0_inv
5919; GFX1064-NEXT:  .LBB22_2:
5920; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5921; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5922; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5923; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5924; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5925; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5926; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5927; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5928; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5929; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5930; GFX1064-NEXT:    s_mov_b32 s2, -1
5931; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5932; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5933; GFX1064-NEXT:    s_endpgm
5934;
5935; GFX1032-LABEL: umax_i64_constant:
5936; GFX1032:       ; %bb.0: ; %entry
5937; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5938; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5939; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5940; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5941; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5942; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
5943; GFX1032-NEXT:  ; %bb.1:
5944; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5945; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5946; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5947; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5948; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5949; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5950; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5951; GFX1032-NEXT:    buffer_gl0_inv
5952; GFX1032-NEXT:  .LBB22_2:
5953; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5954; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5955; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5956; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5957; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5958; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5959; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
5960; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5961; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
5962; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5963; GFX1032-NEXT:    s_mov_b32 s2, -1
5964; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5965; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5966; GFX1032-NEXT:    s_endpgm
5967;
5968; GFX1164-LABEL: umax_i64_constant:
5969; GFX1164:       ; %bb.0: ; %entry
5970; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5971; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5972; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5973; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5974; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5975; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5976; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
5977; GFX1164-NEXT:  ; %bb.1:
5978; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5979; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5980; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5981; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5982; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5983; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5984; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5985; GFX1164-NEXT:    buffer_gl0_inv
5986; GFX1164-NEXT:  .LBB22_2:
5987; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5988; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5989; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5990; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5991; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5992; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5993; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5994; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5995; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5996; GFX1164-NEXT:    s_mov_b32 s2, -1
5997; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5998; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5999; GFX1164-NEXT:    s_endpgm
6000;
6001; GFX1132-LABEL: umax_i64_constant:
6002; GFX1132:       ; %bb.0: ; %entry
6003; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6004; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6005; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6006; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6007; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6008; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6009; GFX1132-NEXT:  ; %bb.1:
6010; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6011; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6012; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6013; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6014; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6015; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6016; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6017; GFX1132-NEXT:    buffer_gl0_inv
6018; GFX1132-NEXT:  .LBB22_2:
6019; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6020; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6021; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6022; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6023; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6024; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6025; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6026; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6027; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6028; GFX1132-NEXT:    s_mov_b32 s2, -1
6029; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6030; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6031; GFX1132-NEXT:    s_endpgm
6032entry:
  ; acq_rel unsigned max of the constant 5 against @local_var64; %old is the
  ; value the atomic returns.
6033  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Store the atomic's result through the kernel's addrspace(1) output pointer.
6034  store i64 %old, i64 addrspace(1)* %out
6035  ret void
6036}
6037
6038define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6039;
6040;
6041; GFX7LESS-LABEL: umin_i32_varying:
6042; GFX7LESS:       ; %bb.0: ; %entry
6043; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6044; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6045; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6046; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6047; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6048; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6049; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6050; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6051; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6052; GFX7LESS-NEXT:    s_endpgm
6053;
6054; GFX8-LABEL: umin_i32_varying:
6055; GFX8:       ; %bb.0: ; %entry
6056; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6057; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6058; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6059; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6060; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6061; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6062; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6063; GFX8-NEXT:    s_not_b64 exec, exec
6064; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6065; GFX8-NEXT:    s_not_b64 exec, exec
6066; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6067; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6068; GFX8-NEXT:    s_nop 1
6069; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6070; GFX8-NEXT:    s_nop 1
6071; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6072; GFX8-NEXT:    s_nop 1
6073; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6074; GFX8-NEXT:    s_nop 1
6075; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6076; GFX8-NEXT:    s_nop 1
6077; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6078; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6079; GFX8-NEXT:    s_nop 0
6080; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6081; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6082; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6083; GFX8-NEXT:    ; implicit-def: $vgpr0
6084; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6085; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6086; GFX8-NEXT:  ; %bb.1:
6087; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6088; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6089; GFX8-NEXT:    s_mov_b32 m0, -1
6090; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6091; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6092; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6093; GFX8-NEXT:  .LBB23_2:
6094; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6095; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6096; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6097; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6098; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6099; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6100; GFX8-NEXT:    s_mov_b32 s2, -1
6101; GFX8-NEXT:    s_nop 0
6102; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6103; GFX8-NEXT:    s_endpgm
6104;
6105; GFX9-LABEL: umin_i32_varying:
6106; GFX9:       ; %bb.0: ; %entry
6107; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6108; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6109; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6110; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6111; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6112; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6113; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6114; GFX9-NEXT:    s_not_b64 exec, exec
6115; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6116; GFX9-NEXT:    s_not_b64 exec, exec
6117; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6118; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6119; GFX9-NEXT:    s_nop 1
6120; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6121; GFX9-NEXT:    s_nop 1
6122; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6123; GFX9-NEXT:    s_nop 1
6124; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6125; GFX9-NEXT:    s_nop 1
6126; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6127; GFX9-NEXT:    s_nop 1
6128; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6129; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6130; GFX9-NEXT:    s_nop 0
6131; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6132; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6133; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6134; GFX9-NEXT:    ; implicit-def: $vgpr0
6135; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6136; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6137; GFX9-NEXT:  ; %bb.1:
6138; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6139; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6140; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6141; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6142; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6143; GFX9-NEXT:  .LBB23_2:
6144; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6146; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6147; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6148; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6149; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6150; GFX9-NEXT:    s_mov_b32 s2, -1
6151; GFX9-NEXT:    s_nop 0
6152; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6153; GFX9-NEXT:    s_endpgm
6154;
6155; GFX1064-LABEL: umin_i32_varying:
6156; GFX1064:       ; %bb.0: ; %entry
6157; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6158; GFX1064-NEXT:    s_not_b64 exec, exec
6159; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6160; GFX1064-NEXT:    s_not_b64 exec, exec
6161; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6162; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6163; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6164; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6165; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6166; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6167; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6168; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6169; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6170; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6171; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6172; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6173; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6174; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6175; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6176; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6177; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6178; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6179; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6180; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6181; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6182; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6183; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6184; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6185; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6186; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6187; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6188; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6189; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6190; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6191; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6192; GFX1064-NEXT:    s_mov_b32 s2, -1
6193; GFX1064-NEXT:    ; implicit-def: $vgpr0
6194; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6195; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6196; GFX1064-NEXT:  ; %bb.1:
6197; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6198; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6199; GFX1064-NEXT:    s_mov_b32 s3, s7
6200; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6201; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6202; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6203; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6204; GFX1064-NEXT:    buffer_gl0_inv
6205; GFX1064-NEXT:  .LBB23_2:
6206; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6207; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6208; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6209; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6210; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6211; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6212; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6213; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6214; GFX1064-NEXT:    s_endpgm
6215;
6216; GFX1032-LABEL: umin_i32_varying:
6217; GFX1032:       ; %bb.0: ; %entry
6218; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6219; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6220; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6221; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6222; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6223; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6224; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6225; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6226; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6227; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6228; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6229; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6230; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6231; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6232; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6233; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6234; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6235; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6236; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6237; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6238; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6239; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6240; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6241; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6242; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6243; GFX1032-NEXT:    s_mov_b32 s2, -1
6244; GFX1032-NEXT:    ; implicit-def: $vgpr0
6245; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6246; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6247; GFX1032-NEXT:  ; %bb.1:
6248; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6249; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6250; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6251; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6252; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6253; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6254; GFX1032-NEXT:    buffer_gl0_inv
6255; GFX1032-NEXT:  .LBB23_2:
6256; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6257; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6258; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6259; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6260; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6261; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6262; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6263; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6264; GFX1032-NEXT:    s_endpgm
6265;
6266; GFX1164-LABEL: umin_i32_varying:
6267; GFX1164:       ; %bb.0: ; %entry
6268; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6269; GFX1164-NEXT:    s_not_b64 exec, exec
6270; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6271; GFX1164-NEXT:    s_not_b64 exec, exec
6272; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6273; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6274; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6275; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6276; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6277; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6278; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6279; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6280; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6281; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6282; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6283; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6284; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6285; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6286; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6287; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6288; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6289; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6290; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6291; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6292; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6293; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6294; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6295; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6296; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6297; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6298; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6299; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6300; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6301; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6302; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6303; GFX1164-NEXT:    s_mov_b32 s2, -1
6304; GFX1164-NEXT:    ; implicit-def: $vgpr0
6305; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6306; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6307; GFX1164-NEXT:  ; %bb.1:
6308; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6309; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6310; GFX1164-NEXT:    s_mov_b32 s3, s7
6311; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6312; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6313; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6314; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6315; GFX1164-NEXT:    buffer_gl0_inv
6316; GFX1164-NEXT:  .LBB23_2:
6317; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6318; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6319; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6320; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6321; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6322; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6323; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6324; GFX1164-NEXT:    s_endpgm
6325;
6326; GFX1132-LABEL: umin_i32_varying:
6327; GFX1132:       ; %bb.0: ; %entry
6328; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6329; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6330; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6331; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6332; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6333; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6334; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6335; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6336; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6337; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6338; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6339; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6340; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6341; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6342; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6343; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6344; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6345; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6346; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6347; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6348; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6349; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6350; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6351; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6352; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6353; GFX1132-NEXT:    s_mov_b32 s2, -1
6354; GFX1132-NEXT:    ; implicit-def: $vgpr0
6355; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6356; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6357; GFX1132-NEXT:  ; %bb.1:
6358; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6359; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6360; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6361; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6362; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6363; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6364; GFX1132-NEXT:    buffer_gl0_inv
6365; GFX1132-NEXT:  .LBB23_2:
6366; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6367; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6368; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6369; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6370; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6371; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6372; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6373; GFX1132-NEXT:    s_endpgm
6374entry:
6375  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6376  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6377  store i32 %old, i32 addrspace(1)* %out
6378  ret void
6379}
6380
; umin_i64_constant: the kernel performs `atomicrmw umin` with the i64
; constant 5 on the addrspace(3) global @local_var64 (acq_rel ordering) and
; stores the returned old value to %out.
;
; The autogenerated check groups below record the expected llc output for each
; RUN configuration. The shape they all share:
;   * v_mbcnt_lo (followed by v_mbcnt_hi in the wave64 configurations) feeds a
;     compare against 0; s_and_saveexec + s_cbranch_execz then restrict %bb.1,
;     which holds the single ds_min_rtn_u64, to the lanes that passed it. The
;     data operand v[0:1] of that instruction is loaded with 5 and 0.
;   * After exec is restored, v_readfirstlane_b32 copies both halves of the DS
;     result into SGPRs. v_cmp_lt_u64 plus two v_cndmask_b32 then combine that
;     value with a per-lane operand chosen between 5 and -1 by the earlier
;     compare result, and the outcome is what gets stored to the buffer.
; Visible per-target differences: the GFX7LESS and GFX8 groups write -1 to m0
; before the DS instruction; the gfx1010 and gfx1100 groups place
; s_waitcnt_vscnt before it and buffer_gl0_inv after it; the gfx1100 groups use
; the s_load_b64 / buffer_store_b64 mnemonics.
6381define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6382;
6383;
6384; GFX7LESS-LABEL: umin_i64_constant:
6385; GFX7LESS:       ; %bb.0: ; %entry
6386; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6387; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6388; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6389; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6390; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6391; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6392; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6393; GFX7LESS-NEXT:  ; %bb.1:
6394; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6395; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6396; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6397; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6398; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6399; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6400; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6401; GFX7LESS-NEXT:  .LBB24_2:
6402; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6403; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6404; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6405; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6406; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6407; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6408; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6409; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6410; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6411; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6412; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6413; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6414; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6415; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6416; GFX7LESS-NEXT:    s_endpgm
6417;
6418; GFX8-LABEL: umin_i64_constant:
6419; GFX8:       ; %bb.0: ; %entry
6420; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6421; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6422; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6423; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6424; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6425; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6426; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6427; GFX8-NEXT:  ; %bb.1:
6428; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6429; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6430; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6431; GFX8-NEXT:    s_mov_b32 m0, -1
6432; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6433; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6434; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6435; GFX8-NEXT:  .LBB24_2:
6436; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6437; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6438; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6439; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6440; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6441; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6442; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6443; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6444; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6445; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6446; GFX8-NEXT:    s_mov_b32 s2, -1
6447; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6448; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6449; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6450; GFX8-NEXT:    s_endpgm
6451;
6452; GFX9-LABEL: umin_i64_constant:
6453; GFX9:       ; %bb.0: ; %entry
6454; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6455; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6456; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6457; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6458; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6459; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6460; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6461; GFX9-NEXT:  ; %bb.1:
6462; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6463; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6464; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6465; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6466; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6468; GFX9-NEXT:  .LBB24_2:
6469; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6471; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6472; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6473; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6474; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6475; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6476; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6477; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6478; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6479; GFX9-NEXT:    s_mov_b32 s2, -1
6480; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6481; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6482; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6483; GFX9-NEXT:    s_endpgm
6484;
6485; GFX1064-LABEL: umin_i64_constant:
6486; GFX1064:       ; %bb.0: ; %entry
6487; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6488; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6489; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6490; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6491; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6492; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6493; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6494; GFX1064-NEXT:  ; %bb.1:
6495; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6496; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6497; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6498; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6499; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6500; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6501; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6502; GFX1064-NEXT:    buffer_gl0_inv
6503; GFX1064-NEXT:  .LBB24_2:
6504; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6505; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6506; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6507; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6508; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6509; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6510; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6511; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6512; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6513; GFX1064-NEXT:    s_mov_b32 s2, -1
6514; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6515; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6516; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6517; GFX1064-NEXT:    s_endpgm
6518;
6519; GFX1032-LABEL: umin_i64_constant:
6520; GFX1032:       ; %bb.0: ; %entry
6521; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6522; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6523; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6524; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6525; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6526; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6527; GFX1032-NEXT:  ; %bb.1:
6528; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6529; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6530; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6531; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6532; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6533; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6534; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6535; GFX1032-NEXT:    buffer_gl0_inv
6536; GFX1032-NEXT:  .LBB24_2:
6537; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6538; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6539; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6540; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6541; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6542; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6543; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6544; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6545; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6546; GFX1032-NEXT:    s_mov_b32 s2, -1
6547; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6548; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6549; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6550; GFX1032-NEXT:    s_endpgm
6551;
6552; GFX1164-LABEL: umin_i64_constant:
6553; GFX1164:       ; %bb.0: ; %entry
6554; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6555; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6556; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6557; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6558; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6559; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6560; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6561; GFX1164-NEXT:  ; %bb.1:
6562; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6563; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6564; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6565; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6566; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6567; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6568; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6569; GFX1164-NEXT:    buffer_gl0_inv
6570; GFX1164-NEXT:  .LBB24_2:
6571; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6572; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6573; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6574; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6575; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6576; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6577; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6578; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6579; GFX1164-NEXT:    s_mov_b32 s2, -1
6580; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6581; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6582; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6583; GFX1164-NEXT:    s_endpgm
6584;
6585; GFX1132-LABEL: umin_i64_constant:
6586; GFX1132:       ; %bb.0: ; %entry
6587; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6588; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6589; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6590; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6591; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6592; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6593; GFX1132-NEXT:  ; %bb.1:
6594; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6595; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6596; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6597; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6598; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6599; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6600; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6601; GFX1132-NEXT:    buffer_gl0_inv
6602; GFX1132-NEXT:  .LBB24_2:
6603; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6604; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6605; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6606; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6607; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6608; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6609; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6610; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6611; GFX1132-NEXT:    s_mov_b32 s2, -1
6612; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6613; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6614; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6615; GFX1132-NEXT:    s_endpgm
; Input IR that the check groups above were generated from.
6616entry:
6617  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
6618  store i64 %old, i64 addrspace(1)* %out
6619  ret void
6620}
6621