1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
9
; Declaration of the llvm.amdgcn.workitem.id.x intrinsic: no arguments, i32 result.
10declare i32 @llvm.amdgcn.workitem.id.x()
11
; Address space 3 globals that the atomicrmw instructions in this test operate on.
; Both are initialized to undef; the i32 is 4-byte aligned, the i64 8-byte aligned.
12@local_var32 = addrspace(3) global i32 undef, align 4
13@local_var64 = addrspace(3) global i64 undef, align 8
14
15; Show what the atomic optimization pass will do for local pointers.
16
; add_i32_constant: atomicrmw add of the constant 5 to @local_var32 (acq_rel);
; the value returned by the atomic is stored through %out.
;
; In the checked output for every run line, the lane whose mbcnt result is 0
; issues a single ds_add_rtn_u32 whose addend is 5 times the bit count of the
; saved exec mask. Each lane then rebuilds its own result as
; readfirstlane(returned value) + 5 * mbcnt via v_mad_u32_u24.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
18;
19;
20; GFX7LESS-LABEL: add_i32_constant:
21; GFX7LESS:       ; %bb.0: ; %entry
22; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
23; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
25; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
26; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
27; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
28; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
29; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
30; GFX7LESS-NEXT:  ; %bb.1:
31; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
33; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
34; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
35; GFX7LESS-NEXT:    s_mov_b32 m0, -1
36; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:  .LBB0_2:
40; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
43; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
44; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
45; GFX7LESS-NEXT:    s_mov_b32 s2, -1
46; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; GFX7LESS-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
52; GFX8-NEXT:    s_mov_b64 s[2:3], exec
53; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
54; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
55; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
56; GFX8-NEXT:    ; implicit-def: $vgpr1
57; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
58; GFX8-NEXT:    s_cbranch_execz .LBB0_2
59; GFX8-NEXT:  ; %bb.1:
60; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
61; GFX8-NEXT:    s_mul_i32 s2, s2, 5
62; GFX8-NEXT:    v_mov_b32_e32 v1, 0
63; GFX8-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-NEXT:    s_mov_b32 m0, -1
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:  .LBB0_2:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
73; GFX8-NEXT:    s_mov_b32 s3, 0xf000
74; GFX8-NEXT:    s_mov_b32 s2, -1
75; GFX8-NEXT:    s_nop 1
76; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-LABEL: add_i32_constant:
80; GFX9:       ; %bb.0: ; %entry
81; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
82; GFX9-NEXT:    s_mov_b64 s[2:3], exec
83; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
84; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
85; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
86; GFX9-NEXT:    ; implicit-def: $vgpr1
87; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX9-NEXT:    s_cbranch_execz .LBB0_2
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
91; GFX9-NEXT:    s_mul_i32 s2, s2, 5
92; GFX9-NEXT:    v_mov_b32_e32 v1, 0
93; GFX9-NEXT:    v_mov_b32_e32 v2, s2
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
101; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
102; GFX9-NEXT:    s_mov_b32 s3, 0xf000
103; GFX9-NEXT:    s_mov_b32 s2, -1
104; GFX9-NEXT:    s_nop 1
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
111; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
112; GFX1064-NEXT:    ; implicit-def: $vgpr1
113; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
114; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
115; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
116; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
117; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
118; GFX1064-NEXT:  ; %bb.1:
119; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
120; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
121; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
122; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
123; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
124; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
126; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1064-NEXT:    buffer_gl0_inv
128; GFX1064-NEXT:  .LBB0_2:
129; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
130; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
131; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
132; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
133; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
134; GFX1064-NEXT:    s_mov_b32 s2, -1
135; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
137; GFX1064-NEXT:    s_endpgm
138;
139; GFX1032-LABEL: add_i32_constant:
140; GFX1032:       ; %bb.0: ; %entry
141; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
142; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
143; GFX1032-NEXT:    ; implicit-def: $vgpr1
144; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
145; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
146; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
147; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
148; GFX1032-NEXT:  ; %bb.1:
149; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
150; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
151; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
152; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
153; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
155; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
156; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1032-NEXT:    buffer_gl0_inv
158; GFX1032-NEXT:  .LBB0_2:
159; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
160; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
161; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
162; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
163; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
164; GFX1032-NEXT:    s_mov_b32 s2, -1
165; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
167; GFX1032-NEXT:    s_endpgm
168;
169; GFX1164-LABEL: add_i32_constant:
170; GFX1164:       ; %bb.0: ; %entry
171; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
172; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
173; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
174; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
175; GFX1164-NEXT:    ; implicit-def: $vgpr1
176; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
177; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
178; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
179; GFX1164-NEXT:  ; %bb.1:
180; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
181; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
182; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
183; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
184; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
185; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
186; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
187; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX1164-NEXT:    buffer_gl0_inv
189; GFX1164-NEXT:  .LBB0_2:
190; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
191; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
192; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
193; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
194; GFX1164-NEXT:    s_mov_b32 s2, -1
195; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
197; GFX1164-NEXT:    s_endpgm
198;
199; GFX1132-LABEL: add_i32_constant:
200; GFX1132:       ; %bb.0: ; %entry
201; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
202; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
203; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
204; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
205; GFX1132-NEXT:    ; implicit-def: $vgpr1
206; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
207; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
208; GFX1132-NEXT:  ; %bb.1:
209; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
210; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
211; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
212; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
213; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
214; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
215; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
216; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX1132-NEXT:    buffer_gl0_inv
218; GFX1132-NEXT:  .LBB0_2:
219; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
220; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
221; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
222; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
223; GFX1132-NEXT:    s_mov_b32 s2, -1
224; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
226; GFX1132-NEXT:    s_endpgm
; IR under test: one acq_rel atomic add of the constant 5, old value written to %out.
227entry:
228  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
229  store i32 %old, i32 addrspace(1)* %out
230  ret void
231}
232
; add_i32_uniform: atomicrmw add of the kernel argument %additive to
; @local_var32 (acq_rel); the value returned by the atomic is stored through %out.
;
; In the checked output for every run line, the lane whose mbcnt result is 0
; issues a single ds_add_rtn_u32 whose addend is the loaded %additive multiplied
; (s_mul_i32) by the bit count of the saved exec mask. Each lane then computes
; readfirstlane(returned value) + %additive * mbcnt: a v_mul_lo_u32 followed by
; an add in the GFX7LESS, GFX8 and GFX9 checks, and a single v_mad_u64_u32 in
; the GFX10xx and GFX11xx checks.
233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
234;
235;
236; GFX7LESS-LABEL: add_i32_uniform:
237; GFX7LESS:       ; %bb.0: ; %entry
238; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
239; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
240; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
241; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
242; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
243; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
244; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
245; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
246; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
247; GFX7LESS-NEXT:  ; %bb.1:
248; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
249; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
251; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
252; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
253; GFX7LESS-NEXT:    s_mov_b32 m0, -1
254; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7LESS-NEXT:  .LBB1_2:
258; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
259; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
261; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
262; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
263; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
264; GFX7LESS-NEXT:    s_mov_b32 s6, -1
265; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
266; GFX7LESS-NEXT:    s_endpgm
267;
268; GFX8-LABEL: add_i32_uniform:
269; GFX8:       ; %bb.0: ; %entry
270; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
271; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
272; GFX8-NEXT:    s_mov_b64 s[2:3], exec
273; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
274; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
275; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
276; GFX8-NEXT:    ; implicit-def: $vgpr1
277; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
278; GFX8-NEXT:    s_cbranch_execz .LBB1_2
279; GFX8-NEXT:  ; %bb.1:
280; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
281; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX8-NEXT:    s_mul_i32 s2, s6, s2
283; GFX8-NEXT:    v_mov_b32_e32 v1, 0
284; GFX8-NEXT:    v_mov_b32_e32 v2, s2
285; GFX8-NEXT:    s_mov_b32 m0, -1
286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
288; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX8-NEXT:  .LBB1_2:
290; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
293; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
294; GFX8-NEXT:    s_mov_b32 s7, 0xf000
295; GFX8-NEXT:    s_mov_b32 s6, -1
296; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
297; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; GFX8-NEXT:    s_endpgm
299;
300; GFX9-LABEL: add_i32_uniform:
301; GFX9:       ; %bb.0: ; %entry
302; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
303; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
304; GFX9-NEXT:    s_mov_b64 s[2:3], exec
305; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
306; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
307; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
308; GFX9-NEXT:    ; implicit-def: $vgpr1
309; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
310; GFX9-NEXT:    s_cbranch_execz .LBB1_2
311; GFX9-NEXT:  ; %bb.1:
312; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    s_mul_i32 s2, s6, s2
315; GFX9-NEXT:    v_mov_b32_e32 v1, 0
316; GFX9-NEXT:    v_mov_b32_e32 v2, s2
317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-NEXT:  .LBB1_2:
321; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
324; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
325; GFX9-NEXT:    s_mov_b32 s7, 0xf000
326; GFX9-NEXT:    s_mov_b32 s6, -1
327; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
328; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
329; GFX9-NEXT:    s_endpgm
330;
331; GFX1064-LABEL: add_i32_uniform:
332; GFX1064:       ; %bb.0: ; %entry
333; GFX1064-NEXT:    s_clause 0x1
334; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
335; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
336; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
337; GFX1064-NEXT:    ; implicit-def: $vgpr1
338; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
339; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
340; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
341; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
342; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
343; GFX1064-NEXT:  ; %bb.1:
344; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
345; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
346; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
348; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
349; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
350; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
351; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
352; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX1064-NEXT:    buffer_gl0_inv
354; GFX1064-NEXT:  .LBB1_2:
355; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
356; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
357; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
358; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
359; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
361; GFX1064-NEXT:    s_mov_b32 s6, -1
362; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
363; GFX1064-NEXT:    s_endpgm
364;
365; GFX1032-LABEL: add_i32_uniform:
366; GFX1032:       ; %bb.0: ; %entry
367; GFX1032-NEXT:    s_clause 0x1
368; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
369; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
370; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
371; GFX1032-NEXT:    ; implicit-def: $vgpr1
372; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
373; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
374; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
375; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
376; GFX1032-NEXT:  ; %bb.1:
377; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
378; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
379; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
381; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
382; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
383; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
384; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
385; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX1032-NEXT:    buffer_gl0_inv
387; GFX1032-NEXT:  .LBB1_2:
388; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
389; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
390; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
391; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
392; GFX1032-NEXT:    s_mov_b32 s6, -1
393; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
395; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
396; GFX1032-NEXT:    s_endpgm
397;
398; GFX1164-LABEL: add_i32_uniform:
399; GFX1164:       ; %bb.0: ; %entry
400; GFX1164-NEXT:    s_clause 0x1
401; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
402; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
403; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
404; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
405; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
406; GFX1164-NEXT:    ; implicit-def: $vgpr1
407; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
408; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
409; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
410; GFX1164-NEXT:  ; %bb.1:
411; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
412; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
413; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
415; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
416; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
417; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
418; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
419; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX1164-NEXT:    buffer_gl0_inv
421; GFX1164-NEXT:  .LBB1_2:
422; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
423; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
424; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
425; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
427; GFX1164-NEXT:    s_mov_b32 s6, -1
428; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
429; GFX1164-NEXT:    s_endpgm
430;
431; GFX1132-LABEL: add_i32_uniform:
432; GFX1132:       ; %bb.0: ; %entry
433; GFX1132-NEXT:    s_clause 0x1
434; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
435; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
436; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
437; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
438; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
439; GFX1132-NEXT:    ; implicit-def: $vgpr1
440; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
441; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
442; GFX1132-NEXT:  ; %bb.1:
443; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
444; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
445; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
447; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
448; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
449; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
450; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
451; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX1132-NEXT:    buffer_gl0_inv
453; GFX1132-NEXT:  .LBB1_2:
454; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
455; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
456; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
457; GFX1132-NEXT:    s_mov_b32 s6, -1
458; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
460; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
461; GFX1132-NEXT:    s_endpgm
; IR under test: one acq_rel atomic add of the %additive argument, old value written to %out.
462entry:
463  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
464  store i32 %old, i32 addrspace(1)* %out
465  ret void
466}
467
468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
469;
470;
471; GFX7LESS-LABEL: add_i32_varying:
472; GFX7LESS:       ; %bb.0: ; %entry
473; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
474; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
475; GFX7LESS-NEXT:    s_mov_b32 m0, -1
476; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
478; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
480; GFX7LESS-NEXT:    s_mov_b32 s2, -1
481; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
482; GFX7LESS-NEXT:    s_endpgm
483;
484; GFX8-LABEL: add_i32_varying:
485; GFX8:       ; %bb.0: ; %entry
486; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
487; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
488; GFX8-NEXT:    v_mov_b32_e32 v1, 0
489; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
490; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
491; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
492; GFX8-NEXT:    v_mov_b32_e32 v2, v0
493; GFX8-NEXT:    s_not_b64 exec, exec
494; GFX8-NEXT:    v_mov_b32_e32 v2, 0
495; GFX8-NEXT:    s_not_b64 exec, exec
496; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
497; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
498; GFX8-NEXT:    s_nop 1
499; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
500; GFX8-NEXT:    s_nop 1
501; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
502; GFX8-NEXT:    s_nop 1
503; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX8-NEXT:    s_nop 1
505; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
506; GFX8-NEXT:    s_nop 1
507; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
508; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
509; GFX8-NEXT:    s_nop 0
510; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
511; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
512; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
513; GFX8-NEXT:    ; implicit-def: $vgpr0
514; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
515; GFX8-NEXT:    s_cbranch_execz .LBB2_2
516; GFX8-NEXT:  ; %bb.1:
517; GFX8-NEXT:    v_mov_b32_e32 v0, 0
518; GFX8-NEXT:    v_mov_b32_e32 v3, s4
519; GFX8-NEXT:    s_mov_b32 m0, -1
520; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
522; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX8-NEXT:  .LBB2_2:
524; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
525; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
527; GFX8-NEXT:    v_mov_b32_e32 v0, v1
528; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
529; GFX8-NEXT:    s_mov_b32 s3, 0xf000
530; GFX8-NEXT:    s_mov_b32 s2, -1
531; GFX8-NEXT:    s_nop 0
532; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
533; GFX8-NEXT:    s_endpgm
534;
535; GFX9-LABEL: add_i32_varying:
536; GFX9:       ; %bb.0: ; %entry
537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
538; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
539; GFX9-NEXT:    v_mov_b32_e32 v1, 0
540; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
541; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
542; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
543; GFX9-NEXT:    v_mov_b32_e32 v2, v0
544; GFX9-NEXT:    s_not_b64 exec, exec
545; GFX9-NEXT:    v_mov_b32_e32 v2, 0
546; GFX9-NEXT:    s_not_b64 exec, exec
547; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
548; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
549; GFX9-NEXT:    s_nop 1
550; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
551; GFX9-NEXT:    s_nop 1
552; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
553; GFX9-NEXT:    s_nop 1
554; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
555; GFX9-NEXT:    s_nop 1
556; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
557; GFX9-NEXT:    s_nop 1
558; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
559; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
560; GFX9-NEXT:    s_nop 0
561; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
562; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
563; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
564; GFX9-NEXT:    ; implicit-def: $vgpr0
565; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
566; GFX9-NEXT:    s_cbranch_execz .LBB2_2
567; GFX9-NEXT:  ; %bb.1:
568; GFX9-NEXT:    v_mov_b32_e32 v0, 0
569; GFX9-NEXT:    v_mov_b32_e32 v3, s4
570; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX9-NEXT:  .LBB2_2:
574; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
575; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
577; GFX9-NEXT:    v_mov_b32_e32 v0, v1
578; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
579; GFX9-NEXT:    s_mov_b32 s3, 0xf000
580; GFX9-NEXT:    s_mov_b32 s2, -1
581; GFX9-NEXT:    s_nop 0
582; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
583; GFX9-NEXT:    s_endpgm
584;
585; GFX1064-LABEL: add_i32_varying:
586; GFX1064:       ; %bb.0: ; %entry
587; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
588; GFX1064-NEXT:    s_not_b64 exec, exec
589; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
590; GFX1064-NEXT:    s_not_b64 exec, exec
591; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
592; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
593; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
594; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
595; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
596; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
597; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
598; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
599; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
600; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
601; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
602; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
603; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
604; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
605; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
606; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
607; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
608; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
609; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
610; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
611; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
612; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
613; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
614; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
615; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
616; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
617; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
618; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
619; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
620; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
621; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
622; GFX1064-NEXT:    s_mov_b32 s2, -1
623; GFX1064-NEXT:    ; implicit-def: $vgpr0
624; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
625; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
626; GFX1064-NEXT:  ; %bb.1:
627; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
628; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
629; GFX1064-NEXT:    s_mov_b32 s3, s7
630; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
631; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
632; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v4
633; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX1064-NEXT:    buffer_gl0_inv
635; GFX1064-NEXT:  .LBB2_2:
636; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
637; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
638; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
639; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
640; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
641; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
642; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
644; GFX1064-NEXT:    s_endpgm
645;
646; GFX1032-LABEL: add_i32_varying:
647; GFX1032:       ; %bb.0: ; %entry
648; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
649; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
650; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
651; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
652; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
653; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
654; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
655; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
656; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
657; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
658; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
659; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
660; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
661; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
662; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
663; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
664; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
665; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
666; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
667; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
668; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
669; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
670; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
671; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
672; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
673; GFX1032-NEXT:    s_mov_b32 s2, -1
674; GFX1032-NEXT:    ; implicit-def: $vgpr0
675; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
676; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
677; GFX1032-NEXT:  ; %bb.1:
678; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
679; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
680; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
682; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v4
683; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX1032-NEXT:    buffer_gl0_inv
685; GFX1032-NEXT:  .LBB2_2:
686; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
687; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
688; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
689; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
690; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
691; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
692; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
694; GFX1032-NEXT:    s_endpgm
695;
696; GFX1164-LABEL: add_i32_varying:
697; GFX1164:       ; %bb.0: ; %entry
698; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
699; GFX1164-NEXT:    s_not_b64 exec, exec
700; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
701; GFX1164-NEXT:    s_not_b64 exec, exec
702; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
703; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
704; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
705; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
706; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
707; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
708; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
709; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
710; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
711; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
712; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
713; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
714; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
715; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
716; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
717; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
718; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
719; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
720; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
721; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
722; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
723; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
724; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
725; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
726; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
727; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
728; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
729; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
730; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
731; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
732; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
733; GFX1164-NEXT:    s_mov_b32 s2, -1
734; GFX1164-NEXT:    ; implicit-def: $vgpr0
735; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
736; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
737; GFX1164-NEXT:  ; %bb.1:
738; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
739; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
740; GFX1164-NEXT:    s_mov_b32 s3, s7
741; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
742; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
743; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v4
744; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX1164-NEXT:    buffer_gl0_inv
746; GFX1164-NEXT:  .LBB2_2:
747; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
748; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
749; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
750; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
751; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
752; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
754; GFX1164-NEXT:    s_endpgm
755;
756; GFX1132-LABEL: add_i32_varying:
757; GFX1132:       ; %bb.0: ; %entry
758; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
759; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
760; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
761; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
762; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
763; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
764; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
765; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
766; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
767; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
768; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
769; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
770; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
771; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
772; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
773; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
774; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
775; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
776; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
777; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
778; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
779; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
780; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
781; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
782; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
783; GFX1132-NEXT:    s_mov_b32 s2, -1
784; GFX1132-NEXT:    ; implicit-def: $vgpr0
785; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
786; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
787; GFX1132-NEXT:  ; %bb.1:
788; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
789; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
790; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
791; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
792; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v4
793; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX1132-NEXT:    buffer_gl0_inv
795; GFX1132-NEXT:  .LBB2_2:
796; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
797; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
798; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
799; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
800; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
801; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
803; GFX1132-NEXT:    s_endpgm
804entry:
805  %lane = call i32 @llvm.amdgcn.workitem.id.x()
806  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
807  store i32 %old, i32 addrspace(1)* %out
808  ret void
809}
810
; add_i32_varying_nouse: every lane adds its own workitem id to @local_var32
; with an acq_rel atomicrmw, and the returned value (%old) has no uses.
; What the assertions below pin down:
;  * the GFX7LESS run issues a plain per-lane ds_add_u32 with no cross-lane
;    reduction;
;  * the GFX8 and GFX9 runs sum the lane values with DPP row_shr / row_bcast
;    adds, read the total from lane 63, and let only the lane whose mbcnt
;    result is 0 issue one ds_add_u32;
;  * the GFX10 and GFX11 runs sum with row_xmask DPP adds plus
;    v_permlanex16_b32 (the GFX1164 run additionally uses v_permlane64_b32).
; Every run emits the non-returning ds_add_u32 form rather than ds_add_rtn_u32.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
811define amdgpu_kernel void @add_i32_varying_nouse() {
812; GFX7LESS-LABEL: add_i32_varying_nouse:
813; GFX7LESS:       ; %bb.0: ; %entry
814; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
815; GFX7LESS-NEXT:    s_mov_b32 m0, -1
816; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX7LESS-NEXT:    ds_add_u32 v1, v0
818; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX7LESS-NEXT:    s_endpgm
820;
821; GFX8-LABEL: add_i32_varying_nouse:
822; GFX8:       ; %bb.0: ; %entry
823; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
824; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
825; GFX8-NEXT:    v_mov_b32_e32 v1, v0
826; GFX8-NEXT:    s_not_b64 exec, exec
827; GFX8-NEXT:    v_mov_b32_e32 v1, 0
828; GFX8-NEXT:    s_not_b64 exec, exec
829; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
830; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
831; GFX8-NEXT:    s_nop 1
832; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
833; GFX8-NEXT:    s_nop 1
834; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
835; GFX8-NEXT:    s_nop 1
836; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
837; GFX8-NEXT:    s_nop 1
838; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
839; GFX8-NEXT:    s_nop 1
840; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
841; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
842; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
843; GFX8-NEXT:    s_mov_b32 s0, s2
844; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
845; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
846; GFX8-NEXT:    s_cbranch_execz .LBB3_2
847; GFX8-NEXT:  ; %bb.1:
848; GFX8-NEXT:    v_mov_b32_e32 v0, 0
849; GFX8-NEXT:    v_mov_b32_e32 v2, s0
850; GFX8-NEXT:    s_mov_b32 m0, -1
851; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX8-NEXT:    ds_add_u32 v0, v2
853; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX8-NEXT:  .LBB3_2:
855; GFX8-NEXT:    s_endpgm
856;
857; GFX9-LABEL: add_i32_varying_nouse:
858; GFX9:       ; %bb.0: ; %entry
859; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
860; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
861; GFX9-NEXT:    v_mov_b32_e32 v1, v0
862; GFX9-NEXT:    s_not_b64 exec, exec
863; GFX9-NEXT:    v_mov_b32_e32 v1, 0
864; GFX9-NEXT:    s_not_b64 exec, exec
865; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
866; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX9-NEXT:    s_nop 1
868; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX9-NEXT:    s_nop 1
870; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
871; GFX9-NEXT:    s_nop 1
872; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
873; GFX9-NEXT:    s_nop 1
874; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
875; GFX9-NEXT:    s_nop 1
876; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
877; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
878; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
879; GFX9-NEXT:    s_mov_b32 s0, s2
880; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
881; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
882; GFX9-NEXT:    s_cbranch_execz .LBB3_2
883; GFX9-NEXT:  ; %bb.1:
884; GFX9-NEXT:    v_mov_b32_e32 v0, 0
885; GFX9-NEXT:    v_mov_b32_e32 v2, s0
886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX9-NEXT:    ds_add_u32 v0, v2
888; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX9-NEXT:  .LBB3_2:
890; GFX9-NEXT:    s_endpgm
891;
892; GFX1064-LABEL: add_i32_varying_nouse:
893; GFX1064:       ; %bb.0: ; %entry
894; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
895; GFX1064-NEXT:    s_not_b64 exec, exec
896; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
897; GFX1064-NEXT:    s_not_b64 exec, exec
898; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
899; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
900; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
901; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
902; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
903; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
904; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
905; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
906; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
907; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
908; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
909; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
910; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
911; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
912; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
913; GFX1064-NEXT:    s_add_i32 s0, s2, s3
914; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
915; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
916; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
917; GFX1064-NEXT:  ; %bb.1:
918; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
919; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
920; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
921; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
922; GFX1064-NEXT:    ds_add_u32 v0, v3
923; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX1064-NEXT:    buffer_gl0_inv
925; GFX1064-NEXT:  .LBB3_2:
926; GFX1064-NEXT:    s_endpgm
927;
928; GFX1032-LABEL: add_i32_varying_nouse:
929; GFX1032:       ; %bb.0: ; %entry
930; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
931; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
932; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
933; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
934; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
935; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
936; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
937; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
938; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
939; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
940; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
941; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
942; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
943; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
944; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
945; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
946; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
947; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
948; GFX1032-NEXT:  ; %bb.1:
949; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
950; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
951; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
952; GFX1032-NEXT:    ds_add_u32 v3, v0
953; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX1032-NEXT:    buffer_gl0_inv
955; GFX1032-NEXT:  .LBB3_2:
956; GFX1032-NEXT:    s_endpgm
957;
958; GFX1164-LABEL: add_i32_varying_nouse:
959; GFX1164:       ; %bb.0: ; %entry
960; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
961; GFX1164-NEXT:    s_not_b64 exec, exec
962; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
963; GFX1164-NEXT:    s_not_b64 exec, exec
964; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
965; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
966; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
967; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
968; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
969; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
970; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
971; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
972; GFX1164-NEXT:    v_permlane64_b32 v2, v1
973; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
974; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
975; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
976; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
977; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
978; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
979; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
980; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
981; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
982; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
983; GFX1164-NEXT:  ; %bb.1:
984; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
985; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
987; GFX1164-NEXT:    ds_add_u32 v3, v0
988; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX1164-NEXT:    buffer_gl0_inv
990; GFX1164-NEXT:  .LBB3_2:
991; GFX1164-NEXT:    s_endpgm
992;
993; GFX1132-LABEL: add_i32_varying_nouse:
994; GFX1132:       ; %bb.0: ; %entry
995; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
996; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
997; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
998; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
999; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
1000; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1001; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1002; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1003; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1004; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
1005; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1006; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1007; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
1008; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1009; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
1010; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1011; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
1012; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1013; GFX1132-NEXT:  ; %bb.1:
1014; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1015; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1016; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1017; GFX1132-NEXT:    ds_add_u32 v3, v0
1018; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX1132-NEXT:    buffer_gl0_inv
1020; GFX1132-NEXT:  .LBB3_2:
1021; GFX1132-NEXT:    s_endpgm
1022entry:
1023  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; No use of %old follows; only the memory update of @local_var32 remains.
1024  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1025  ret void
1026}
1027
; add_i64_constant: atomicrmw add of the constant 5 to @local_var64 (acq_rel),
; with the returned old value stored to %out.
; What the assertions below pin down, on every run line:
;  * the lane whose mbcnt result is 0 multiplies the popcount of the saved exec
;    mask (s_bcnt1) by 5 and issues a single ds_add_rtn_u64;
;  * after the branch rejoins, v_readfirstlane_b32 fetches the returned value
;    and each lane adds 5 times its mbcnt result before the 64-bit store.
; The runs configured with +wavefrontsize32 (GFX1032, GFX1132) use
; s_bcnt1_i32_b32 and only v_mbcnt_lo_u32_b32; the other runs use
; s_bcnt1_i32_b64 and both the lo and hi mbcnt instructions.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1028define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1029;
1030;
1031; GFX7LESS-LABEL: add_i64_constant:
1032; GFX7LESS:       ; %bb.0: ; %entry
1033; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1034; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1035; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1036; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
1037; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1038; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1039; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1040; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1041; GFX7LESS-NEXT:  ; %bb.1:
1042; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1043; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
1044; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1045; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
1046; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1047; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1049; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1050; GFX7LESS-NEXT:  .LBB4_2:
1051; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1052; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1054; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1055; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1056; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1057; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1058; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1059; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1060; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1061; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1062; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1063; GFX7LESS-NEXT:    s_endpgm
1064;
1065; GFX8-LABEL: add_i64_constant:
1066; GFX8:       ; %bb.0: ; %entry
1067; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1068; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1069; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1070; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1071; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1072; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1073; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1074; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1075; GFX8-NEXT:  ; %bb.1:
1076; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1077; GFX8-NEXT:    s_mul_i32 s4, s4, 5
1078; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1079; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1080; GFX8-NEXT:    s_mov_b32 m0, -1
1081; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1083; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX8-NEXT:  .LBB4_2:
1085; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1086; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1087; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1088; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1089; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1090; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1091; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1092; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1093; GFX8-NEXT:    s_mov_b32 s2, -1
1094; GFX8-NEXT:    s_nop 2
1095; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1096; GFX8-NEXT:    s_endpgm
1097;
1098; GFX9-LABEL: add_i64_constant:
1099; GFX9:       ; %bb.0: ; %entry
1100; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1101; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1102; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1103; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1104; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1105; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1106; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1107; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1108; GFX9-NEXT:  ; %bb.1:
1109; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1110; GFX9-NEXT:    s_mul_i32 s4, s4, 5
1111; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1112; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1113; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX9-NEXT:  .LBB4_2:
1117; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1120; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
1121; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1122; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1123; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1124; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1125; GFX9-NEXT:    s_mov_b32 s2, -1
1126; GFX9-NEXT:    s_nop 2
1127; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1128; GFX9-NEXT:    s_endpgm
1129;
1130; GFX1064-LABEL: add_i64_constant:
1131; GFX1064:       ; %bb.0: ; %entry
1132; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1133; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1134; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1135; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1136; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1137; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1138; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1139; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1140; GFX1064-NEXT:  ; %bb.1:
1141; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1142; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1143; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
1144; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1145; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1146; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1147; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1148; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX1064-NEXT:    buffer_gl0_inv
1150; GFX1064-NEXT:  .LBB4_2:
1151; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1152; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1153; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1154; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1155; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1156; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1157; GFX1064-NEXT:    s_mov_b32 s2, -1
1158; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1159; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1160; GFX1064-NEXT:    s_endpgm
1161;
1162; GFX1032-LABEL: add_i64_constant:
1163; GFX1032:       ; %bb.0: ; %entry
1164; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1165; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1166; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1167; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1168; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1169; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1170; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1171; GFX1032-NEXT:  ; %bb.1:
1172; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1173; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1174; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1175; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
1176; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1177; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1178; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1179; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX1032-NEXT:    buffer_gl0_inv
1181; GFX1032-NEXT:  .LBB4_2:
1182; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1183; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1184; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1185; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1186; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1187; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1188; GFX1032-NEXT:    s_mov_b32 s2, -1
1189; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1191; GFX1032-NEXT:    s_endpgm
1192;
1193; GFX1164-LABEL: add_i64_constant:
1194; GFX1164:       ; %bb.0: ; %entry
1195; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1196; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1197; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1198; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1199; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
1200; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1201; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1202; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1203; GFX1164-NEXT:  ; %bb.1:
1204; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1205; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1206; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
1207; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
1208; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1209; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1210; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1211; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX1164-NEXT:    buffer_gl0_inv
1213; GFX1164-NEXT:  .LBB4_2:
1214; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1215; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1216; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1217; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1218; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1219; GFX1164-NEXT:    s_mov_b32 s2, -1
1220; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1221; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1222; GFX1164-NEXT:    s_endpgm
1223;
1224; GFX1132-LABEL: add_i64_constant:
1225; GFX1132:       ; %bb.0: ; %entry
1226; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1227; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1228; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1229; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1230; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1231; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1232; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1233; GFX1132-NEXT:  ; %bb.1:
1234; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1235; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1236; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1237; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
1238; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1239; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1240; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1241; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX1132-NEXT:    buffer_gl0_inv
1243; GFX1132-NEXT:  .LBB4_2:
1244; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1245; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1246; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1247; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1248; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1249; GFX1132-NEXT:    s_mov_b32 s2, -1
1250; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1251; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1252; GFX1132-NEXT:    s_endpgm
1253entry:
  ; The value returned by the atomic (the pre-add contents) is what gets stored.
1254  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1255  store i64 %old, i64 addrspace(1)* %out
1256  ret void
1257}
1258
1259define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1260;
1261;
1262; GFX7LESS-LABEL: add_i64_uniform:
1263; GFX7LESS:       ; %bb.0: ; %entry
1264; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1265; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1266; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1267; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1268; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1269; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1270; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1271; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1272; GFX7LESS-NEXT:  ; %bb.1:
1273; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1274; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1275; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1276; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1277; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1278; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1279; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1280; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1281; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1282; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1284; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1285; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1286; GFX7LESS-NEXT:  .LBB5_2:
1287; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1288; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1289; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1290; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1292; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1293; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
1294; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1295; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1296; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1297; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1298; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1299; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
1300; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
1301; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1302; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1303; GFX7LESS-NEXT:    s_endpgm
1304;
1305; GFX8-LABEL: add_i64_uniform:
1306; GFX8:       ; %bb.0: ; %entry
1307; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1308; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1309; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1310; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1311; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1312; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1313; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1314; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1315; GFX8-NEXT:  ; %bb.1:
1316; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1317; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1318; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1320; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1321; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1322; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1323; GFX8-NEXT:    s_mov_b32 m0, -1
1324; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1326; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX8-NEXT:  .LBB5_2:
1328; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1329; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1330; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1331; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1332; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1333; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1334; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1335; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1336; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1337; GFX8-NEXT:    s_mov_b32 s6, -1
1338; GFX8-NEXT:    s_mov_b32 s4, s0
1339; GFX8-NEXT:    s_mov_b32 s5, s1
1340; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1341; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1342; GFX8-NEXT:    s_endpgm
1343;
1344; GFX9-LABEL: add_i64_uniform:
1345; GFX9:       ; %bb.0: ; %entry
1346; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1347; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1348; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1349; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1350; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1351; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1352; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1353; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1354; GFX9-NEXT:  ; %bb.1:
1355; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1356; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1358; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1359; GFX9-NEXT:    s_add_i32 s8, s8, s7
1360; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1361; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1362; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1363; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX9-NEXT:  .LBB5_2:
1368; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1369; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1371; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1372; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1373; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1374; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1375; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1376; GFX9-NEXT:    s_mov_b32 s6, -1
1377; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1378; GFX9-NEXT:    s_mov_b32 s4, s0
1379; GFX9-NEXT:    s_mov_b32 s5, s1
1380; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1381; GFX9-NEXT:    s_endpgm
1382;
1383; GFX1064-LABEL: add_i64_uniform:
1384; GFX1064:       ; %bb.0: ; %entry
1385; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1386; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1387; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1388; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1389; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1390; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1391; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1392; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1393; GFX1064-NEXT:  ; %bb.1:
1394; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1395; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1396; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1398; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1399; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1400; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1401; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1402; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1403; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1405; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1406; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX1064-NEXT:    buffer_gl0_inv
1408; GFX1064-NEXT:  .LBB5_2:
1409; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1410; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1411; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1412; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1413; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1415; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1416; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1417; GFX1064-NEXT:    s_mov_b32 s2, -1
1418; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1419; GFX1064-NEXT:    s_endpgm
1420;
1421; GFX1032-LABEL: add_i64_uniform:
1422; GFX1032:       ; %bb.0: ; %entry
1423; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1424; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1425; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1426; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1427; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1428; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1429; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1430; GFX1032-NEXT:  ; %bb.1:
1431; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1432; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1433; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1434; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1435; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1436; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1437; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1438; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1439; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1440; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1441; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1442; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1443; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1444; GFX1032-NEXT:    buffer_gl0_inv
1445; GFX1032-NEXT:  .LBB5_2:
1446; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1447; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1448; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1449; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1450; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1452; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1453; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1454; GFX1032-NEXT:    s_mov_b32 s2, -1
1455; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1456; GFX1032-NEXT:    s_endpgm
1457;
1458; GFX1164-LABEL: add_i64_uniform:
1459; GFX1164:       ; %bb.0: ; %entry
1460; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1461; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1462; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1463; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1464; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1465; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1466; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1467; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1468; GFX1164-NEXT:  ; %bb.1:
1469; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1470; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1471; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1473; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1474; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1475; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1476; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1477; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1478; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1479; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1480; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1481; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX1164-NEXT:    buffer_gl0_inv
1483; GFX1164-NEXT:  .LBB5_2:
1484; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1485; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1486; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1487; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1489; GFX1164-NEXT:    s_mov_b32 s2, -1
1490; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1491; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1492; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1493; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1494; GFX1164-NEXT:    s_endpgm
1495;
1496; GFX1132-LABEL: add_i64_uniform:
1497; GFX1132:       ; %bb.0: ; %entry
1498; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1499; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1500; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1501; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1502; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1503; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1504; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1505; GFX1132-NEXT:  ; %bb.1:
1506; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1507; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1508; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1509; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1510; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1511; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1512; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1513; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
1514; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
1515; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1516; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1517; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1518; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX1132-NEXT:    buffer_gl0_inv
1520; GFX1132-NEXT:  .LBB5_2:
1521; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1522; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1523; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1524; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1526; GFX1132-NEXT:    s_mov_b32 s2, -1
1527; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1528; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1529; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1530; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1531; GFX1132-NEXT:    s_endpgm
1532entry:
1533  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1534  store i64 %old, i64 addrspace(1)* %out
1535  ret void
1536}
1537
; add_i64_varying: 64-bit atomic add to @local_var64 (addrspace(3)) whose
; addend is the zero-extended work-item id, so the operand differs per lane.
; The value returned by the atomic is stored to %out.
; In the expected output below, every target issues ds_add_rtn_u64 directly on
; the per-lane operand; none of the check blocks contain the mbcnt /
; single-lane branch / readfirstlane sequence that the constant and uniform
; tests in this file expect.
1538define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1539;
1540;
1541; GFX7LESS-LABEL: add_i64_varying:
1542; GFX7LESS:       ; %bb.0: ; %entry
1543; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1544; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1545; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1546; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1548; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1550; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1551; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1552; GFX7LESS-NEXT:    s_endpgm
1553;
1554; GFX8-LABEL: add_i64_varying:
1555; GFX8:       ; %bb.0: ; %entry
1556; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1557; GFX8-NEXT:    s_mov_b32 m0, -1
1558; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1559; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1561; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1562; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1563; GFX8-NEXT:    s_mov_b32 s2, -1
1564; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1565; GFX8-NEXT:    s_endpgm
1566;
1567; GFX9-LABEL: add_i64_varying:
1568; GFX9:       ; %bb.0: ; %entry
1569; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1570; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1571; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1575; GFX9-NEXT:    s_mov_b32 s2, -1
1576; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1577; GFX9-NEXT:    s_endpgm
1578;
1579; GFX10-LABEL: add_i64_varying:
1580; GFX10:       ; %bb.0: ; %entry
1581; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1582; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1583; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1584; GFX10-NEXT:    s_mov_b32 s2, -1
1585; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1586; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1587; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1588; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1589; GFX10-NEXT:    buffer_gl0_inv
1590; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1591; GFX10-NEXT:    s_endpgm
1592;
1593; GFX11-LABEL: add_i64_varying:
1594; GFX11:       ; %bb.0: ; %entry
1595; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1596; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1597; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1598; GFX11-NEXT:    s_mov_b32 s2, -1
1599; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1600; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1601; GFX11-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1602; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX11-NEXT:    buffer_gl0_inv
1604; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1605; GFX11-NEXT:    s_endpgm
1606entry:
  ; The addend is the work-item id widened to i64, i.e. a per-lane value.
1607  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1608  %zext = zext i32 %lane to i64
  ; atomicrmw yields the value @local_var64 held before the add; that result
  ; is what gets written to %out.
1609  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
1610  store i64 %old, i64 addrspace(1)* %out
1611  ret void
1612}
1613
; sub_i32_constant: 32-bit atomic sub of the constant 5 from @local_var32
; (addrspace(3)); the value returned by the atomic is stored to %out.
; Shape of the expected code on every target below:
;   - the active-lane count (s_bcnt1 of the copied exec mask) is multiplied
;     by 5 to form one combined operand;
;   - only the lane whose mbcnt result is 0 enters %bb.1 and issues a single
;     ds_sub_rtn_u32 with that operand;
;   - after exec is restored, the returned value is broadcast with
;     v_readfirstlane_b32 and each lane subtracts 5 * (its mbcnt result)
;     from it before the store.
1614define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1615;
1616;
1617; GFX7LESS-LABEL: sub_i32_constant:
1618; GFX7LESS:       ; %bb.0: ; %entry
1619; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1620; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1621; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1622; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1623; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1624; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1625; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1626; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1627; GFX7LESS-NEXT:  ; %bb.1:
1628; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1629; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1630; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1631; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1632; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1633; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1635; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX7LESS-NEXT:  .LBB7_2:
1637; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1638; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1639; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1640; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1641; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1642; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1643; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1644; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1645; GFX7LESS-NEXT:    s_endpgm
1646;
1647; GFX8-LABEL: sub_i32_constant:
1648; GFX8:       ; %bb.0: ; %entry
1649; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1650; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1651; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1652; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1653; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1654; GFX8-NEXT:    ; implicit-def: $vgpr1
1655; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1656; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1657; GFX8-NEXT:  ; %bb.1:
1658; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1659; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1660; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1661; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1662; GFX8-NEXT:    s_mov_b32 m0, -1
1663; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1664; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1665; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1666; GFX8-NEXT:  .LBB7_2:
1667; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1668; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1670; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1671; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1672; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1673; GFX8-NEXT:    s_mov_b32 s2, -1
1674; GFX8-NEXT:    s_nop 0
1675; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1676; GFX8-NEXT:    s_endpgm
1677;
1678; GFX9-LABEL: sub_i32_constant:
1679; GFX9:       ; %bb.0: ; %entry
1680; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1681; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1682; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1683; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1684; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1685; GFX9-NEXT:    ; implicit-def: $vgpr1
1686; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1687; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1688; GFX9-NEXT:  ; %bb.1:
1689; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1690; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1691; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1692; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1696; GFX9-NEXT:  .LBB7_2:
1697; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1700; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1701; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1702; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1703; GFX9-NEXT:    s_mov_b32 s2, -1
1704; GFX9-NEXT:    s_nop 0
1705; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1706; GFX9-NEXT:    s_endpgm
1707;
1708; GFX1064-LABEL: sub_i32_constant:
1709; GFX1064:       ; %bb.0: ; %entry
1710; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1711; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1712; GFX1064-NEXT:    ; implicit-def: $vgpr1
1713; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1714; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1715; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1716; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1717; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1718; GFX1064-NEXT:  ; %bb.1:
1719; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1720; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1721; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1722; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1723; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1724; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1725; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1726; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX1064-NEXT:    buffer_gl0_inv
1728; GFX1064-NEXT:  .LBB7_2:
1729; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1730; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1731; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1732; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1733; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1734; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1735; GFX1064-NEXT:    s_mov_b32 s2, -1
1736; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1738; GFX1064-NEXT:    s_endpgm
1739;
1740; GFX1032-LABEL: sub_i32_constant:
1741; GFX1032:       ; %bb.0: ; %entry
1742; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1743; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1744; GFX1032-NEXT:    ; implicit-def: $vgpr1
1745; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1746; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1747; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1748; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1749; GFX1032-NEXT:  ; %bb.1:
1750; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1751; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1752; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
1753; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
1754; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1755; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1756; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1757; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1758; GFX1032-NEXT:    buffer_gl0_inv
1759; GFX1032-NEXT:  .LBB7_2:
1760; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1761; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1762; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1763; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1764; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1765; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1766; GFX1032-NEXT:    s_mov_b32 s2, -1
1767; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1769; GFX1032-NEXT:    s_endpgm
1770;
1771; GFX1164-LABEL: sub_i32_constant:
1772; GFX1164:       ; %bb.0: ; %entry
1773; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1774; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1775; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1776; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1777; GFX1164-NEXT:    ; implicit-def: $vgpr1
1778; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1779; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1780; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1781; GFX1164-NEXT:  ; %bb.1:
1782; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1783; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1784; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1785; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
1786; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1787; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1788; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1789; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1790; GFX1164-NEXT:    buffer_gl0_inv
1791; GFX1164-NEXT:  .LBB7_2:
1792; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1793; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1794; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1795; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1796; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1797; GFX1164-NEXT:    s_mov_b32 s2, -1
1798; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1800; GFX1164-NEXT:    s_endpgm
1801;
1802; GFX1132-LABEL: sub_i32_constant:
1803; GFX1132:       ; %bb.0: ; %entry
1804; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1805; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1806; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1807; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1808; GFX1132-NEXT:    ; implicit-def: $vgpr1
1809; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1810; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
1811; GFX1132-NEXT:  ; %bb.1:
1812; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1813; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1814; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
1815; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
1816; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1817; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1818; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1819; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX1132-NEXT:    buffer_gl0_inv
1821; GFX1132-NEXT:  .LBB7_2:
1822; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1823; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1824; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1825; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1826; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1827; GFX1132-NEXT:    s_mov_b32 s2, -1
1828; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1830; GFX1132-NEXT:    s_endpgm
1831entry:
  ; The operand is the literal 5, identical for every lane; atomicrmw yields
  ; the value @local_var32 held before the subtraction.
1832  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
1833  store i32 %old, i32 addrspace(1)* %out
1834  ret void
1835}
1836
; sub_i32_uniform: 32-bit atomic sub of the kernel argument %subitive from
; @local_var32 (addrspace(3)); the value returned by the atomic is stored to
; %out.
; Shape of the expected code on every target below:
;   - %subitive is loaded as a scalar (s_load_dword / s_load_b32) and
;     multiplied by the active-lane count (s_bcnt1 of the copied exec mask)
;     with s_mul_i32;
;   - only the lane whose mbcnt result is 0 enters %bb.1 and issues a single
;     ds_sub_rtn_u32 with that product;
;   - after exec is restored, the returned value is broadcast with
;     v_readfirstlane_b32 and each lane subtracts
;     %subitive * (its mbcnt result), computed with v_mul_lo_u32, before the
;     store.
1837define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1838;
1839;
1840; GFX7LESS-LABEL: sub_i32_uniform:
1841; GFX7LESS:       ; %bb.0: ; %entry
1842; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1843; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1844; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
1845; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1846; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1847; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1848; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1849; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1850; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
1851; GFX7LESS-NEXT:  ; %bb.1:
1852; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1853; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1854; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
1855; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1856; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
1857; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1858; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1859; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1860; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX7LESS-NEXT:  .LBB8_2:
1862; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1863; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1864; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1865; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
1866; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1867; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1868; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1869; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1870; GFX7LESS-NEXT:    s_endpgm
1871;
1872; GFX8-LABEL: sub_i32_uniform:
1873; GFX8:       ; %bb.0: ; %entry
1874; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1875; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
1876; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1877; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1878; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1879; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1880; GFX8-NEXT:    ; implicit-def: $vgpr1
1881; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1882; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1883; GFX8-NEXT:  ; %bb.1:
1884; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1885; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1887; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1888; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1889; GFX8-NEXT:    s_mov_b32 m0, -1
1890; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1891; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1892; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1893; GFX8-NEXT:  .LBB8_2:
1894; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1895; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1896; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1897; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1898; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1899; GFX8-NEXT:    s_mov_b32 s6, -1
1900; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1901; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1902; GFX8-NEXT:    s_endpgm
1903;
1904; GFX9-LABEL: sub_i32_uniform:
1905; GFX9:       ; %bb.0: ; %entry
1906; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1907; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
1908; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1909; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1910; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1911; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1912; GFX9-NEXT:    ; implicit-def: $vgpr1
1913; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1914; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1915; GFX9-NEXT:  ; %bb.1:
1916; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1919; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1920; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1921; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1923; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1924; GFX9-NEXT:  .LBB8_2:
1925; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1926; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1928; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1929; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1930; GFX9-NEXT:    s_mov_b32 s6, -1
1931; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1932; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1933; GFX9-NEXT:    s_endpgm
1934;
1935; GFX1064-LABEL: sub_i32_uniform:
1936; GFX1064:       ; %bb.0: ; %entry
1937; GFX1064-NEXT:    s_clause 0x1
1938; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1939; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
1940; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1941; GFX1064-NEXT:    ; implicit-def: $vgpr1
1942; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1943; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1944; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1945; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1946; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
1947; GFX1064-NEXT:  ; %bb.1:
1948; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1949; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1950; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1951; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
1952; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
1953; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1954; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1955; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1956; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX1064-NEXT:    buffer_gl0_inv
1958; GFX1064-NEXT:  .LBB8_2:
1959; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1960; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1961; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1962; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
1963; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1964; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1965; GFX1064-NEXT:    s_mov_b32 s6, -1
1966; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1967; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1968; GFX1064-NEXT:    s_endpgm
1969;
1970; GFX1032-LABEL: sub_i32_uniform:
1971; GFX1032:       ; %bb.0: ; %entry
1972; GFX1032-NEXT:    s_clause 0x1
1973; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1974; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1975; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1976; GFX1032-NEXT:    ; implicit-def: $vgpr1
1977; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1978; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1979; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1980; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
1981; GFX1032-NEXT:  ; %bb.1:
1982; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1983; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1984; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1986; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1987; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1988; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1989; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1990; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX1032-NEXT:    buffer_gl0_inv
1992; GFX1032-NEXT:  .LBB8_2:
1993; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1994; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1995; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1996; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1997; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1998; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1999; GFX1032-NEXT:    s_mov_b32 s6, -1
2000; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2001; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2002; GFX1032-NEXT:    s_endpgm
2003;
2004; GFX1164-LABEL: sub_i32_uniform:
2005; GFX1164:       ; %bb.0: ; %entry
2006; GFX1164-NEXT:    s_clause 0x1
2007; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2008; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
2009; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2010; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2011; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2012; GFX1164-NEXT:    ; implicit-def: $vgpr1
2013; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
2014; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
2015; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2016; GFX1164-NEXT:  ; %bb.1:
2017; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
2018; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2019; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
2021; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
2022; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2023; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2024; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2025; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2026; GFX1164-NEXT:    buffer_gl0_inv
2027; GFX1164-NEXT:  .LBB8_2:
2028; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2029; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
2031; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2032; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2033; GFX1164-NEXT:    s_mov_b32 s6, -1
2034; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2035; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2036; GFX1164-NEXT:    s_endpgm
2037;
2038; GFX1132-LABEL: sub_i32_uniform:
2039; GFX1132:       ; %bb.0: ; %entry
2040; GFX1132-NEXT:    s_clause 0x1
2041; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
2042; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
2043; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2044; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2045; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2046; GFX1132-NEXT:    ; implicit-def: $vgpr1
2047; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2048; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2049; GFX1132-NEXT:  ; %bb.1:
2050; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2051; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2052; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2054; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
2055; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2056; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2057; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
2058; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2059; GFX1132-NEXT:    buffer_gl0_inv
2060; GFX1132-NEXT:  .LBB8_2:
2061; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2062; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2063; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2064; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2065; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2066; GFX1132-NEXT:    s_mov_b32 s6, -1
2067; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2068; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2069; GFX1132-NEXT:    s_endpgm
2070entry:
  ; The operand is the kernel argument %subitive; atomicrmw yields the value
  ; @local_var32 held before the subtraction.
2071  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
2072  store i32 %old, i32 addrspace(1)* %out
2073  ret void
2074}
2075
2076define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
2077;
2078;
2079; GFX7LESS-LABEL: sub_i32_varying:
2080; GFX7LESS:       ; %bb.0: ; %entry
2081; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2082; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2083; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2084; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
2086; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2087; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2088; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2089; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2090; GFX7LESS-NEXT:    s_endpgm
2091;
2092; GFX8-LABEL: sub_i32_varying:
2093; GFX8:       ; %bb.0: ; %entry
2094; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2095; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2096; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2097; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2098; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2099; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2100; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2101; GFX8-NEXT:    s_not_b64 exec, exec
2102; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2103; GFX8-NEXT:    s_not_b64 exec, exec
2104; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2105; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2106; GFX8-NEXT:    s_nop 1
2107; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2108; GFX8-NEXT:    s_nop 1
2109; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2110; GFX8-NEXT:    s_nop 1
2111; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2112; GFX8-NEXT:    s_nop 1
2113; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2114; GFX8-NEXT:    s_nop 1
2115; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2116; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2117; GFX8-NEXT:    s_nop 0
2118; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2119; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2120; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2121; GFX8-NEXT:    ; implicit-def: $vgpr0
2122; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2123; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2124; GFX8-NEXT:  ; %bb.1:
2125; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2126; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2127; GFX8-NEXT:    s_mov_b32 m0, -1
2128; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2129; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2130; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2131; GFX8-NEXT:  .LBB9_2:
2132; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2133; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2134; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2135; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2136; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2137; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2138; GFX8-NEXT:    s_mov_b32 s2, -1
2139; GFX8-NEXT:    s_nop 0
2140; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2141; GFX8-NEXT:    s_endpgm
2142;
2143; GFX9-LABEL: sub_i32_varying:
2144; GFX9:       ; %bb.0: ; %entry
2145; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2146; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2147; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2148; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2149; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2150; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2151; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2152; GFX9-NEXT:    s_not_b64 exec, exec
2153; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2154; GFX9-NEXT:    s_not_b64 exec, exec
2155; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2156; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2157; GFX9-NEXT:    s_nop 1
2158; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2159; GFX9-NEXT:    s_nop 1
2160; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2161; GFX9-NEXT:    s_nop 1
2162; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2163; GFX9-NEXT:    s_nop 1
2164; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2165; GFX9-NEXT:    s_nop 1
2166; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2167; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2168; GFX9-NEXT:    s_nop 0
2169; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2170; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2171; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2172; GFX9-NEXT:    ; implicit-def: $vgpr0
2173; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2174; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2175; GFX9-NEXT:  ; %bb.1:
2176; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2177; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2178; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2179; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
2180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX9-NEXT:  .LBB9_2:
2182; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2184; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2185; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2186; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2187; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2188; GFX9-NEXT:    s_mov_b32 s2, -1
2189; GFX9-NEXT:    s_nop 0
2190; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2191; GFX9-NEXT:    s_endpgm
2192;
2193; GFX1064-LABEL: sub_i32_varying:
2194; GFX1064:       ; %bb.0: ; %entry
2195; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2196; GFX1064-NEXT:    s_not_b64 exec, exec
2197; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2198; GFX1064-NEXT:    s_not_b64 exec, exec
2199; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2200; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2201; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2202; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2203; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2204; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2205; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2206; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2207; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2208; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2209; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2210; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2211; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2212; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2213; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2214; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2215; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2216; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2217; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2218; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2219; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2220; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2221; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2222; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2223; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2224; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2225; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2226; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2227; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2228; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2229; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2230; GFX1064-NEXT:    s_mov_b32 s2, -1
2231; GFX1064-NEXT:    ; implicit-def: $vgpr0
2232; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2233; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2234; GFX1064-NEXT:  ; %bb.1:
2235; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2236; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2237; GFX1064-NEXT:    s_mov_b32 s3, s7
2238; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2239; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2240; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2241; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2242; GFX1064-NEXT:    buffer_gl0_inv
2243; GFX1064-NEXT:  .LBB9_2:
2244; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2245; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2246; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2247; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2248; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2249; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2250; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2252; GFX1064-NEXT:    s_endpgm
2253;
2254; GFX1032-LABEL: sub_i32_varying:
2255; GFX1032:       ; %bb.0: ; %entry
2256; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2257; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2258; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2259; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2260; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2261; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2262; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2263; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2264; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2265; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2266; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2267; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2268; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2269; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2270; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2271; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2272; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2273; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2274; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2275; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2276; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2277; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2278; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2279; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2280; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2281; GFX1032-NEXT:    s_mov_b32 s2, -1
2282; GFX1032-NEXT:    ; implicit-def: $vgpr0
2283; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2284; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2285; GFX1032-NEXT:  ; %bb.1:
2286; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2287; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2288; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2289; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2290; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2291; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX1032-NEXT:    buffer_gl0_inv
2293; GFX1032-NEXT:  .LBB9_2:
2294; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2295; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2296; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2297; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2298; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2299; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2300; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2301; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2302; GFX1032-NEXT:    s_endpgm
2303;
2304; GFX1164-LABEL: sub_i32_varying:
2305; GFX1164:       ; %bb.0: ; %entry
2306; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2307; GFX1164-NEXT:    s_not_b64 exec, exec
2308; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2309; GFX1164-NEXT:    s_not_b64 exec, exec
2310; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2311; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2312; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2313; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2314; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2315; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2316; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2317; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2318; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2319; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2320; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2321; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2322; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
2323; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2324; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2325; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2326; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2327; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
2328; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
2329; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2330; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2331; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2332; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
2333; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
2334; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
2335; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2336; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2337; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2338; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
2339; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2340; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2341; GFX1164-NEXT:    s_mov_b32 s2, -1
2342; GFX1164-NEXT:    ; implicit-def: $vgpr0
2343; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2344; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2345; GFX1164-NEXT:  ; %bb.1:
2346; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
2347; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
2348; GFX1164-NEXT:    s_mov_b32 s3, s7
2349; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2350; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2351; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2352; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2353; GFX1164-NEXT:    buffer_gl0_inv
2354; GFX1164-NEXT:  .LBB9_2:
2355; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2356; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
2357; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2358; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2359; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2360; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2362; GFX1164-NEXT:    s_endpgm
2363;
2364; GFX1132-LABEL: sub_i32_varying:
2365; GFX1132:       ; %bb.0: ; %entry
2366; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2367; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2368; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2369; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2370; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2371; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2372; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2373; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2374; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2375; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2376; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2377; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2378; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2379; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2380; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2381; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2382; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
2383; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
2384; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2385; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2386; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2387; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2388; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
2389; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2390; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2391; GFX1132-NEXT:    s_mov_b32 s2, -1
2392; GFX1132-NEXT:    ; implicit-def: $vgpr0
2393; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2394; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2395; GFX1132-NEXT:  ; %bb.1:
2396; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
2397; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
2398; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2399; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2400; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v4
2401; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX1132-NEXT:    buffer_gl0_inv
2403; GFX1132-NEXT:  .LBB9_2:
2404; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2405; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
2406; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2407; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2408; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2409; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2410; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2411; GFX1132-NEXT:    s_endpgm
2412entry:
2413  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2414  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2415  store i32 %old, i32 addrspace(1)* %out
2416  ret void
2417}
2418
; Variant of sub_i32_varying in which the result of the atomicrmw (%old) is
; never used. The operand subtracted from @local_var32 is the per-lane value
; returned by llvm.amdgcn.workitem.id.x, so it differs from lane to lane.
;
; What the autogenerated assertions below pin down:
;  - Every run line expects the non-returning ds_sub_u32, whereas
;    sub_i32_varying (which stores %old) expects ds_sub_rtn_u32.
;  - For the GFX7LESS run no cross-lane reduction is expected; ds_sub_u32 is
;    issued directly with v0 as its data operand.
;  - For the GFX8 and GFX9 runs the value is combined with row_shr / row_bcast
;    DPP adds, the total is read from lane 63, and ds_sub_u32 is issued only
;    under the "mbcnt result == 0" exec mask.
;  - For the GFX10 and GFX11 runs the value is combined with row_xmask DPP adds
;    plus v_permlanex16_b32. The wave64 variants additionally merge the two
;    halves (readlane 0 / readlane 32 + s_add_i32 for GFX1064,
;    v_permlane64_b32 for GFX1164). ds_sub_u32 is again issued only under the
;    "mbcnt result == 0" exec mask.
;
; The assertion lines are generated output (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2419define amdgpu_kernel void @sub_i32_varying_nouse() {
2420; GFX7LESS-LABEL: sub_i32_varying_nouse:
2421; GFX7LESS:       ; %bb.0: ; %entry
2422; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2423; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2424; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2425; GFX7LESS-NEXT:    ds_sub_u32 v1, v0
2426; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2427; GFX7LESS-NEXT:    s_endpgm
2428;
2429; GFX8-LABEL: sub_i32_varying_nouse:
2430; GFX8:       ; %bb.0: ; %entry
2431; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2432; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2433; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2434; GFX8-NEXT:    s_not_b64 exec, exec
2435; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2436; GFX8-NEXT:    s_not_b64 exec, exec
2437; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
2438; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2439; GFX8-NEXT:    s_nop 1
2440; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2441; GFX8-NEXT:    s_nop 1
2442; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2443; GFX8-NEXT:    s_nop 1
2444; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2445; GFX8-NEXT:    s_nop 1
2446; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2447; GFX8-NEXT:    s_nop 1
2448; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2449; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
2450; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
2451; GFX8-NEXT:    s_mov_b32 s0, s2
2452; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2453; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2454; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2455; GFX8-NEXT:  ; %bb.1:
2456; GFX8-NEXT:    v_mov_b32_e32 v0, 0
2457; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2458; GFX8-NEXT:    s_mov_b32 m0, -1
2459; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2460; GFX8-NEXT:    ds_sub_u32 v0, v2
2461; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2462; GFX8-NEXT:  .LBB10_2:
2463; GFX8-NEXT:    s_endpgm
2464;
2465; GFX9-LABEL: sub_i32_varying_nouse:
2466; GFX9:       ; %bb.0: ; %entry
2467; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2468; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2469; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2470; GFX9-NEXT:    s_not_b64 exec, exec
2471; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2472; GFX9-NEXT:    s_not_b64 exec, exec
2473; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
2474; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2475; GFX9-NEXT:    s_nop 1
2476; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2477; GFX9-NEXT:    s_nop 1
2478; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2479; GFX9-NEXT:    s_nop 1
2480; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2481; GFX9-NEXT:    s_nop 1
2482; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2483; GFX9-NEXT:    s_nop 1
2484; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2485; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
2486; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
2487; GFX9-NEXT:    s_mov_b32 s0, s2
2488; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2489; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2490; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2491; GFX9-NEXT:  ; %bb.1:
2492; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2493; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2494; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2495; GFX9-NEXT:    ds_sub_u32 v0, v2
2496; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX9-NEXT:  .LBB10_2:
2498; GFX9-NEXT:    s_endpgm
2499;
2500; GFX1064-LABEL: sub_i32_varying_nouse:
2501; GFX1064:       ; %bb.0: ; %entry
2502; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2503; GFX1064-NEXT:    s_not_b64 exec, exec
2504; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2505; GFX1064-NEXT:    s_not_b64 exec, exec
2506; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2507; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2508; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2509; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2510; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2511; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2512; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2513; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2514; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2515; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2516; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
2517; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
2518; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
2519; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
2520; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2521; GFX1064-NEXT:    s_add_i32 s0, s2, s3
2522; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2523; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2524; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2525; GFX1064-NEXT:  ; %bb.1:
2526; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2527; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
2528; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2529; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2530; GFX1064-NEXT:    ds_sub_u32 v0, v3
2531; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX1064-NEXT:    buffer_gl0_inv
2533; GFX1064-NEXT:  .LBB10_2:
2534; GFX1064-NEXT:    s_endpgm
2535;
2536; GFX1032-LABEL: sub_i32_varying_nouse:
2537; GFX1032:       ; %bb.0: ; %entry
2538; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2539; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2540; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2541; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2542; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
2543; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2544; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2545; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2546; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2547; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2548; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2549; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2550; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
2551; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2552; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
2553; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2554; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2555; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2556; GFX1032-NEXT:  ; %bb.1:
2557; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2558; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2559; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2560; GFX1032-NEXT:    ds_sub_u32 v3, v0
2561; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX1032-NEXT:    buffer_gl0_inv
2563; GFX1032-NEXT:  .LBB10_2:
2564; GFX1032-NEXT:    s_endpgm
2565;
2566; GFX1164-LABEL: sub_i32_varying_nouse:
2567; GFX1164:       ; %bb.0: ; %entry
2568; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2569; GFX1164-NEXT:    s_not_b64 exec, exec
2570; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2571; GFX1164-NEXT:    s_not_b64 exec, exec
2572; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2573; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2574; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2575; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2576; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2577; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2578; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2579; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2580; GFX1164-NEXT:    v_permlane64_b32 v2, v1
2581; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2582; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2583; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
2584; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2585; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
2586; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
2587; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
2588; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
2589; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
2590; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2591; GFX1164-NEXT:  ; %bb.1:
2592; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2593; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2594; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2595; GFX1164-NEXT:    ds_sub_u32 v3, v0
2596; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2597; GFX1164-NEXT:    buffer_gl0_inv
2598; GFX1164-NEXT:  .LBB10_2:
2599; GFX1164-NEXT:    s_endpgm
2600;
2601; GFX1132-LABEL: sub_i32_varying_nouse:
2602; GFX1132:       ; %bb.0: ; %entry
2603; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2604; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2605; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2606; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2607; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
2608; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2609; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2610; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2611; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2612; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2613; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2614; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2615; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
2616; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2617; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
2618; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
2619; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v3
2620; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
2621; GFX1132-NEXT:  ; %bb.1:
2622; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2623; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2624; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2625; GFX1132-NEXT:    ds_sub_u32 v3, v0
2626; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2627; GFX1132-NEXT:    buffer_gl0_inv
2628; GFX1132-NEXT:  .LBB10_2:
2629; GFX1132-NEXT:    s_endpgm
2630entry:
2631  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2632  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
2633  ret void
2634}
2635
2636define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2637;
2638;
2639; GFX7LESS-LABEL: sub_i64_constant:
2640; GFX7LESS:       ; %bb.0: ; %entry
2641; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2642; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2643; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2644; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
2645; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2646; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2647; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2648; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_2
2649; GFX7LESS-NEXT:  ; %bb.1:
2650; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2651; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
2652; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2653; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
2654; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2655; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2656; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2657; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX7LESS-NEXT:  .LBB11_2:
2659; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2660; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2661; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2662; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
2663; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2664; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2665; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2666; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2667; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2668; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2669; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2670; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2671; GFX7LESS-NEXT:    s_endpgm
2672;
2673; GFX8-LABEL: sub_i64_constant:
2674; GFX8:       ; %bb.0: ; %entry
2675; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2676; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2677; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2678; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2679; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2680; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2681; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2682; GFX8-NEXT:    s_cbranch_execz .LBB11_2
2683; GFX8-NEXT:  ; %bb.1:
2684; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2685; GFX8-NEXT:    s_mul_i32 s4, s4, 5
2686; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2687; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2688; GFX8-NEXT:    s_mov_b32 m0, -1
2689; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2691; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2692; GFX8-NEXT:  .LBB11_2:
2693; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2694; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2695; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2696; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2697; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2698; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2699; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2700; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2701; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2702; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2703; GFX8-NEXT:    s_mov_b32 s2, -1
2704; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2705; GFX8-NEXT:    s_endpgm
2706;
2707; GFX9-LABEL: sub_i64_constant:
2708; GFX9:       ; %bb.0: ; %entry
2709; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2710; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2711; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2712; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2713; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2714; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2715; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2716; GFX9-NEXT:    s_cbranch_execz .LBB11_2
2717; GFX9-NEXT:  ; %bb.1:
2718; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2719; GFX9-NEXT:    s_mul_i32 s4, s4, 5
2720; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2721; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2722; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2723; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2724; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2725; GFX9-NEXT:  .LBB11_2:
2726; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2727; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2728; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2729; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2730; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2731; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2732; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2733; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2734; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2735; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2736; GFX9-NEXT:    s_mov_b32 s2, -1
2737; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2738; GFX9-NEXT:    s_endpgm
2739;
2740; GFX1064-LABEL: sub_i64_constant:
2741; GFX1064:       ; %bb.0: ; %entry
2742; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2743; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2744; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2745; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2746; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2747; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2748; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2749; GFX1064-NEXT:    s_cbranch_execz .LBB11_2
2750; GFX1064-NEXT:  ; %bb.1:
2751; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2752; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2753; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
2754; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2755; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2756; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2757; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2758; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2759; GFX1064-NEXT:    buffer_gl0_inv
2760; GFX1064-NEXT:  .LBB11_2:
2761; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2762; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2763; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2764; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2765; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2766; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2767; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2768; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2769; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2770; GFX1064-NEXT:    s_mov_b32 s2, -1
2771; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2772; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2773; GFX1064-NEXT:    s_endpgm
2774;
2775; GFX1032-LABEL: sub_i64_constant:
2776; GFX1032:       ; %bb.0: ; %entry
2777; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2778; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2779; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2780; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2781; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2782; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2783; GFX1032-NEXT:    s_cbranch_execz .LBB11_2
2784; GFX1032-NEXT:  ; %bb.1:
2785; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2786; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2787; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
2788; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
2789; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2790; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2791; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2792; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2793; GFX1032-NEXT:    buffer_gl0_inv
2794; GFX1032-NEXT:  .LBB11_2:
2795; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2796; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2797; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2798; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2799; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2800; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2801; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2802; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2803; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2804; GFX1032-NEXT:    s_mov_b32 s2, -1
2805; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2806; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2807; GFX1032-NEXT:    s_endpgm
2808;
2809; GFX1164-LABEL: sub_i64_constant:
2810; GFX1164:       ; %bb.0: ; %entry
2811; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2812; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2813; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2814; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2815; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
2816; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2817; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2818; GFX1164-NEXT:    s_cbranch_execz .LBB11_2
2819; GFX1164-NEXT:  ; %bb.1:
2820; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2821; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2822; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
2823; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2824; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2825; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2826; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2827; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2828; GFX1164-NEXT:    buffer_gl0_inv
2829; GFX1164-NEXT:  .LBB11_2:
2830; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
2831; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2832; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2833; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2834; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2835; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2836; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2837; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2838; GFX1164-NEXT:    s_mov_b32 s2, -1
2839; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2840; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2841; GFX1164-NEXT:    s_endpgm
2842;
2843; GFX1132-LABEL: sub_i64_constant:
2844; GFX1132:       ; %bb.0: ; %entry
2845; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2846; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
2847; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2848; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
2849; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2850; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2851; GFX1132-NEXT:    s_cbranch_execz .LBB11_2
2852; GFX1132-NEXT:  ; %bb.1:
2853; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
2854; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2855; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
2856; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
2857; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2858; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2859; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
2860; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2861; GFX1132-NEXT:    buffer_gl0_inv
2862; GFX1132-NEXT:  .LBB11_2:
2863; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2864; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2865; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2866; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2867; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2868; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2869; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2870; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2871; GFX1132-NEXT:    s_mov_b32 s2, -1
2872; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2874; GFX1132-NEXT:    s_endpgm
2875entry:
2876  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
2877  store i64 %old, i64 addrspace(1)* %out
2878  ret void
2879}
2880
; sub_i64_uniform atomically subtracts the kernel argument %subitive from the
; 64-bit addrspace(3) variable @local_var64 with acq_rel ordering and stores
; the value returned by the atomic to %out.
;
; In the expected code recorded below for every target, only the lane whose
; mbcnt result is zero issues ds_sub_rtn_u64, and its operand is %subitive
; multiplied by the bit count (s_bcnt1) of the exec mask copied at entry.
; After exec is restored, each lane takes the readfirstlane of the returned
; value and subtracts %subitive multiplied by its own mbcnt index before the
; final buffer store.
2881define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2882;
2883;
2884; GFX7LESS-LABEL: sub_i64_uniform:
2885; GFX7LESS:       ; %bb.0: ; %entry
2886; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2887; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2888; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2889; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2890; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2891; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2892; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2893; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
2894; GFX7LESS-NEXT:  ; %bb.1:
2895; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2896; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
2897; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2898; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2899; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2900; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
2901; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2902; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
2903; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2904; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2905; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2906; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2907; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2908; GFX7LESS-NEXT:  .LBB12_2:
2909; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2910; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2911; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2912; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2913; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2914; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2915; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
2916; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
2917; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
2918; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
2919; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
2920; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
2921; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
2922; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
2923; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2924; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2925; GFX7LESS-NEXT:    s_endpgm
2926;
2927; GFX8-LABEL: sub_i64_uniform:
2928; GFX8:       ; %bb.0: ; %entry
2929; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2930; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2931; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2932; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2933; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2934; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2935; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2936; GFX8-NEXT:    s_cbranch_execz .LBB12_2
2937; GFX8-NEXT:  ; %bb.1:
2938; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
2939; GFX8-NEXT:    v_mov_b32_e32 v0, s8
2940; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2941; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
2942; GFX8-NEXT:    s_mul_i32 s6, s3, s8
2943; GFX8-NEXT:    v_mov_b32_e32 v3, 0
2944; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
2945; GFX8-NEXT:    s_mov_b32 m0, -1
2946; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2947; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2949; GFX8-NEXT:  .LBB12_2:
2950; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2951; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2952; GFX8-NEXT:    s_mov_b32 s4, s0
2953; GFX8-NEXT:    s_mov_b32 s5, s1
2954; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
2955; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2956; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2957; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2958; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
2959; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2960; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
2961; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2962; GFX8-NEXT:    s_mov_b32 s6, -1
2963; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2964; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2965; GFX8-NEXT:    s_endpgm
2966;
2967; GFX9-LABEL: sub_i64_uniform:
2968; GFX9:       ; %bb.0: ; %entry
2969; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2970; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2971; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2972; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2973; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2974; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2975; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2976; GFX9-NEXT:    s_cbranch_execz .LBB12_2
2977; GFX9-NEXT:  ; %bb.1:
2978; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2981; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2982; GFX9-NEXT:    s_add_i32 s8, s8, s7
2983; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2984; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2985; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2986; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2987; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2988; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
2989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2990; GFX9-NEXT:  .LBB12_2:
2991; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2993; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
2994; GFX9-NEXT:    s_mov_b32 s4, s0
2995; GFX9-NEXT:    s_mov_b32 s5, s1
2996; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
2997; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2998; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2999; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3000; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3001; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
3002; GFX9-NEXT:    s_mov_b32 s7, 0xf000
3003; GFX9-NEXT:    s_mov_b32 s6, -1
3004; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
3005; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3006; GFX9-NEXT:    s_endpgm
3007;
3008; GFX1064-LABEL: sub_i64_uniform:
3009; GFX1064:       ; %bb.0: ; %entry
3010; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3011; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3012; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3013; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3014; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3015; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
3016; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3017; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
3018; GFX1064-NEXT:  ; %bb.1:
3019; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3020; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3021; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3022; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
3023; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
3024; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
3025; GFX1064-NEXT:    s_add_i32 s8, s8, s7
3026; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
3027; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
3028; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3029; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3030; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3031; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3032; GFX1064-NEXT:    buffer_gl0_inv
3033; GFX1064-NEXT:  .LBB12_2:
3034; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3035; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3036; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3037; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
3038; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
3039; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
3040; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3041; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3042; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3043; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
3044; GFX1064-NEXT:    s_mov_b32 s2, -1
3045; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3046; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3047; GFX1064-NEXT:    s_endpgm
3048;
3049; GFX1032-LABEL: sub_i64_uniform:
3050; GFX1032:       ; %bb.0: ; %entry
3051; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3052; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
3053; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3054; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3055; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
3056; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3057; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
3058; GFX1032-NEXT:  ; %bb.1:
3059; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
3060; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3061; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3062; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
3063; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
3064; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
3065; GFX1032-NEXT:    s_add_i32 s7, s7, s6
3066; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
3067; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
3068; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3069; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3070; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3071; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3072; GFX1032-NEXT:    buffer_gl0_inv
3073; GFX1032-NEXT:  .LBB12_2:
3074; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3075; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3076; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3077; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
3078; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
3079; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
3080; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3081; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3082; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3083; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
3084; GFX1032-NEXT:    s_mov_b32 s2, -1
3085; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3086; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3087; GFX1032-NEXT:    s_endpgm
3088;
3089; GFX1164-LABEL: sub_i64_uniform:
3090; GFX1164:       ; %bb.0: ; %entry
3091; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3092; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3093; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3094; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3095; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
3096; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
3097; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
3098; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
3099; GFX1164-NEXT:  ; %bb.1:
3100; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3101; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3102; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3103; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
3104; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
3105; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
3106; GFX1164-NEXT:    s_add_i32 s8, s8, s7
3107; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
3108; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
3109; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3110; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3111; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3112; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3113; GFX1164-NEXT:    buffer_gl0_inv
3114; GFX1164-NEXT:  .LBB12_2:
3115; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3116; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3118; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
3119; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
3120; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
3121; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3122; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
3123; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3124; GFX1164-NEXT:    s_mov_b32 s2, -1
3125; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3126; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
3127; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3128; GFX1164-NEXT:    s_endpgm
3129;
3130; GFX1132-LABEL: sub_i64_uniform:
3131; GFX1132:       ; %bb.0: ; %entry
3132; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3133; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
3134; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3135; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
3136; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3137; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3138; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
3139; GFX1132-NEXT:  ; %bb.1:
3140; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
3141; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3142; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3143; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
3144; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
3145; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
3146; GFX1132-NEXT:    s_add_i32 s7, s7, s6
3147; GFX1132-NEXT:    v_mov_b32_e32 v0, s5
3148; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
3149; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3150; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3151; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
3152; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3153; GFX1132-NEXT:    buffer_gl0_inv
3154; GFX1132-NEXT:  .LBB12_2:
3155; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3156; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3157; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
3158; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
3159; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
3160; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
3161; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
3162; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3163; GFX1132-NEXT:    s_mov_b32 s2, -1
3164; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3165; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
3166; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3167; GFX1132-NEXT:    s_endpgm
3168entry:
  ; %old receives the contents of @local_var64 from before the subtraction.
3169  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
3170  store i64 %old, i64 addrspace(1)* %out
3171  ret void
3172}
3173
; sub_i64_varying atomically subtracts the zero-extended workitem id x, a value
; that differs per lane, from the 64-bit addrspace(3) variable @local_var64
; with acq_rel ordering and stores the value returned by the atomic to %out.
;
; None of the expected outputs recorded below contain an mbcnt, saveexec or
; readfirstlane sequence. Every target issues one ds_sub_rtn_u64 on the
; per-lane operand directly, so the recorded output shows this 64-bit varying
; case being emitted without the single-lane rewrite.
3174define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
3175;
3176;
3177; GFX7LESS-LABEL: sub_i64_varying:
3178; GFX7LESS:       ; %bb.0: ; %entry
3179; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3180; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3181; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3182; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3183; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3184; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3185; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3186; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3187; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3188; GFX7LESS-NEXT:    s_endpgm
3189;
3190; GFX8-LABEL: sub_i64_varying:
3191; GFX8:       ; %bb.0: ; %entry
3192; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3193; GFX8-NEXT:    s_mov_b32 m0, -1
3194; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3195; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3196; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3197; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3198; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3199; GFX8-NEXT:    s_mov_b32 s2, -1
3200; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3201; GFX8-NEXT:    s_endpgm
3202;
3203; GFX9-LABEL: sub_i64_varying:
3204; GFX9:       ; %bb.0: ; %entry
3205; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3206; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3208; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3209; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3211; GFX9-NEXT:    s_mov_b32 s2, -1
3212; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3213; GFX9-NEXT:    s_endpgm
3214;
3215; GFX10-LABEL: sub_i64_varying:
3216; GFX10:       ; %bb.0: ; %entry
3217; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3218; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3219; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
3220; GFX10-NEXT:    s_mov_b32 s2, -1
3221; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3222; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3223; GFX10-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3224; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3225; GFX10-NEXT:    buffer_gl0_inv
3226; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3227; GFX10-NEXT:    s_endpgm
3228;
3229; GFX11-LABEL: sub_i64_varying:
3230; GFX11:       ; %bb.0: ; %entry
3231; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3232; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3233; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
3234; GFX11-NEXT:    s_mov_b32 s2, -1
3235; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3236; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3237; GFX11-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
3238; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3239; GFX11-NEXT:    buffer_gl0_inv
3240; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
3241; GFX11-NEXT:    s_endpgm
3242entry:
  ; The subtracted value is the workitem id x widened to 64 bits.
3243  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3244  %zext = zext i32 %lane to i64
  ; %old receives the contents of @local_var64 from before the subtraction.
3245  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
3246  store i64 %old, i64 addrspace(1)* %out
3247  ret void
3248}
3249
3250define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
3251;
3252;
3253; GFX7LESS-LABEL: and_i32_varying:
3254; GFX7LESS:       ; %bb.0: ; %entry
3255; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3256; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3257; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3258; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
3260; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3261; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3262; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3263; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3264; GFX7LESS-NEXT:    s_endpgm
3265;
3266; GFX8-LABEL: and_i32_varying:
3267; GFX8:       ; %bb.0: ; %entry
3268; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3269; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3270; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3271; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3272; GFX8-NEXT:    v_mov_b32_e32 v1, -1
3273; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3274; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3275; GFX8-NEXT:    s_not_b64 exec, exec
3276; GFX8-NEXT:    v_mov_b32_e32 v2, -1
3277; GFX8-NEXT:    s_not_b64 exec, exec
3278; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3279; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3280; GFX8-NEXT:    s_nop 1
3281; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3282; GFX8-NEXT:    s_nop 1
3283; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3284; GFX8-NEXT:    s_nop 1
3285; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3286; GFX8-NEXT:    s_nop 1
3287; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3288; GFX8-NEXT:    s_nop 1
3289; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3290; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3291; GFX8-NEXT:    s_nop 0
3292; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3293; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3294; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3295; GFX8-NEXT:    ; implicit-def: $vgpr0
3296; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3297; GFX8-NEXT:    s_cbranch_execz .LBB14_2
3298; GFX8-NEXT:  ; %bb.1:
3299; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3300; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3301; GFX8-NEXT:    s_mov_b32 m0, -1
3302; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3303; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
3304; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3305; GFX8-NEXT:  .LBB14_2:
3306; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3307; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3308; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3309; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3310; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
3311; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3312; GFX8-NEXT:    s_mov_b32 s2, -1
3313; GFX8-NEXT:    s_nop 0
3314; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3315; GFX8-NEXT:    s_endpgm
3316;
3317; GFX9-LABEL: and_i32_varying:
3318; GFX9:       ; %bb.0: ; %entry
3319; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3320; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3321; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3322; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3323; GFX9-NEXT:    v_mov_b32_e32 v1, -1
3324; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3325; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3326; GFX9-NEXT:    s_not_b64 exec, exec
3327; GFX9-NEXT:    v_mov_b32_e32 v2, -1
3328; GFX9-NEXT:    s_not_b64 exec, exec
3329; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3330; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3331; GFX9-NEXT:    s_nop 1
3332; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3333; GFX9-NEXT:    s_nop 1
3334; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3335; GFX9-NEXT:    s_nop 1
3336; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3337; GFX9-NEXT:    s_nop 1
3338; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3339; GFX9-NEXT:    s_nop 1
3340; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3341; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3342; GFX9-NEXT:    s_nop 0
3343; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3344; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3345; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3346; GFX9-NEXT:    ; implicit-def: $vgpr0
3347; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3348; GFX9-NEXT:    s_cbranch_execz .LBB14_2
3349; GFX9-NEXT:  ; %bb.1:
3350; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3351; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3352; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3353; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
3354; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3355; GFX9-NEXT:  .LBB14_2:
3356; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3357; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3358; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3359; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3360; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
3361; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3362; GFX9-NEXT:    s_mov_b32 s2, -1
3363; GFX9-NEXT:    s_nop 0
3364; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3365; GFX9-NEXT:    s_endpgm
3366;
3367; GFX1064-LABEL: and_i32_varying:
3368; GFX1064:       ; %bb.0: ; %entry
3369; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3370; GFX1064-NEXT:    s_not_b64 exec, exec
3371; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
3372; GFX1064-NEXT:    s_not_b64 exec, exec
3373; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3374; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3375; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
3376; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3377; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3378; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3379; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3380; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3381; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3382; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3383; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3384; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3385; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3386; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3387; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3388; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3389; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3390; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3391; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3392; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3393; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3394; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3395; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3396; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3397; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3398; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3399; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3400; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3401; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3402; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3403; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3404; GFX1064-NEXT:    s_mov_b32 s2, -1
3405; GFX1064-NEXT:    ; implicit-def: $vgpr0
3406; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3407; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
3408; GFX1064-NEXT:  ; %bb.1:
3409; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3410; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3411; GFX1064-NEXT:    s_mov_b32 s3, s7
3412; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3413; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3414; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
3415; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3416; GFX1064-NEXT:    buffer_gl0_inv
3417; GFX1064-NEXT:  .LBB14_2:
3418; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3419; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3420; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3421; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3422; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
3423; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3424; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3425; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3426; GFX1064-NEXT:    s_endpgm
3427;
3428; GFX1032-LABEL: and_i32_varying:
3429; GFX1032:       ; %bb.0: ; %entry
3430; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3431; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3432; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
3433; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3434; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3435; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3436; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3437; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3438; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3439; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3440; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3441; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3442; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3443; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3444; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3445; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
3446; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3447; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3448; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3449; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3450; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3451; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3452; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3453; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3454; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3455; GFX1032-NEXT:    s_mov_b32 s2, -1
3456; GFX1032-NEXT:    ; implicit-def: $vgpr0
3457; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3458; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
3459; GFX1032-NEXT:  ; %bb.1:
3460; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3461; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3462; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3463; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3464; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
3465; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3466; GFX1032-NEXT:    buffer_gl0_inv
3467; GFX1032-NEXT:  .LBB14_2:
3468; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3469; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3470; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3471; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3472; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
3473; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3474; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3475; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3476; GFX1032-NEXT:    s_endpgm
3477;
3478; GFX1164-LABEL: and_i32_varying:
3479; GFX1164:       ; %bb.0: ; %entry
3480; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3481; GFX1164-NEXT:    s_not_b64 exec, exec
3482; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
3483; GFX1164-NEXT:    s_not_b64 exec, exec
3484; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3485; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3486; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
3487; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3488; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3489; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3490; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3491; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3492; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3493; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3494; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3495; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3496; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3497; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3498; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3499; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3500; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3501; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3502; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3503; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3504; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3505; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3506; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3507; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3508; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3509; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3510; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3511; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3512; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3513; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3514; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3515; GFX1164-NEXT:    s_mov_b32 s2, -1
3516; GFX1164-NEXT:    ; implicit-def: $vgpr0
3517; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3518; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
3519; GFX1164-NEXT:  ; %bb.1:
3520; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3521; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3522; GFX1164-NEXT:    s_mov_b32 s3, s7
3523; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3524; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3525; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
3526; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3527; GFX1164-NEXT:    buffer_gl0_inv
3528; GFX1164-NEXT:  .LBB14_2:
3529; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3530; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3531; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3532; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
3533; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3534; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3535; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3536; GFX1164-NEXT:    s_endpgm
3537;
3538; GFX1132-LABEL: and_i32_varying:
3539; GFX1132:       ; %bb.0: ; %entry
3540; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3541; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3542; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
3543; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3544; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3545; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3546; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3547; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3548; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3549; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3550; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3551; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3552; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3553; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3554; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3555; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
3556; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3557; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3558; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3559; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3560; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3561; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3562; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3563; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3564; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3565; GFX1132-NEXT:    s_mov_b32 s2, -1
3566; GFX1132-NEXT:    ; implicit-def: $vgpr0
3567; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3568; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
3569; GFX1132-NEXT:  ; %bb.1:
3570; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3571; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3572; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3573; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3574; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
3575; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3576; GFX1132-NEXT:    buffer_gl0_inv
3577; GFX1132-NEXT:  .LBB14_2:
3578; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3579; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3580; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3581; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
3582; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3583; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3585; GFX1132-NEXT:    s_endpgm
3586entry:
3587  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3588  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3589  store i32 %old, i32 addrspace(1)* %out
3590  ret void
3591}
3592
3593define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
3594;
; Varying-operand OR test: every lane feeds its own workitem id
; (@llvm.amdgcn.workitem.id.x) into an acq_rel `atomicrmw or` on the
; addrspace(3) global @local_var32, and the value the atomic returns is
; stored through %out.
;
; How to read the expected output below:
;  * The GFX7LESS checks contain no DPP sequence; the per-lane operand goes
;    straight into a single ds_or_rtn_b32.
;  * For GFX8 and every later target, the checks show:
;      - 0 being written to the lanes selected by the s_not exec pair
;        (0 leaves an OR result unchanged),
;      - a chain of v_or_b32_dpp steps that combines values across lanes,
;      - one ds_or_rtn_b32 executed only where the mbcnt result equals 0,
;        using the value read from the highest lane (63 or 31),
;      - v_readfirstlane_b32 followed by v_or_b32 to merge the returned value
;        with a one-lane-shifted copy of the scan (presumably the exclusive
;        prefix for each lane; confirm against the optimization pass).
;
; The check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them instead of editing by hand.
3595;
3596; GFX7LESS-LABEL: or_i32_varying:
3597; GFX7LESS:       ; %bb.0: ; %entry
3598; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3599; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3600; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3601; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3602; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
3603; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3604; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3605; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3606; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3607; GFX7LESS-NEXT:    s_endpgm
3608;
3609; GFX8-LABEL: or_i32_varying:
3610; GFX8:       ; %bb.0: ; %entry
3611; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3612; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3613; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3614; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3615; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3616; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3617; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3618; GFX8-NEXT:    s_not_b64 exec, exec
3619; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3620; GFX8-NEXT:    s_not_b64 exec, exec
3621; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3622; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3623; GFX8-NEXT:    s_nop 1
3624; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3625; GFX8-NEXT:    s_nop 1
3626; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3627; GFX8-NEXT:    s_nop 1
3628; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3629; GFX8-NEXT:    s_nop 1
3630; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3631; GFX8-NEXT:    s_nop 1
3632; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3633; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3634; GFX8-NEXT:    s_nop 0
3635; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3636; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3637; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3638; GFX8-NEXT:    ; implicit-def: $vgpr0
3639; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3640; GFX8-NEXT:    s_cbranch_execz .LBB15_2
3641; GFX8-NEXT:  ; %bb.1:
3642; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3643; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3644; GFX8-NEXT:    s_mov_b32 m0, -1
3645; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3646; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
3647; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3648; GFX8-NEXT:  .LBB15_2:
3649; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3650; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3651; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3652; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3653; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
3654; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3655; GFX8-NEXT:    s_mov_b32 s2, -1
3656; GFX8-NEXT:    s_nop 0
3657; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3658; GFX8-NEXT:    s_endpgm
3659;
3660; GFX9-LABEL: or_i32_varying:
3661; GFX9:       ; %bb.0: ; %entry
3662; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3663; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3664; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3665; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3666; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3667; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3668; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3669; GFX9-NEXT:    s_not_b64 exec, exec
3670; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3671; GFX9-NEXT:    s_not_b64 exec, exec
3672; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3673; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3674; GFX9-NEXT:    s_nop 1
3675; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3676; GFX9-NEXT:    s_nop 1
3677; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3678; GFX9-NEXT:    s_nop 1
3679; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3680; GFX9-NEXT:    s_nop 1
3681; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3682; GFX9-NEXT:    s_nop 1
3683; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3684; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3685; GFX9-NEXT:    s_nop 0
3686; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3687; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3688; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3689; GFX9-NEXT:    ; implicit-def: $vgpr0
3690; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3691; GFX9-NEXT:    s_cbranch_execz .LBB15_2
3692; GFX9-NEXT:  ; %bb.1:
3693; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3694; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3696; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
3697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3698; GFX9-NEXT:  .LBB15_2:
3699; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3700; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3701; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3702; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3703; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
3704; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3705; GFX9-NEXT:    s_mov_b32 s2, -1
3706; GFX9-NEXT:    s_nop 0
3707; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3708; GFX9-NEXT:    s_endpgm
3709;
3710; GFX1064-LABEL: or_i32_varying:
3711; GFX1064:       ; %bb.0: ; %entry
3712; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3713; GFX1064-NEXT:    s_not_b64 exec, exec
3714; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3715; GFX1064-NEXT:    s_not_b64 exec, exec
3716; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3717; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3718; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3719; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3720; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3721; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3722; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3723; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3724; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3725; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3726; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3727; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3728; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3729; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3730; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3731; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3732; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3733; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3734; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3735; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3736; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3737; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3738; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3739; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3740; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3741; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3742; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3743; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3744; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3745; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3746; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3747; GFX1064-NEXT:    s_mov_b32 s2, -1
3748; GFX1064-NEXT:    ; implicit-def: $vgpr0
3749; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3750; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
3751; GFX1064-NEXT:  ; %bb.1:
3752; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
3753; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3754; GFX1064-NEXT:    s_mov_b32 s3, s7
3755; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3756; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3757; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v4
3758; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3759; GFX1064-NEXT:    buffer_gl0_inv
3760; GFX1064-NEXT:  .LBB15_2:
3761; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3762; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3763; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3764; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3765; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
3766; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3767; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3768; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3769; GFX1064-NEXT:    s_endpgm
3770;
3771; GFX1032-LABEL: or_i32_varying:
3772; GFX1032:       ; %bb.0: ; %entry
3773; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3774; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3775; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3776; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3777; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3778; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3779; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3780; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3781; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3782; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3783; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3784; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3785; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3786; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3787; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3788; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3789; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3790; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3791; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3792; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3793; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3794; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3795; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3796; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3797; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3798; GFX1032-NEXT:    s_mov_b32 s2, -1
3799; GFX1032-NEXT:    ; implicit-def: $vgpr0
3800; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3801; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
3802; GFX1032-NEXT:  ; %bb.1:
3803; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
3804; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3805; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3806; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3807; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v4
3808; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3809; GFX1032-NEXT:    buffer_gl0_inv
3810; GFX1032-NEXT:  .LBB15_2:
3811; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3812; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3813; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3814; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3815; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3816; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3817; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3818; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3819; GFX1032-NEXT:    s_endpgm
3820;
3821; GFX1164-LABEL: or_i32_varying:
3822; GFX1164:       ; %bb.0: ; %entry
3823; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
3824; GFX1164-NEXT:    s_not_b64 exec, exec
3825; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3826; GFX1164-NEXT:    s_not_b64 exec, exec
3827; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3828; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3829; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
3830; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3831; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3832; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3833; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
3834; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3835; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3836; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
3837; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
3838; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3839; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
3840; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3841; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3842; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3843; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3844; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
3845; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
3846; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3847; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3848; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
3849; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
3850; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
3851; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
3852; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
3853; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3854; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
3855; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
3856; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
3857; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3858; GFX1164-NEXT:    s_mov_b32 s2, -1
3859; GFX1164-NEXT:    ; implicit-def: $vgpr0
3860; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3861; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
3862; GFX1164-NEXT:  ; %bb.1:
3863; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
3864; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
3865; GFX1164-NEXT:    s_mov_b32 s3, s7
3866; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3867; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3868; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v4
3869; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3870; GFX1164-NEXT:    buffer_gl0_inv
3871; GFX1164-NEXT:  .LBB15_2:
3872; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3873; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
3874; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
3875; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
3876; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3877; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3878; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3879; GFX1164-NEXT:    s_endpgm
3880;
3881; GFX1132-LABEL: or_i32_varying:
3882; GFX1132:       ; %bb.0: ; %entry
3883; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
3884; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3885; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
3886; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
3887; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3888; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3889; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3890; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3891; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3892; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
3893; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3894; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3895; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
3896; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3897; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3898; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
3899; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
3900; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
3901; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3902; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3903; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3904; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
3905; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
3906; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
3907; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3908; GFX1132-NEXT:    s_mov_b32 s2, -1
3909; GFX1132-NEXT:    ; implicit-def: $vgpr0
3910; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3911; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
3912; GFX1132-NEXT:  ; %bb.1:
3913; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
3914; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
3915; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3916; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3917; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v4
3918; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3919; GFX1132-NEXT:    buffer_gl0_inv
3920; GFX1132-NEXT:  .LBB15_2:
3921; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3922; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
3923; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
3924; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
3925; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3926; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3927; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3928; GFX1132-NEXT:    s_endpgm
3929entry:
  ; The workitem id is the atomic's operand; the test name calls this the
  ; "varying" case.
3930  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; acq_rel OR into the addrspace(3) global; %old receives the value the
  ; atomic returns.
3931  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Storing %old keeps the atomic's result in use, which is why every target
  ; above checks the returning ds_or_rtn_b32 form.
3932  store i32 %old, i32 addrspace(1)* %out
3933  ret void
3934}
3935
3936define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3937;
3938;
3939; GFX7LESS-LABEL: xor_i32_varying:
3940; GFX7LESS:       ; %bb.0: ; %entry
3941; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3942; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3943; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3944; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3945; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3946; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3947; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3948; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3949; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3950; GFX7LESS-NEXT:    s_endpgm
3951;
3952; GFX8-LABEL: xor_i32_varying:
3953; GFX8:       ; %bb.0: ; %entry
3954; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3955; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3956; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3957; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3958; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
3959; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
3960; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3961; GFX8-NEXT:    s_not_b64 exec, exec
3962; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3963; GFX8-NEXT:    s_not_b64 exec, exec
3964; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3965; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3966; GFX8-NEXT:    s_nop 1
3967; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3968; GFX8-NEXT:    s_nop 1
3969; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3970; GFX8-NEXT:    s_nop 1
3971; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3972; GFX8-NEXT:    s_nop 1
3973; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3974; GFX8-NEXT:    s_nop 1
3975; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3976; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3977; GFX8-NEXT:    s_nop 0
3978; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3979; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3980; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
3981; GFX8-NEXT:    ; implicit-def: $vgpr0
3982; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3983; GFX8-NEXT:    s_cbranch_execz .LBB16_2
3984; GFX8-NEXT:  ; %bb.1:
3985; GFX8-NEXT:    v_mov_b32_e32 v0, 0
3986; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3987; GFX8-NEXT:    s_mov_b32 m0, -1
3988; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3989; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3990; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3991; GFX8-NEXT:  .LBB16_2:
3992; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3993; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3994; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3995; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3996; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3997; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3998; GFX8-NEXT:    s_mov_b32 s2, -1
3999; GFX8-NEXT:    s_nop 0
4000; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4001; GFX8-NEXT:    s_endpgm
4002;
4003; GFX9-LABEL: xor_i32_varying:
4004; GFX9:       ; %bb.0: ; %entry
4005; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4006; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4007; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4008; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4009; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4010; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4011; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4012; GFX9-NEXT:    s_not_b64 exec, exec
4013; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4014; GFX9-NEXT:    s_not_b64 exec, exec
4015; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4016; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4017; GFX9-NEXT:    s_nop 1
4018; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4019; GFX9-NEXT:    s_nop 1
4020; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4021; GFX9-NEXT:    s_nop 1
4022; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4023; GFX9-NEXT:    s_nop 1
4024; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4025; GFX9-NEXT:    s_nop 1
4026; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4027; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4028; GFX9-NEXT:    s_nop 0
4029; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4030; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4031; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4032; GFX9-NEXT:    ; implicit-def: $vgpr0
4033; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4034; GFX9-NEXT:    s_cbranch_execz .LBB16_2
4035; GFX9-NEXT:  ; %bb.1:
4036; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4037; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4038; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4039; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
4040; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4041; GFX9-NEXT:  .LBB16_2:
4042; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4043; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4044; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4045; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4046; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
4047; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4048; GFX9-NEXT:    s_mov_b32 s2, -1
4049; GFX9-NEXT:    s_nop 0
4050; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4051; GFX9-NEXT:    s_endpgm
4052;
4053; GFX1064-LABEL: xor_i32_varying:
4054; GFX1064:       ; %bb.0: ; %entry
4055; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4056; GFX1064-NEXT:    s_not_b64 exec, exec
4057; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4058; GFX1064-NEXT:    s_not_b64 exec, exec
4059; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4060; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4061; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4062; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4063; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4064; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4065; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4066; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4067; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4068; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4069; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4070; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4071; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4072; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4073; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4074; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4075; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4076; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4077; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4078; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4079; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4080; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4081; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4082; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4083; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4084; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4085; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4086; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4087; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4088; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4089; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4090; GFX1064-NEXT:    s_mov_b32 s2, -1
4091; GFX1064-NEXT:    ; implicit-def: $vgpr0
4092; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4093; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
4094; GFX1064-NEXT:  ; %bb.1:
4095; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
4096; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4097; GFX1064-NEXT:    s_mov_b32 s3, s7
4098; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4099; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4100; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4101; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4102; GFX1064-NEXT:    buffer_gl0_inv
4103; GFX1064-NEXT:  .LBB16_2:
4104; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4105; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4106; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4107; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4108; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
4109; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4110; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4111; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4112; GFX1064-NEXT:    s_endpgm
4113;
4114; GFX1032-LABEL: xor_i32_varying:
4115; GFX1032:       ; %bb.0: ; %entry
4116; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4117; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4118; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4119; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4120; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4121; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4122; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4123; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4124; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4125; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4126; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4127; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4128; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4129; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4130; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4131; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4132; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4133; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4134; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4135; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4136; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4137; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4138; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4139; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4140; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4141; GFX1032-NEXT:    s_mov_b32 s2, -1
4142; GFX1032-NEXT:    ; implicit-def: $vgpr0
4143; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4144; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
4145; GFX1032-NEXT:  ; %bb.1:
4146; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
4147; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4148; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4149; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4150; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4151; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4152; GFX1032-NEXT:    buffer_gl0_inv
4153; GFX1032-NEXT:  .LBB16_2:
4154; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4155; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4156; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4157; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4158; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
4159; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4160; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4161; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4162; GFX1032-NEXT:    s_endpgm
4163;
4164; GFX1164-LABEL: xor_i32_varying:
4165; GFX1164:       ; %bb.0: ; %entry
4166; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
4167; GFX1164-NEXT:    s_not_b64 exec, exec
4168; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4169; GFX1164-NEXT:    s_not_b64 exec, exec
4170; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4171; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4172; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
4173; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4174; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4175; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4176; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
4177; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4178; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4179; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
4180; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
4181; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4182; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
4183; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4184; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4185; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4186; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4187; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
4188; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
4189; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4190; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4191; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
4192; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
4193; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
4194; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
4195; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
4196; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4197; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
4198; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
4199; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
4200; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4201; GFX1164-NEXT:    s_mov_b32 s2, -1
4202; GFX1164-NEXT:    ; implicit-def: $vgpr0
4203; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4204; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
4205; GFX1164-NEXT:  ; %bb.1:
4206; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
4207; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
4208; GFX1164-NEXT:    s_mov_b32 s3, s7
4209; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4210; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4211; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4212; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4213; GFX1164-NEXT:    buffer_gl0_inv
4214; GFX1164-NEXT:  .LBB16_2:
4215; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4216; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
4217; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
4218; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
4219; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4220; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4221; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4222; GFX1164-NEXT:    s_endpgm
4223;
4224; GFX1132-LABEL: xor_i32_varying:
4225; GFX1132:       ; %bb.0: ; %entry
4226; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
4227; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4228; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4229; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
4230; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4231; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4232; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4233; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4234; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4235; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
4236; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4237; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4238; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4239; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4240; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4241; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
4242; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
4243; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
4244; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4245; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4246; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4247; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
4248; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
4249; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
4250; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4251; GFX1132-NEXT:    s_mov_b32 s2, -1
4252; GFX1132-NEXT:    ; implicit-def: $vgpr0
4253; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4254; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
4255; GFX1132-NEXT:  ; %bb.1:
4256; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
4257; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
4258; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4259; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4260; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
4261; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4262; GFX1132-NEXT:    buffer_gl0_inv
4263; GFX1132-NEXT:  .LBB16_2:
4264; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4265; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
4266; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
4267; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
4268; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4269; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4270; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4271; GFX1132-NEXT:    s_endpgm
4272entry:
4273  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4274  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4275  store i32 %old, i32 addrspace(1)* %out
4276  ret void
4277}
4278
; max_i32_varying: 32-bit atomic max on @local_var32 in which every lane
; supplies a different operand (its own workitem id).
;
; What the autogenerated assertions below expect:
;  * For the llc invocation without -mcpu: no cross-lane reduction; a single
;    ds_max_rtn_i32 takes the per-lane value in v0 directly.
;  * For every other llc invocation: lanes that are not active are first
;    filled with v_bfrev_b32 of 1 (0x80000000, the smallest signed i32), a
;    v_max_i32_dpp scan combines the values across lanes, only the lane whose
;    v_mbcnt count is 0 issues ds_max_rtn_i32 with the scan total, and the
;    returned value is broadcast with v_readfirstlane_b32 and combined with
;    each lane's shifted scan value by a final v_max_i32.
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB17_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB17_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB17_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB17_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB17_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB17_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB17_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: max_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB17_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; The operand is the workitem id, so it differs from lane to lane; this is
  ; what makes the case "varying" rather than "constant".
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Atomic max with acq_rel ordering on the addrspace(3) global; %old receives
  ; the value that was in memory before the update.
  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the returned value through the addrspace(1) kernel argument so the
  ; atomic's result is used.
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
4621
4622define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
4623;
4624;
4625; GFX7LESS-LABEL: max_i64_constant:
4626; GFX7LESS:       ; %bb.0: ; %entry
4627; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4628; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4629; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4630; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4631; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4632; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4633; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
4634; GFX7LESS-NEXT:  ; %bb.1:
4635; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
4636; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4637; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4638; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4639; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4640; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4641; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4642; GFX7LESS-NEXT:  .LBB18_2:
4643; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4644; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4645; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4646; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4647; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
4648; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4649; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4650; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4651; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4652; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
4653; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
4654; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4655; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4656; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4657; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4658; GFX7LESS-NEXT:    s_endpgm
4659;
4660; GFX8-LABEL: max_i64_constant:
4661; GFX8:       ; %bb.0: ; %entry
4662; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4663; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4664; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4665; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4666; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4667; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4668; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4669; GFX8-NEXT:  ; %bb.1:
4670; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4671; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4672; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4673; GFX8-NEXT:    s_mov_b32 m0, -1
4674; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4675; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4676; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4677; GFX8-NEXT:  .LBB18_2:
4678; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4679; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4680; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4681; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
4682; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4683; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4684; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4685; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4686; GFX8-NEXT:    v_mov_b32_e32 v2, s3
4687; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4688; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4689; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4690; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4691; GFX8-NEXT:    s_mov_b32 s2, -1
4692; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4693; GFX8-NEXT:    s_endpgm
4694;
4695; GFX9-LABEL: max_i64_constant:
4696; GFX9:       ; %bb.0: ; %entry
4697; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4698; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4699; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4700; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4701; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4702; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4703; GFX9-NEXT:    s_cbranch_execz .LBB18_2
4704; GFX9-NEXT:  ; %bb.1:
4705; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4706; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4707; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4708; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4709; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4710; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4711; GFX9-NEXT:  .LBB18_2:
4712; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4714; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4715; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
4716; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4717; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4718; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4719; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4720; GFX9-NEXT:    v_mov_b32_e32 v2, s3
4721; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4722; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4723; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4724; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4725; GFX9-NEXT:    s_mov_b32 s2, -1
4726; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4727; GFX9-NEXT:    s_endpgm
4728;
4729; GFX1064-LABEL: max_i64_constant:
4730; GFX1064:       ; %bb.0: ; %entry
4731; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4732; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4733; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4734; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4735; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4736; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4737; GFX1064-NEXT:    s_cbranch_execz .LBB18_2
4738; GFX1064-NEXT:  ; %bb.1:
4739; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4740; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4741; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
4742; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4743; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4744; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4745; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4746; GFX1064-NEXT:    buffer_gl0_inv
4747; GFX1064-NEXT:  .LBB18_2:
4748; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4749; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4750; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4751; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4752; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4753; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4754; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4755; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4756; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4757; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4758; GFX1064-NEXT:    s_mov_b32 s2, -1
4759; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4760; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4761; GFX1064-NEXT:    s_endpgm
4762;
4763; GFX1032-LABEL: max_i64_constant:
4764; GFX1032:       ; %bb.0: ; %entry
4765; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4766; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4767; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4768; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4769; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4770; GFX1032-NEXT:    s_cbranch_execz .LBB18_2
4771; GFX1032-NEXT:  ; %bb.1:
4772; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4773; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4774; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
4775; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4776; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4777; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4778; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4779; GFX1032-NEXT:    buffer_gl0_inv
4780; GFX1032-NEXT:  .LBB18_2:
4781; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4782; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4783; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4784; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4785; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4786; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4787; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4788; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4789; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4790; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4791; GFX1032-NEXT:    s_mov_b32 s2, -1
4792; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4793; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4794; GFX1032-NEXT:    s_endpgm
4795;
4796; GFX1164-LABEL: max_i64_constant:
4797; GFX1164:       ; %bb.0: ; %entry
4798; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4799; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4800; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4801; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4802; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
4803; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4804; GFX1164-NEXT:    s_cbranch_execz .LBB18_2
4805; GFX1164-NEXT:  ; %bb.1:
4806; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
4807; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
4808; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
4809; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4810; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
4811; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4812; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4813; GFX1164-NEXT:    buffer_gl0_inv
4814; GFX1164-NEXT:  .LBB18_2:
4815; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
4816; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
4817; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
4818; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
4819; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4820; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
4821; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4822; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4823; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4824; GFX1164-NEXT:    s_mov_b32 s2, -1
4825; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4826; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4827; GFX1164-NEXT:    s_endpgm
4828;
4829; GFX1132-LABEL: max_i64_constant:
4830; GFX1132:       ; %bb.0: ; %entry
4831; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
4832; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4833; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4834; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
4835; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4836; GFX1132-NEXT:    s_cbranch_execz .LBB18_2
4837; GFX1132-NEXT:  ; %bb.1:
4838; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
4839; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
4840; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
4841; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4842; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
4843; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
4844; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4845; GFX1132-NEXT:    buffer_gl0_inv
4846; GFX1132-NEXT:  .LBB18_2:
4847; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4848; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
4849; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
4850; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
4851; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4852; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
4853; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4854; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4855; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4856; GFX1132-NEXT:    s_mov_b32 s2, -1
4857; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4858; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
4859; GFX1132-NEXT:    s_endpgm
4860entry:
4861  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
4862  store i64 %old, i64 addrspace(1)* %out
4863  ret void
4864}
4865
; Signed-min atomicrmw (acq_rel) on the LDS global @local_var32 where every
; lane supplies a different operand (its workitem id in x). The value returned
; by the atomic is stored to %out.
;
; What the expected output below shows:
;  - First RUN line (no -mcpu): no wave-level reduction; the workitem id in v0
;    is passed straight to ds_min_rtn_i32.
;  - tonga / gfx900: inactive lanes are filled with 0x7fffffff (v_bfrev_b32 of
;    -2), the values are combined across the wave with v_min_i32_dpp (row_shr
;    1/2/4/8, then row_bcast 15/31), lane 63 is read back as the wave-wide
;    minimum, and only the lane whose mbcnt result is 0 issues ds_min_rtn_i32.
;    Afterwards each lane takes v_min_i32 of the v_readfirstlane'd atomic
;    result and its own lane-shifted (wave_shr:1) partial result.
;  - gfx1010 / gfx1100: same scheme, but the cross-row steps use
;    v_permlanex16_b32 plus v_readlane/v_writelane; the wave64 runs feed the
;    atomic from lane 63 and the wave32 runs from lane 31.
define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: min_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB19_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB19_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: min_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB19_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB19_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB19_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB19_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: min_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB19_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: min_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB19_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; Per-lane (non-uniform) operand for the atomic.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is the value the atomic returns; storing it keeps the "rtn" form of
  ; the DS instruction in the expected output above.
  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
5208
; Signed-min atomicrmw (acq_rel) on the LDS global @local_var64 with the
; constant operand 5; the value returned by the atomic is stored to %out.
;
; The operand is identical in every lane and no DPP reduction appears in any
; of the expected outputs below. In each configuration:
;  - the lane whose mbcnt result is 0 (vcc set) issues a single
;    ds_min_rtn_i64 with the value 5;
;  - the returned 64-bit value is broadcast with two v_readfirstlane_b32;
;  - every lane then selects 0x7fffffffffffffff if vcc is set for it, or 5
;    otherwise, and takes the signed 64-bit minimum of that and the broadcast
;    value (v_cmp_lt_i64 followed by two v_cndmask_b32) before the store.
define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: min_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB20_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: min_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB20_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB20_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: min_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB20_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB20_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB20_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB20_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB20_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB20_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB20_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB20_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: min_i64_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB20_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB20_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  ; Uniform operand (the literal 5); %old is the value returned by the atomic.
  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
5452
; umax_i32_varying: each work-item issues an acq_rel `atomicrmw umax` on the
; LDS (addrspace(3)) global @local_var32, using its own workitem.id.x as the
; operand, and stores the returned old value to %out. Because the operand
; differs per lane, this exercises the "varying value" path of the atomic
; optimizer.
;
; What the autogenerated assertions below show, per run line:
;  * GFX7LESS run: no cross-lane reduction is emitted; every lane performs
;    its own ds_max_rtn_u32.
;  * GFX8 / GFX9 runs: inactive lanes are filled with 0, a v_max_u32_dpp
;    sequence (row_shr 1/2/4/8, then row_bcast 15/31) combines the lanes,
;    lane 63 is read into an SGPR and a single lane (mbcnt result == 0)
;    issues one ds_max_rtn_u32. The returned value is broadcast with
;    v_readfirstlane and combined with the wave_shr:1 shifted scan value via
;    v_max_u32 to form each lane's result.
;  * GFX10xx / GFX11xx runs: same overall shape, but the cross-row steps use
;    v_permlanex16 / v_readlane / v_writelane instead of row_bcast and
;    wave_shr; the wave32 variants only need v_mbcnt_lo and read lane 31.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them with that script rather than
; editing them by hand.
5453define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
5454;
5455;
5456; GFX7LESS-LABEL: umax_i32_varying:
5457; GFX7LESS:       ; %bb.0: ; %entry
5458; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5459; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5460; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5461; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5462; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
5463; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5464; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5465; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5466; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5467; GFX7LESS-NEXT:    s_endpgm
5468;
5469; GFX8-LABEL: umax_i32_varying:
5470; GFX8:       ; %bb.0: ; %entry
5471; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5472; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5473; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5474; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5475; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5476; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5477; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5478; GFX8-NEXT:    s_not_b64 exec, exec
5479; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5480; GFX8-NEXT:    s_not_b64 exec, exec
5481; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
5482; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5483; GFX8-NEXT:    s_nop 1
5484; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5485; GFX8-NEXT:    s_nop 1
5486; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5487; GFX8-NEXT:    s_nop 1
5488; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5489; GFX8-NEXT:    s_nop 1
5490; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5491; GFX8-NEXT:    s_nop 1
5492; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5493; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
5494; GFX8-NEXT:    s_nop 0
5495; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5496; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
5497; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5498; GFX8-NEXT:    ; implicit-def: $vgpr0
5499; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5500; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5501; GFX8-NEXT:  ; %bb.1:
5502; GFX8-NEXT:    v_mov_b32_e32 v0, 0
5503; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5504; GFX8-NEXT:    s_mov_b32 m0, -1
5505; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5506; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
5507; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5508; GFX8-NEXT:  .LBB21_2:
5509; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5510; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5511; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5512; GFX8-NEXT:    v_mov_b32_e32 v0, v1
5513; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
5514; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5515; GFX8-NEXT:    s_mov_b32 s2, -1
5516; GFX8-NEXT:    s_nop 0
5517; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5518; GFX8-NEXT:    s_endpgm
5519;
5520; GFX9-LABEL: umax_i32_varying:
5521; GFX9:       ; %bb.0: ; %entry
5522; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5523; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5524; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5525; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5526; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
5527; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
5528; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5529; GFX9-NEXT:    s_not_b64 exec, exec
5530; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5531; GFX9-NEXT:    s_not_b64 exec, exec
5532; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
5533; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5534; GFX9-NEXT:    s_nop 1
5535; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5536; GFX9-NEXT:    s_nop 1
5537; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5538; GFX9-NEXT:    s_nop 1
5539; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5540; GFX9-NEXT:    s_nop 1
5541; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5542; GFX9-NEXT:    s_nop 1
5543; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5544; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
5545; GFX9-NEXT:    s_nop 0
5546; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5547; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
5548; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
5549; GFX9-NEXT:    ; implicit-def: $vgpr0
5550; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5551; GFX9-NEXT:    s_cbranch_execz .LBB21_2
5552; GFX9-NEXT:  ; %bb.1:
5553; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5554; GFX9-NEXT:    v_mov_b32_e32 v3, s4
5555; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5556; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
5557; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5558; GFX9-NEXT:  .LBB21_2:
5559; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5560; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5561; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5562; GFX9-NEXT:    v_mov_b32_e32 v0, v1
5563; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
5564; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5565; GFX9-NEXT:    s_mov_b32 s2, -1
5566; GFX9-NEXT:    s_nop 0
5567; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5568; GFX9-NEXT:    s_endpgm
5569;
5570; GFX1064-LABEL: umax_i32_varying:
5571; GFX1064:       ; %bb.0: ; %entry
5572; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
5573; GFX1064-NEXT:    s_not_b64 exec, exec
5574; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5575; GFX1064-NEXT:    s_not_b64 exec, exec
5576; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5577; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5578; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5579; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5580; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5581; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5582; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
5583; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5584; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5585; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
5586; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
5587; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5588; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
5589; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5590; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5591; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5592; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5593; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
5594; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
5595; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5596; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5597; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
5598; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
5599; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
5600; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
5601; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
5602; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5603; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
5604; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
5605; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
5606; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5607; GFX1064-NEXT:    s_mov_b32 s2, -1
5608; GFX1064-NEXT:    ; implicit-def: $vgpr0
5609; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5610; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
5611; GFX1064-NEXT:  ; %bb.1:
5612; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
5613; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
5614; GFX1064-NEXT:    s_mov_b32 s3, s7
5615; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5616; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5617; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
5618; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5619; GFX1064-NEXT:    buffer_gl0_inv
5620; GFX1064-NEXT:  .LBB21_2:
5621; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5622; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5623; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
5624; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
5625; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
5626; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5627; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5628; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5629; GFX1064-NEXT:    s_endpgm
5630;
5631; GFX1032-LABEL: umax_i32_varying:
5632; GFX1032:       ; %bb.0: ; %entry
5633; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
5634; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5635; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5636; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
5637; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5638; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5639; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5640; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5641; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5642; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
5643; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5644; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5645; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5646; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5647; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5648; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5649; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
5650; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
5651; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5652; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5653; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5654; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
5655; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
5656; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
5657; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5658; GFX1032-NEXT:    s_mov_b32 s2, -1
5659; GFX1032-NEXT:    ; implicit-def: $vgpr0
5660; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5661; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
5662; GFX1032-NEXT:  ; %bb.1:
5663; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
5664; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
5665; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5666; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5667; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
5668; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5669; GFX1032-NEXT:    buffer_gl0_inv
5670; GFX1032-NEXT:  .LBB21_2:
5671; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5672; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5673; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
5674; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
5675; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
5676; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5677; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5678; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5679; GFX1032-NEXT:    s_endpgm
5680;
5681; GFX1164-LABEL: umax_i32_varying:
5682; GFX1164:       ; %bb.0: ; %entry
5683; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
5684; GFX1164-NEXT:    s_not_b64 exec, exec
5685; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5686; GFX1164-NEXT:    s_not_b64 exec, exec
5687; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5688; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5689; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5690; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5691; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5692; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5693; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
5694; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5695; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5696; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
5697; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
5698; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5699; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
5700; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5701; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5702; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5703; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5704; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
5705; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
5706; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5707; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5708; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
5709; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
5710; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
5711; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
5712; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
5713; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5714; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
5715; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
5716; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
5717; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5718; GFX1164-NEXT:    s_mov_b32 s2, -1
5719; GFX1164-NEXT:    ; implicit-def: $vgpr0
5720; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5721; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
5722; GFX1164-NEXT:  ; %bb.1:
5723; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
5724; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
5725; GFX1164-NEXT:    s_mov_b32 s3, s7
5726; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5727; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5728; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v4
5729; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5730; GFX1164-NEXT:    buffer_gl0_inv
5731; GFX1164-NEXT:  .LBB21_2:
5732; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5733; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
5734; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
5735; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
5736; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5737; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5738; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5739; GFX1164-NEXT:    s_endpgm
5740;
5741; GFX1132-LABEL: umax_i32_varying:
5742; GFX1132:       ; %bb.0: ; %entry
5743; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
5744; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5745; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5746; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
5747; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5748; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5749; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5750; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5751; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5752; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
5753; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
5754; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5755; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5756; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5757; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5758; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5759; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
5760; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
5761; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5762; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5763; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5764; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
5765; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
5766; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
5767; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5768; GFX1132-NEXT:    s_mov_b32 s2, -1
5769; GFX1132-NEXT:    ; implicit-def: $vgpr0
5770; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
5771; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
5772; GFX1132-NEXT:  ; %bb.1:
5773; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
5774; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
5775; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5776; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
5777; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v4
5778; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5779; GFX1132-NEXT:    buffer_gl0_inv
5780; GFX1132-NEXT:  .LBB21_2:
5781; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
5782; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
5783; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
5784; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
5785; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5786; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5787; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5788; GFX1132-NEXT:    s_endpgm
; IR under test: the per-lane operand comes from workitem.id.x, so it is not
; uniform across the wave; the old value returned by the atomic is stored.
5789entry:
5790  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5791  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
5792  store i32 %old, i32 addrspace(1)* %out
5793  ret void
5794}
5795
; umax_i64_constant: every work-item issues an acq_rel `atomicrmw umax` of the
; constant 5 on the 64-bit LDS (addrspace(3)) global @local_var64 and stores
; the returned old value to %out. The operand is the same in every lane, so
; this exercises the "uniform value" path of the atomic optimizer.
;
; What the autogenerated assertions below show, for every run line including
; GFX7LESS:
;  * v_mbcnt_lo (plus v_mbcnt_hi on the wave64 runs) is compared against 0 and
;    only the lane(s) passing that compare execute a single ds_max_rtn_u64
;    with the value 5 under s_and_saveexec.
;  * After exec is restored, the returned 64-bit old value is broadcast with
;    two v_readfirstlane_b32, and each lane rebuilds its own result with
;    v_cndmask (0 for the lane that issued the atomic, 5 otherwise) followed
;    by a v_cmp_gt_u64 / v_cndmask pair, i.e. an unsigned 64-bit max.
;  * The wave32 runs (GFX1032, GFX1132) use only v_mbcnt_lo and 32-bit exec
;    masks.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them with that script rather than
; editing them by hand.
5796define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
5797;
5798;
5799; GFX7LESS-LABEL: umax_i64_constant:
5800; GFX7LESS:       ; %bb.0: ; %entry
5801; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5802; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5803; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5804; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5805; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5806; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5807; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
5808; GFX7LESS-NEXT:  ; %bb.1:
5809; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
5810; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
5811; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5812; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5813; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5814; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5815; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5816; GFX7LESS-NEXT:  .LBB22_2:
5817; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
5818; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5819; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
5820; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
5821; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5822; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5823; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5824; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
5825; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
5826; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5827; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
5828; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5829; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5830; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5831; GFX7LESS-NEXT:    s_endpgm
5832;
5833; GFX8-LABEL: umax_i64_constant:
5834; GFX8:       ; %bb.0: ; %entry
5835; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5836; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5837; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5838; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5839; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5840; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5841; GFX8-NEXT:    s_cbranch_execz .LBB22_2
5842; GFX8-NEXT:  ; %bb.1:
5843; GFX8-NEXT:    v_mov_b32_e32 v0, 5
5844; GFX8-NEXT:    v_mov_b32_e32 v2, 0
5845; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5846; GFX8-NEXT:    s_mov_b32 m0, -1
5847; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5848; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5849; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5850; GFX8-NEXT:  .LBB22_2:
5851; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
5852; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5853; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
5854; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
5855; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5856; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5857; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5858; GFX8-NEXT:    v_mov_b32_e32 v2, s2
5859; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5860; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5861; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5862; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5863; GFX8-NEXT:    s_mov_b32 s2, -1
5864; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5865; GFX8-NEXT:    s_endpgm
5866;
5867; GFX9-LABEL: umax_i64_constant:
5868; GFX9:       ; %bb.0: ; %entry
5869; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5870; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5871; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5872; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5873; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5874; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5875; GFX9-NEXT:    s_cbranch_execz .LBB22_2
5876; GFX9-NEXT:  ; %bb.1:
5877; GFX9-NEXT:    v_mov_b32_e32 v0, 5
5878; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5879; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5880; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5881; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5882; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5883; GFX9-NEXT:  .LBB22_2:
5884; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5885; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5886; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
5887; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
5888; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5889; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5890; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5891; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5892; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5893; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5894; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5895; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5896; GFX9-NEXT:    s_mov_b32 s2, -1
5897; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5898; GFX9-NEXT:    s_endpgm
5899;
5900; GFX1064-LABEL: umax_i64_constant:
5901; GFX1064:       ; %bb.0: ; %entry
5902; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5903; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5904; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5905; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5906; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5907; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5908; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
5909; GFX1064-NEXT:  ; %bb.1:
5910; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
5911; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5912; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
5913; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5914; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
5915; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5916; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5917; GFX1064-NEXT:    buffer_gl0_inv
5918; GFX1064-NEXT:  .LBB22_2:
5919; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5920; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
5921; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5922; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5923; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5924; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5925; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5926; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5927; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5928; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5929; GFX1064-NEXT:    s_mov_b32 s2, -1
5930; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5931; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5932; GFX1064-NEXT:    s_endpgm
5933;
5934; GFX1032-LABEL: umax_i64_constant:
5935; GFX1032:       ; %bb.0: ; %entry
5936; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5937; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5938; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5939; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5940; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5941; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
5942; GFX1032-NEXT:  ; %bb.1:
5943; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
5944; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5945; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
5946; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5947; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
5948; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5949; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5950; GFX1032-NEXT:    buffer_gl0_inv
5951; GFX1032-NEXT:  .LBB22_2:
5952; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5953; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5954; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5955; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5956; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5957; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
5958; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
5959; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
5960; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
5961; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5962; GFX1032-NEXT:    s_mov_b32 s2, -1
5963; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5964; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5965; GFX1032-NEXT:    s_endpgm
5966;
5967; GFX1164-LABEL: umax_i64_constant:
5968; GFX1164:       ; %bb.0: ; %entry
5969; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
5970; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5971; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5972; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5973; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5974; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5975; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
5976; GFX1164-NEXT:  ; %bb.1:
5977; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
5978; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5979; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
5980; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5981; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
5982; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
5983; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5984; GFX1164-NEXT:    buffer_gl0_inv
5985; GFX1164-NEXT:  .LBB22_2:
5986; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
5987; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5988; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5989; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5990; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
5991; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
5992; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
5993; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
5994; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5995; GFX1164-NEXT:    s_mov_b32 s2, -1
5996; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5997; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5998; GFX1164-NEXT:    s_endpgm
5999;
6000; GFX1132-LABEL: umax_i64_constant:
6001; GFX1132:       ; %bb.0: ; %entry
6002; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6003; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6004; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6005; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6006; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6007; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
6008; GFX1132-NEXT:  ; %bb.1:
6009; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6010; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6011; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6012; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6013; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6014; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
6015; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6016; GFX1132-NEXT:    buffer_gl0_inv
6017; GFX1132-NEXT:  .LBB22_2:
6018; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6019; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6020; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6021; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6022; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
6023; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
6024; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6025; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
6026; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6027; GFX1132-NEXT:    s_mov_b32 s2, -1
6028; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6029; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6030; GFX1132-NEXT:    s_endpgm
; IR under test: the operand is the literal 5 in every lane; the old value
; returned by the atomic is stored to %out.
6031entry:
6032  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
6033  store i64 %old, i64 addrspace(1)* %out
6034  ret void
6035}
6036
6037define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
6038;
6039;
6040; GFX7LESS-LABEL: umin_i32_varying:
6041; GFX7LESS:       ; %bb.0: ; %entry
6042; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6043; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6044; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6045; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6046; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
6047; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6048; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6049; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6050; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6051; GFX7LESS-NEXT:    s_endpgm
6052;
6053; GFX8-LABEL: umin_i32_varying:
6054; GFX8:       ; %bb.0: ; %entry
6055; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6056; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6057; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6058; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6059; GFX8-NEXT:    v_mov_b32_e32 v1, -1
6060; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6061; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6062; GFX8-NEXT:    s_not_b64 exec, exec
6063; GFX8-NEXT:    v_mov_b32_e32 v2, -1
6064; GFX8-NEXT:    s_not_b64 exec, exec
6065; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
6066; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6067; GFX8-NEXT:    s_nop 1
6068; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6069; GFX8-NEXT:    s_nop 1
6070; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6071; GFX8-NEXT:    s_nop 1
6072; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6073; GFX8-NEXT:    s_nop 1
6074; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6075; GFX8-NEXT:    s_nop 1
6076; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6077; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
6078; GFX8-NEXT:    s_nop 0
6079; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6080; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
6081; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6082; GFX8-NEXT:    ; implicit-def: $vgpr0
6083; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6084; GFX8-NEXT:    s_cbranch_execz .LBB23_2
6085; GFX8-NEXT:  ; %bb.1:
6086; GFX8-NEXT:    v_mov_b32_e32 v0, 0
6087; GFX8-NEXT:    v_mov_b32_e32 v3, s4
6088; GFX8-NEXT:    s_mov_b32 m0, -1
6089; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6090; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
6091; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6092; GFX8-NEXT:  .LBB23_2:
6093; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6094; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6095; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
6096; GFX8-NEXT:    v_mov_b32_e32 v0, v1
6097; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
6098; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6099; GFX8-NEXT:    s_mov_b32 s2, -1
6100; GFX8-NEXT:    s_nop 0
6101; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6102; GFX8-NEXT:    s_endpgm
6103;
6104; GFX9-LABEL: umin_i32_varying:
6105; GFX9:       ; %bb.0: ; %entry
6106; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6107; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6108; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6109; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6110; GFX9-NEXT:    v_mov_b32_e32 v1, -1
6111; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6112; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6113; GFX9-NEXT:    s_not_b64 exec, exec
6114; GFX9-NEXT:    v_mov_b32_e32 v2, -1
6115; GFX9-NEXT:    s_not_b64 exec, exec
6116; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
6117; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6118; GFX9-NEXT:    s_nop 1
6119; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6120; GFX9-NEXT:    s_nop 1
6121; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6122; GFX9-NEXT:    s_nop 1
6123; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6124; GFX9-NEXT:    s_nop 1
6125; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6126; GFX9-NEXT:    s_nop 1
6127; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6128; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
6129; GFX9-NEXT:    s_nop 0
6130; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6131; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
6132; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6133; GFX9-NEXT:    ; implicit-def: $vgpr0
6134; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6135; GFX9-NEXT:    s_cbranch_execz .LBB23_2
6136; GFX9-NEXT:  ; %bb.1:
6137; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6138; GFX9-NEXT:    v_mov_b32_e32 v3, s4
6139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6140; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
6141; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6142; GFX9-NEXT:  .LBB23_2:
6143; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6145; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
6146; GFX9-NEXT:    v_mov_b32_e32 v0, v1
6147; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
6148; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6149; GFX9-NEXT:    s_mov_b32 s2, -1
6150; GFX9-NEXT:    s_nop 0
6151; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6152; GFX9-NEXT:    s_endpgm
6153;
6154; GFX1064-LABEL: umin_i32_varying:
6155; GFX1064:       ; %bb.0: ; %entry
6156; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
6157; GFX1064-NEXT:    s_not_b64 exec, exec
6158; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
6159; GFX1064-NEXT:    s_not_b64 exec, exec
6160; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6161; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6162; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
6163; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6164; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6165; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6166; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
6167; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6168; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6169; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
6170; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
6171; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6172; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
6173; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6174; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6175; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6176; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6177; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
6178; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
6179; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6180; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6181; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
6182; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
6183; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
6184; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
6185; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
6186; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6187; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
6188; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
6189; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
6190; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6191; GFX1064-NEXT:    s_mov_b32 s2, -1
6192; GFX1064-NEXT:    ; implicit-def: $vgpr0
6193; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6194; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
6195; GFX1064-NEXT:  ; %bb.1:
6196; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
6197; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
6198; GFX1064-NEXT:    s_mov_b32 s3, s7
6199; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6200; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6201; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
6202; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6203; GFX1064-NEXT:    buffer_gl0_inv
6204; GFX1064-NEXT:  .LBB23_2:
6205; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6206; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
6207; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
6208; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
6209; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
6210; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6211; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6212; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6213; GFX1064-NEXT:    s_endpgm
6214;
6215; GFX1032-LABEL: umin_i32_varying:
6216; GFX1032:       ; %bb.0: ; %entry
6217; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
6218; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6219; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
6220; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
6221; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6222; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6223; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6224; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6225; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6226; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
6227; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6228; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6229; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6230; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6231; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6232; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
6233; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
6234; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
6235; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6236; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6237; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6238; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
6239; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
6240; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
6241; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6242; GFX1032-NEXT:    s_mov_b32 s2, -1
6243; GFX1032-NEXT:    ; implicit-def: $vgpr0
6244; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6245; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
6246; GFX1032-NEXT:  ; %bb.1:
6247; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
6248; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
6249; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6250; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6251; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
6252; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6253; GFX1032-NEXT:    buffer_gl0_inv
6254; GFX1032-NEXT:  .LBB23_2:
6255; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6256; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6257; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
6258; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
6259; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
6260; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6261; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6262; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6263; GFX1032-NEXT:    s_endpgm
6264;
6265; GFX1164-LABEL: umin_i32_varying:
6266; GFX1164:       ; %bb.0: ; %entry
6267; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
6268; GFX1164-NEXT:    s_not_b64 exec, exec
6269; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
6270; GFX1164-NEXT:    s_not_b64 exec, exec
6271; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6272; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6273; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
6274; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6275; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6276; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6277; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
6278; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6279; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6280; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
6281; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
6282; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6283; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
6284; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6285; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6286; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6287; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6288; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
6289; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
6290; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6291; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6292; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
6293; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
6294; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
6295; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
6296; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
6297; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6298; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
6299; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
6300; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
6301; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6302; GFX1164-NEXT:    s_mov_b32 s2, -1
6303; GFX1164-NEXT:    ; implicit-def: $vgpr0
6304; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6305; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
6306; GFX1164-NEXT:  ; %bb.1:
6307; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
6308; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
6309; GFX1164-NEXT:    s_mov_b32 s3, s7
6310; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6311; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6312; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
6313; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6314; GFX1164-NEXT:    buffer_gl0_inv
6315; GFX1164-NEXT:  .LBB23_2:
6316; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
6317; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
6318; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
6319; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
6320; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6321; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6322; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6323; GFX1164-NEXT:    s_endpgm
6324;
6325; GFX1132-LABEL: umin_i32_varying:
6326; GFX1132:       ; %bb.0: ; %entry
6327; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
6328; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6329; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
6330; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
6331; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6332; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6333; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6334; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6335; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6336; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
6337; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
6338; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6339; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6340; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6341; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6342; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
6343; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
6344; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
6345; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6346; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6347; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6348; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
6349; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
6350; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
6351; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6352; GFX1132-NEXT:    s_mov_b32 s2, -1
6353; GFX1132-NEXT:    ; implicit-def: $vgpr0
6354; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6355; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
6356; GFX1132-NEXT:  ; %bb.1:
6357; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
6358; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
6359; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6360; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6361; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
6362; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6363; GFX1132-NEXT:    buffer_gl0_inv
6364; GFX1132-NEXT:  .LBB23_2:
6365; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6366; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
6367; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
6368; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
6369; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6370; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6371; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6372; GFX1132-NEXT:    s_endpgm
6373entry:
6374  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6375  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
6376  store i32 %old, i32 addrspace(1)* %out
6377  ret void
6378}
6379
6380define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
6381; Uniform-operand case: the kernel performs an acq_rel i64 atomicrmw umin of the constant 5 on @local_var64 (addrspace(3)) and stores the returned old value to %out.
6382; In the checks for every target, only the lane whose mbcnt result is 0 issues ds_min_rtn_u64; v_readfirstlane then broadcasts the old value, which is combined with a per-lane select of 5 or -1 through v_cmp_lt_u64 and v_cndmask before the store.
6383; GFX7LESS-LABEL: umin_i64_constant:
6384; GFX7LESS:       ; %bb.0: ; %entry
6385; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6386; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6387; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6388; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6389; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
6390; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6391; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
6392; GFX7LESS-NEXT:  ; %bb.1:
6393; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
6394; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
6395; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
6396; GFX7LESS-NEXT:    s_mov_b32 m0, -1
6397; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6398; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6399; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6400; GFX7LESS-NEXT:  .LBB24_2:
6401; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
6402; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
6403; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
6404; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
6405; GFX7LESS-NEXT:    s_mov_b32 s2, -1
6406; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6407; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6408; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
6409; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6410; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6411; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
6412; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6413; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
6414; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6415; GFX7LESS-NEXT:    s_endpgm
6416;
6417; GFX8-LABEL: umin_i64_constant:
6418; GFX8:       ; %bb.0: ; %entry
6419; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6420; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6421; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6422; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6423; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6424; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6425; GFX8-NEXT:    s_cbranch_execz .LBB24_2
6426; GFX8-NEXT:  ; %bb.1:
6427; GFX8-NEXT:    v_mov_b32_e32 v0, 5
6428; GFX8-NEXT:    v_mov_b32_e32 v2, 0
6429; GFX8-NEXT:    v_mov_b32_e32 v1, 0
6430; GFX8-NEXT:    s_mov_b32 m0, -1
6431; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6432; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6433; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6434; GFX8-NEXT:  .LBB24_2:
6435; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
6436; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6437; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
6438; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
6439; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6440; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6441; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6442; GFX8-NEXT:    v_mov_b32_e32 v2, s5
6443; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6444; GFX8-NEXT:    v_mov_b32_e32 v2, s4
6445; GFX8-NEXT:    s_mov_b32 s2, -1
6446; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6447; GFX8-NEXT:    s_mov_b32 s3, 0xf000
6448; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6449; GFX8-NEXT:    s_endpgm
6450;
6451; GFX9-LABEL: umin_i64_constant:
6452; GFX9:       ; %bb.0: ; %entry
6453; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6454; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6455; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6456; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6457; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
6458; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6459; GFX9-NEXT:    s_cbranch_execz .LBB24_2
6460; GFX9-NEXT:  ; %bb.1:
6461; GFX9-NEXT:    v_mov_b32_e32 v0, 5
6462; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6463; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6465; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6466; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6467; GFX9-NEXT:  .LBB24_2:
6468; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6469; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6470; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
6471; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
6472; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6473; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6474; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
6475; GFX9-NEXT:    v_mov_b32_e32 v2, s5
6476; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6477; GFX9-NEXT:    v_mov_b32_e32 v2, s4
6478; GFX9-NEXT:    s_mov_b32 s2, -1
6479; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6480; GFX9-NEXT:    s_mov_b32 s3, 0xf000
6481; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6482; GFX9-NEXT:    s_endpgm
6483;
6484; GFX1064-LABEL: umin_i64_constant:
6485; GFX1064:       ; %bb.0: ; %entry
6486; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6487; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6488; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6489; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6490; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
6491; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6492; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
6493; GFX1064-NEXT:  ; %bb.1:
6494; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
6495; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
6496; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
6497; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6498; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
6499; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6500; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6501; GFX1064-NEXT:    buffer_gl0_inv
6502; GFX1064-NEXT:  .LBB24_2:
6503; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
6504; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
6505; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
6506; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
6507; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6508; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6509; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6510; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6511; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6512; GFX1064-NEXT:    s_mov_b32 s2, -1
6513; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
6514; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
6515; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6516; GFX1064-NEXT:    s_endpgm
6517;
6518; GFX1032-LABEL: umin_i64_constant:
6519; GFX1032:       ; %bb.0: ; %entry
6520; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6521; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6522; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6523; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
6524; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6525; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
6526; GFX1032-NEXT:  ; %bb.1:
6527; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
6528; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
6529; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
6530; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6531; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
6532; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6533; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6534; GFX1032-NEXT:    buffer_gl0_inv
6535; GFX1032-NEXT:  .LBB24_2:
6536; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
6537; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6538; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
6539; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
6540; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6541; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6542; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6543; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6544; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6545; GFX1032-NEXT:    s_mov_b32 s2, -1
6546; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
6547; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
6548; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6549; GFX1032-NEXT:    s_endpgm
6550;
6551; GFX1164-LABEL: umin_i64_constant:
6552; GFX1164:       ; %bb.0: ; %entry
6553; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6554; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6555; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6556; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6557; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
6558; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6559; GFX1164-NEXT:    s_cbranch_execz .LBB24_2
6560; GFX1164-NEXT:  ; %bb.1:
6561; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
6562; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
6563; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
6564; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6565; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
6566; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6567; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6568; GFX1164-NEXT:    buffer_gl0_inv
6569; GFX1164-NEXT:  .LBB24_2:
6570; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
6571; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
6572; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
6573; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
6574; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
6575; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
6576; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
6577; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
6578; GFX1164-NEXT:    s_mov_b32 s2, -1
6579; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
6580; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
6581; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6582; GFX1164-NEXT:    s_endpgm
6583;
6584; GFX1132-LABEL: umin_i64_constant:
6585; GFX1132:       ; %bb.0: ; %entry
6586; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
6587; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6588; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6589; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
6590; GFX1132-NEXT:    s_and_saveexec_b32 s2, vcc_lo
6591; GFX1132-NEXT:    s_cbranch_execz .LBB24_2
6592; GFX1132-NEXT:  ; %bb.1:
6593; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
6594; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
6595; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
6596; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6597; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
6598; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
6599; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6600; GFX1132-NEXT:    buffer_gl0_inv
6601; GFX1132-NEXT:  .LBB24_2:
6602; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
6603; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
6604; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
6605; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
6606; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
6607; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
6608; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
6609; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
6610; GFX1132-NEXT:    s_mov_b32 s2, -1
6611; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
6612; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
6613; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6614; GFX1132-NEXT:    s_endpgm
6615entry:
6616  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
6617  store i64 %old, i64 addrspace(1)* %out
6618  ret void
6619}
6620