1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
; Intrinsic called by @add_i32_varying to obtain an operand that differs
; from lane to lane.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
; addrspace(3) globals with undef initializers. @local_var32 is the target of
; every atomicrmw in the kernels below.
; NOTE(review): @local_var64 is not referenced in the part of the file visible
; here - presumably it is the target of i64 variants further down; confirm.
10@local_var32 = addrspace(3) global i32 undef, align 4
11@local_var64 = addrspace(3) global i64 undef, align 8
12
13; Show what the atomic optimization pass will do for local pointers.
14
; add_i32_constant - atomically adds the constant 5 to @local_var32 with
; acq_rel ordering and stores the value returned by the atomic to %out.
;
; What the autogenerated checks below show for every check prefix:
;   * the lane's index within the active mask is computed with v_mbcnt_*;
;   * only the lane whose index is 0 (v_cmp_eq_u32 ... 0 + s_and_saveexec)
;     issues a single ds_add_rtn_u32, and its operand is the number of active
;     lanes times 5 (s_bcnt1_* followed by v_mul_u32_u24 ... 5);
;   * after exec is restored, each lane rebuilds its own result as
;     v_readfirstlane_b32(returned value) + lane index * 5 (v_mad_u32_u24).
;
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file); regenerate them rather than editing by hand.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
16;
17;
18; GFX7LESS-LABEL: add_i32_constant:
19; GFX7LESS:       ; %bb.0: ; %entry
20; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
21; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
22; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
26; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
27; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
28; GFX7LESS-NEXT:  ; %bb.1:
29; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
30; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
31; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
32; GFX7LESS-NEXT:    s_mov_b32 m0, -1
33; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
35; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX7LESS-NEXT:  BB0_2:
37; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
40; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
41; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
42; GFX7LESS-NEXT:    s_mov_b32 s2, -1
43; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
44; GFX7LESS-NEXT:    s_endpgm
45;
46; GFX8-LABEL: add_i32_constant:
47; GFX8:       ; %bb.0: ; %entry
48; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
49; GFX8-NEXT:    s_mov_b64 s[2:3], exec
50; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
51; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
52; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX8-NEXT:    ; implicit-def: $vgpr1
54; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
55; GFX8-NEXT:    s_cbranch_execz BB0_2
56; GFX8-NEXT:  ; %bb.1:
57; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
58; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
59; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
60; GFX8-NEXT:    s_mov_b32 m0, -1
61; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX8-NEXT:    ds_add_rtn_u32 v1, v2, v1
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:  BB0_2:
65; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
66; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
68; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
69; GFX8-NEXT:    s_mov_b32 s3, 0xf000
70; GFX8-NEXT:    s_mov_b32 s2, -1
71; GFX8-NEXT:    s_nop 1
72; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
73; GFX8-NEXT:    s_endpgm
74;
75; GFX9-LABEL: add_i32_constant:
76; GFX9:       ; %bb.0: ; %entry
77; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
78; GFX9-NEXT:    s_mov_b64 s[2:3], exec
79; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
80; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
81; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
82; GFX9-NEXT:    ; implicit-def: $vgpr1
83; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
84; GFX9-NEXT:    s_cbranch_execz BB0_2
85; GFX9-NEXT:  ; %bb.1:
86; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
87; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
88; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
91; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-NEXT:  BB0_2:
93; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
96; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
97; GFX9-NEXT:    s_mov_b32 s3, 0xf000
98; GFX9-NEXT:    s_mov_b32 s2, -1
99; GFX9-NEXT:    s_nop 1
100; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
101; GFX9-NEXT:    s_endpgm
102;
103; GFX1064-LABEL: add_i32_constant:
104; GFX1064:       ; %bb.0: ; %entry
105; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
106; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
107; GFX1064-NEXT:    ; implicit-def: $vgpr1
108; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
109; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
110; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
111; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
112; GFX1064-NEXT:    s_cbranch_execz BB0_2
113; GFX1064-NEXT:  ; %bb.1:
114; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
115; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
116; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
117; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
118; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
119; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
120; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX1064-NEXT:    buffer_gl0_inv
122; GFX1064-NEXT:  BB0_2:
123; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
124; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
125; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
126; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
127; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
128; GFX1064-NEXT:    s_mov_b32 s2, -1
129; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX1064-NEXT:    s_nop 0
131; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
132; GFX1064-NEXT:    s_endpgm
133;
134; GFX1032-LABEL: add_i32_constant:
135; GFX1032:       ; %bb.0: ; %entry
136; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
137; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
138; GFX1032-NEXT:    ; implicit-def: $vgpr1
139; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
140; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
141; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
142; GFX1032-NEXT:    s_cbranch_execz BB0_2
143; GFX1032-NEXT:  ; %bb.1:
144; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
145; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
146; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
147; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
148; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
149; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
150; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX1032-NEXT:    buffer_gl0_inv
152; GFX1032-NEXT:  BB0_2:
153; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
154; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
155; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
156; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
157; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
158; GFX1032-NEXT:    s_mov_b32 s2, -1
159; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX1032-NEXT:    s_nop 0
161; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
162; GFX1032-NEXT:    s_endpgm
163entry:
; %old receives the value the atomicrmw returns; it is what gets stored to %out.
164  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
165  store i32 %old, i32 addrspace(1)* %out
166  ret void
167}
168
; add_i32_uniform - atomically adds the kernel argument %additive to
; @local_var32 with acq_rel ordering and stores the value returned by the
; atomic to %out. In the checks the argument is fetched with s_load_dword into
; a scalar register, i.e. it is one value shared by all lanes.
;
; What the autogenerated checks below show for every check prefix:
;   * the lane's index within the active mask is computed with v_mbcnt_*;
;   * only the lane whose index is 0 issues a single ds_add_rtn_u32, and its
;     operand is argument * number of active lanes (s_bcnt1_* + s_mul_i32);
;   * after exec is restored, each lane rebuilds its own result as
;     v_readfirstlane_b32(returned value) + argument * lane index
;     (v_mul_lo_u32 followed by a vector add).
;
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file); regenerate them rather than editing by hand.
169define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
170;
171;
172; GFX7LESS-LABEL: add_i32_uniform:
173; GFX7LESS:       ; %bb.0: ; %entry
174; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
175; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
176; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
177; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
178; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
179; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
180; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
181; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
182; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
183; GFX7LESS-NEXT:  ; %bb.1:
184; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
185; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
187; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
188; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
189; GFX7LESS-NEXT:    s_mov_b32 m0, -1
190; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
192; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX7LESS-NEXT:  BB1_2:
194; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
195; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
197; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
198; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
199; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
200; GFX7LESS-NEXT:    s_mov_b32 s6, -1
201; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
202; GFX7LESS-NEXT:    s_endpgm
203;
204; GFX8-LABEL: add_i32_uniform:
205; GFX8:       ; %bb.0: ; %entry
206; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
207; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
208; GFX8-NEXT:    s_mov_b64 s[2:3], exec
209; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
210; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
211; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
212; GFX8-NEXT:    ; implicit-def: $vgpr1
213; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
214; GFX8-NEXT:    s_cbranch_execz BB1_2
215; GFX8-NEXT:  ; %bb.1:
216; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX8-NEXT:    s_mul_i32 s1, s0, s1
219; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
220; GFX8-NEXT:    v_mov_b32_e32 v2, s1
221; GFX8-NEXT:    s_mov_b32 m0, -1
222; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
224; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX8-NEXT:  BB1_2:
226; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
227; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
229; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
230; GFX8-NEXT:    s_mov_b32 s7, 0xf000
231; GFX8-NEXT:    s_mov_b32 s6, -1
232; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
233; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
234; GFX8-NEXT:    s_endpgm
235;
236; GFX9-LABEL: add_i32_uniform:
237; GFX9:       ; %bb.0: ; %entry
238; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
239; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
240; GFX9-NEXT:    s_mov_b64 s[6:7], exec
241; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
242; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
243; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
244; GFX9-NEXT:    ; implicit-def: $vgpr1
245; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
246; GFX9-NEXT:    s_cbranch_execz BB1_2
247; GFX9-NEXT:  ; %bb.1:
248; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
249; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX9-NEXT:    s_mul_i32 s3, s2, s3
251; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
252; GFX9-NEXT:    v_mov_b32_e32 v2, s3
253; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX9-NEXT:  BB1_2:
257; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
260; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
261; GFX9-NEXT:    s_mov_b32 s7, 0xf000
262; GFX9-NEXT:    s_mov_b32 s6, -1
263; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
264; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
265; GFX9-NEXT:    s_endpgm
266;
267; GFX1064-LABEL: add_i32_uniform:
268; GFX1064:       ; %bb.0: ; %entry
269; GFX1064-NEXT:    s_clause 0x1
270; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
271; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
272; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
273; GFX1064-NEXT:    ; implicit-def: $vgpr1
274; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
275; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
276; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
277; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
278; GFX1064-NEXT:    s_cbranch_execz BB1_2
279; GFX1064-NEXT:  ; %bb.1:
280; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
281; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
282; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
284; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
285; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
286; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
287; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
288; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX1064-NEXT:    buffer_gl0_inv
290; GFX1064-NEXT:  BB1_2:
291; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
292; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
293; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
295; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
296; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
297; GFX1064-NEXT:    s_mov_b32 s6, -1
298; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
299; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
300; GFX1064-NEXT:    s_endpgm
301;
302; GFX1032-LABEL: add_i32_uniform:
303; GFX1032:       ; %bb.0: ; %entry
304; GFX1032-NEXT:    s_clause 0x1
305; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
306; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
307; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
308; GFX1032-NEXT:    ; implicit-def: $vgpr1
309; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
310; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
311; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
312; GFX1032-NEXT:    s_cbranch_execz BB1_2
313; GFX1032-NEXT:  ; %bb.1:
314; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
315; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
316; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
317; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
318; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
319; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
320; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
321; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
322; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX1032-NEXT:    buffer_gl0_inv
324; GFX1032-NEXT:  BB1_2:
325; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
326; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
327; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
329; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
330; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
331; GFX1032-NEXT:    s_mov_b32 s6, -1
332; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
333; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
334; GFX1032-NEXT:    s_endpgm
335entry:
; %old receives the value the atomicrmw returns; it is what gets stored to %out.
336  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
337  store i32 %old, i32 addrspace(1)* %out
338  ret void
339}
340
; add_i32_varying - atomically adds the result of llvm.amdgcn.workitem.id.x()
; to @local_var32 with acq_rel ordering and stores the value returned by the
; atomic to %out. Unlike a constant or kernel-argument operand, this operand
; lives in a vector register (v0) and can differ between lanes.
;
; What the autogenerated checks below show:
;   * for the GFX7LESS prefix, no rewrite - a plain per-lane ds_add_rtn_u32
;     with no v_mbcnt_* and no exec-mask branch;
;   * for the GFX8 and GFX9 prefixes, inactive lanes are zeroed, then a scan is
;     built from v_add_u32_dpp steps (row_shr 1/2/4/8, then row_bcast 15 and
;     31); lane 63 is read with v_readlane_b32 and used as the operand of a
;     single ds_add_rtn_u32 issued by the lane whose v_mbcnt index is 0, and a
;     wave_shr:1 v_mov_b32_dpp supplies the per-lane offset that is added to
;     the v_readfirstlane_b32 result;
;   * for the GFX1064 and GFX1032 prefixes, the scan uses v_add_nc_u32_dpp
;     row_shr steps plus v_permlanex16_b32, and the shifted per-lane offsets
;     are patched with v_readlane_b32 / v_writelane_b32 before a single
;     ds_add_rtn_u32 and the final v_readfirstlane_b32 + add.
;
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; at the top of the file); regenerate them rather than editing by hand.
341define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
342;
343;
344; GFX7LESS-LABEL: add_i32_varying:
345; GFX7LESS:       ; %bb.0: ; %entry
346; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
347; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
348; GFX7LESS-NEXT:    s_mov_b32 m0, -1
349; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
351; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
353; GFX7LESS-NEXT:    s_mov_b32 s2, -1
354; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
355; GFX7LESS-NEXT:    s_endpgm
356;
357; GFX8-LABEL: add_i32_varying:
358; GFX8:       ; %bb.0: ; %entry
359; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
360; GFX8-NEXT:    v_mov_b32_e32 v2, v0
361; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
362; GFX8-NEXT:    v_mov_b32_e32 v1, 0
363; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
364; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
365; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
366; GFX8-NEXT:    s_not_b64 exec, exec
367; GFX8-NEXT:    v_mov_b32_e32 v2, 0
368; GFX8-NEXT:    s_not_b64 exec, exec
369; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
370; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
371; GFX8-NEXT:    s_nop 1
372; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
373; GFX8-NEXT:    s_nop 1
374; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
375; GFX8-NEXT:    s_nop 1
376; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
377; GFX8-NEXT:    s_nop 1
378; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
379; GFX8-NEXT:    s_nop 1
380; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
381; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
382; GFX8-NEXT:    s_nop 0
383; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
384; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
385; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
386; GFX8-NEXT:    ; implicit-def: $vgpr0
387; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
388; GFX8-NEXT:    s_cbranch_execz BB2_2
389; GFX8-NEXT:  ; %bb.1:
390; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
391; GFX8-NEXT:    v_mov_b32_e32 v3, s4
392; GFX8-NEXT:    s_mov_b32 m0, -1
393; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
395; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX8-NEXT:  BB2_2:
397; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
400; GFX8-NEXT:    v_mov_b32_e32 v0, v1
401; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
402; GFX8-NEXT:    s_mov_b32 s3, 0xf000
403; GFX8-NEXT:    s_mov_b32 s2, -1
404; GFX8-NEXT:    s_nop 0
405; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
406; GFX8-NEXT:    s_endpgm
407;
408; GFX9-LABEL: add_i32_varying:
409; GFX9:       ; %bb.0: ; %entry
410; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
411; GFX9-NEXT:    v_mov_b32_e32 v2, v0
412; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
413; GFX9-NEXT:    v_mov_b32_e32 v1, 0
414; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
415; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
416; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
417; GFX9-NEXT:    s_not_b64 exec, exec
418; GFX9-NEXT:    v_mov_b32_e32 v2, 0
419; GFX9-NEXT:    s_not_b64 exec, exec
420; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
421; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
422; GFX9-NEXT:    s_nop 1
423; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
424; GFX9-NEXT:    s_nop 1
425; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
426; GFX9-NEXT:    s_nop 1
427; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
428; GFX9-NEXT:    s_nop 1
429; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
430; GFX9-NEXT:    s_nop 1
431; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
432; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
433; GFX9-NEXT:    s_nop 0
434; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
435; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
436; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
437; GFX9-NEXT:    ; implicit-def: $vgpr0
438; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
439; GFX9-NEXT:    s_cbranch_execz BB2_2
440; GFX9-NEXT:  ; %bb.1:
441; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
442; GFX9-NEXT:    v_mov_b32_e32 v3, s4
443; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
445; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX9-NEXT:  BB2_2:
447; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
450; GFX9-NEXT:    v_mov_b32_e32 v0, v1
451; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
452; GFX9-NEXT:    s_mov_b32 s3, 0xf000
453; GFX9-NEXT:    s_mov_b32 s2, -1
454; GFX9-NEXT:    s_nop 0
455; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
456; GFX9-NEXT:    s_endpgm
457;
458; GFX1064-LABEL: add_i32_varying:
459; GFX1064:       ; %bb.0: ; %entry
460; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
461; GFX1064-NEXT:    s_not_b64 exec, exec
462; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
463; GFX1064-NEXT:    s_not_b64 exec, exec
464; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
465; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
466; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
467; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
468; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
469; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
470; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
471; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
472; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
473; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
474; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
475; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
476; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
477; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
478; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
479; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
480; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
481; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
482; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
483; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
484; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
485; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
486; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
487; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
488; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
489; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
490; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
491; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
492; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
493; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
494; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
495; GFX1064-NEXT:    s_mov_b32 s2, -1
496; GFX1064-NEXT:    ; implicit-def: $vgpr0
497; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
498; GFX1064-NEXT:    s_cbranch_execz BB2_2
499; GFX1064-NEXT:  ; %bb.1:
500; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
501; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
502; GFX1064-NEXT:    s_mov_b32 s3, s7
503; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
504; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
505; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
506; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX1064-NEXT:    buffer_gl0_inv
508; GFX1064-NEXT:  BB2_2:
509; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
510; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
511; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
512; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
513; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
514; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
515; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX1064-NEXT:    s_nop 0
517; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
518; GFX1064-NEXT:    s_endpgm
519;
520; GFX1032-LABEL: add_i32_varying:
521; GFX1032:       ; %bb.0: ; %entry
522; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
523; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
524; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
525; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
526; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
527; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
528; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
529; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
530; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
531; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
532; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
533; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
534; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
535; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
536; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
537; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
538; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
539; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
540; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
541; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
542; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
543; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
544; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
545; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
546; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
547; GFX1032-NEXT:    s_mov_b32 s2, -1
548; GFX1032-NEXT:    ; implicit-def: $vgpr0
549; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
550; GFX1032-NEXT:    s_cbranch_execz BB2_2
551; GFX1032-NEXT:  ; %bb.1:
552; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
553; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
554; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
555; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
556; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
557; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX1032-NEXT:    buffer_gl0_inv
559; GFX1032-NEXT:  BB2_2:
560; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
561; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
562; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
563; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
564; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
565; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
566; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX1032-NEXT:    s_nop 0
568; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
569; GFX1032-NEXT:    s_endpgm
570entry:
; %lane is the per-lane operand; %old receives the value the atomicrmw returns
; and is what gets stored to %out.
571  %lane = call i32 @llvm.amdgcn.workitem.id.x()
572  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
573  store i32 %old, i32 addrspace(1)* %out
574  ret void
575}
576
577define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
578;
579;
580; GFX7LESS-LABEL: add_i32_varying_gfx1032:
581; GFX7LESS:       ; %bb.0: ; %entry
582; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
583; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
584; GFX7LESS-NEXT:    s_mov_b32 m0, -1
585; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
587; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
589; GFX7LESS-NEXT:    s_mov_b32 s2, -1
590; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
591; GFX7LESS-NEXT:    s_endpgm
592;
593; GFX8-LABEL: add_i32_varying_gfx1032:
594; GFX8:       ; %bb.0: ; %entry
595; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
596; GFX8-NEXT:    v_mov_b32_e32 v2, v0
597; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
598; GFX8-NEXT:    v_mov_b32_e32 v1, 0
599; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
600; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
601; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
602; GFX8-NEXT:    s_not_b64 exec, exec
603; GFX8-NEXT:    v_mov_b32_e32 v2, 0
604; GFX8-NEXT:    s_not_b64 exec, exec
605; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
606; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
607; GFX8-NEXT:    s_nop 1
608; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
609; GFX8-NEXT:    s_nop 1
610; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
611; GFX8-NEXT:    s_nop 1
612; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
613; GFX8-NEXT:    s_nop 1
614; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
615; GFX8-NEXT:    s_nop 1
616; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
617; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
618; GFX8-NEXT:    s_nop 0
619; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
620; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
621; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
622; GFX8-NEXT:    ; implicit-def: $vgpr0
623; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
624; GFX8-NEXT:    s_cbranch_execz BB3_2
625; GFX8-NEXT:  ; %bb.1:
626; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
627; GFX8-NEXT:    v_mov_b32_e32 v3, s4
628; GFX8-NEXT:    s_mov_b32 m0, -1
629; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
631; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX8-NEXT:  BB3_2:
633; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
634; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
636; GFX8-NEXT:    v_mov_b32_e32 v0, v1
637; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
638; GFX8-NEXT:    s_mov_b32 s3, 0xf000
639; GFX8-NEXT:    s_mov_b32 s2, -1
640; GFX8-NEXT:    s_nop 0
641; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
642; GFX8-NEXT:    s_endpgm
643;
644; GFX9-LABEL: add_i32_varying_gfx1032:
645; GFX9:       ; %bb.0: ; %entry
646; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
647; GFX9-NEXT:    v_mov_b32_e32 v2, v0
648; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
649; GFX9-NEXT:    v_mov_b32_e32 v1, 0
650; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
651; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
652; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
653; GFX9-NEXT:    s_not_b64 exec, exec
654; GFX9-NEXT:    v_mov_b32_e32 v2, 0
655; GFX9-NEXT:    s_not_b64 exec, exec
656; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
657; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
658; GFX9-NEXT:    s_nop 1
659; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
660; GFX9-NEXT:    s_nop 1
661; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
662; GFX9-NEXT:    s_nop 1
663; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
664; GFX9-NEXT:    s_nop 1
665; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
666; GFX9-NEXT:    s_nop 1
667; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
668; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
669; GFX9-NEXT:    s_nop 0
670; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
671; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
672; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
673; GFX9-NEXT:    ; implicit-def: $vgpr0
674; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
675; GFX9-NEXT:    s_cbranch_execz BB3_2
676; GFX9-NEXT:  ; %bb.1:
677; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
678; GFX9-NEXT:    v_mov_b32_e32 v3, s4
679; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
681; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX9-NEXT:  BB3_2:
683; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
685; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
686; GFX9-NEXT:    v_mov_b32_e32 v0, v1
687; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
688; GFX9-NEXT:    s_mov_b32 s3, 0xf000
689; GFX9-NEXT:    s_mov_b32 s2, -1
690; GFX9-NEXT:    s_nop 0
691; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
692; GFX9-NEXT:    s_endpgm
693;
694; GFX1064-LABEL: add_i32_varying_gfx1032:
695; GFX1064:       ; %bb.0: ; %entry
696; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
697; GFX1064-NEXT:    s_not_b64 exec, exec
698; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
699; GFX1064-NEXT:    s_not_b64 exec, exec
700; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
701; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
702; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
703; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
704; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
705; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
706; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
707; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
708; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
709; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
710; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
711; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
712; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
713; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
714; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
715; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
716; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
717; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
718; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
719; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
720; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
721; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
722; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
723; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
724; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
725; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
726; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
727; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
728; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
729; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
730; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
731; GFX1064-NEXT:    s_mov_b32 s2, -1
732; GFX1064-NEXT:    ; implicit-def: $vgpr0
733; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
734; GFX1064-NEXT:    s_cbranch_execz BB3_2
735; GFX1064-NEXT:  ; %bb.1:
736; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
737; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
738; GFX1064-NEXT:    s_mov_b32 s3, s7
739; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
740; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
741; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
742; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX1064-NEXT:    buffer_gl0_inv
744; GFX1064-NEXT:  BB3_2:
745; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
746; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
747; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
748; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
749; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
750; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
751; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
752; GFX1064-NEXT:    s_nop 0
753; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
754; GFX1064-NEXT:    s_endpgm
755;
756; GFX1032-LABEL: add_i32_varying_gfx1032:
757; GFX1032:       ; %bb.0: ; %entry
758; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
759; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
760; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
761; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
762; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
763; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
764; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
765; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
766; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
767; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
768; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
769; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
770; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
771; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
772; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
773; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
774; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
775; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
776; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
777; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
778; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
779; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
780; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
781; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
782; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
783; GFX1032-NEXT:    s_mov_b32 s2, -1
784; GFX1032-NEXT:    ; implicit-def: $vgpr0
785; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
786; GFX1032-NEXT:    s_cbranch_execz BB3_2
787; GFX1032-NEXT:  ; %bb.1:
788; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
789; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
790; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
791; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
792; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
793; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX1032-NEXT:    buffer_gl0_inv
795; GFX1032-NEXT:  BB3_2:
796; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
797; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
798; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
799; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
800; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
801; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
802; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX1032-NEXT:    s_nop 0
804; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
805; GFX1032-NEXT:    s_endpgm
806entry:
807  %lane = call i32 @llvm.amdgcn.workitem.id.x()
808  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
809  store i32 %old, i32 addrspace(1)* %out
810  ret void
811}
812
813define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
814; Adds each lane's workitem.id.x to @local_var32 with an acq_rel atomicrmw add and stores the returned old value to %out.
815; In the checks below GFX7LESS issues ds_add_rtn_u32 directly, while the GFX8 and later runs first combine lanes (DPP, readlane and writelane) and guard the DS atomic with v_cmp_eq plus s_and_saveexec.
816; GFX7LESS-LABEL: add_i32_varying_gfx1064:
817; GFX7LESS:       ; %bb.0: ; %entry
818; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
819; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
820; GFX7LESS-NEXT:    s_mov_b32 m0, -1
821; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
823; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
824; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
825; GFX7LESS-NEXT:    s_mov_b32 s2, -1
826; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
827; GFX7LESS-NEXT:    s_endpgm
828;
829; GFX8-LABEL: add_i32_varying_gfx1064:
830; GFX8:       ; %bb.0: ; %entry
831; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
832; GFX8-NEXT:    v_mov_b32_e32 v2, v0
833; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
834; GFX8-NEXT:    v_mov_b32_e32 v1, 0
835; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
836; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
837; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
838; GFX8-NEXT:    s_not_b64 exec, exec
839; GFX8-NEXT:    v_mov_b32_e32 v2, 0
840; GFX8-NEXT:    s_not_b64 exec, exec
841; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
842; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
843; GFX8-NEXT:    s_nop 1
844; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
845; GFX8-NEXT:    s_nop 1
846; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
847; GFX8-NEXT:    s_nop 1
848; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
849; GFX8-NEXT:    s_nop 1
850; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
851; GFX8-NEXT:    s_nop 1
852; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
853; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
854; GFX8-NEXT:    s_nop 0
855; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
856; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
857; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
858; GFX8-NEXT:    ; implicit-def: $vgpr0
859; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
860; GFX8-NEXT:    s_cbranch_execz BB4_2
861; GFX8-NEXT:  ; %bb.1:
862; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
863; GFX8-NEXT:    v_mov_b32_e32 v3, s4
864; GFX8-NEXT:    s_mov_b32 m0, -1
865; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
867; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
868; GFX8-NEXT:  BB4_2:
869; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
870; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
872; GFX8-NEXT:    v_mov_b32_e32 v0, v1
873; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
874; GFX8-NEXT:    s_mov_b32 s3, 0xf000
875; GFX8-NEXT:    s_mov_b32 s2, -1
876; GFX8-NEXT:    s_nop 0
877; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
878; GFX8-NEXT:    s_endpgm
879;
880; GFX9-LABEL: add_i32_varying_gfx1064:
881; GFX9:       ; %bb.0: ; %entry
882; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
883; GFX9-NEXT:    v_mov_b32_e32 v2, v0
884; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
885; GFX9-NEXT:    v_mov_b32_e32 v1, 0
886; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
887; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
888; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
889; GFX9-NEXT:    s_not_b64 exec, exec
890; GFX9-NEXT:    v_mov_b32_e32 v2, 0
891; GFX9-NEXT:    s_not_b64 exec, exec
892; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
893; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
894; GFX9-NEXT:    s_nop 1
895; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
896; GFX9-NEXT:    s_nop 1
897; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
898; GFX9-NEXT:    s_nop 1
899; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
900; GFX9-NEXT:    s_nop 1
901; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
902; GFX9-NEXT:    s_nop 1
903; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
904; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
905; GFX9-NEXT:    s_nop 0
906; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
907; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
908; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
909; GFX9-NEXT:    ; implicit-def: $vgpr0
910; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
911; GFX9-NEXT:    s_cbranch_execz BB4_2
912; GFX9-NEXT:  ; %bb.1:
913; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
914; GFX9-NEXT:    v_mov_b32_e32 v3, s4
915; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX9-NEXT:  BB4_2:
919; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
922; GFX9-NEXT:    v_mov_b32_e32 v0, v1
923; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
924; GFX9-NEXT:    s_mov_b32 s3, 0xf000
925; GFX9-NEXT:    s_mov_b32 s2, -1
926; GFX9-NEXT:    s_nop 0
927; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
928; GFX9-NEXT:    s_endpgm
929;
930; GFX1064-LABEL: add_i32_varying_gfx1064:
931; GFX1064:       ; %bb.0: ; %entry
932; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
933; GFX1064-NEXT:    s_not_b64 exec, exec
934; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
935; GFX1064-NEXT:    s_not_b64 exec, exec
936; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
937; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
938; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
939; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
940; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
941; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
942; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
943; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
944; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
945; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
946; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
947; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
948; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
949; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
950; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
951; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
952; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
953; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
954; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
955; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
956; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
957; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
958; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
959; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
960; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
961; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
962; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
963; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
964; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
965; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
966; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
967; GFX1064-NEXT:    s_mov_b32 s2, -1
968; GFX1064-NEXT:    ; implicit-def: $vgpr0
969; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
970; GFX1064-NEXT:    s_cbranch_execz BB4_2
971; GFX1064-NEXT:  ; %bb.1:
972; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
973; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
974; GFX1064-NEXT:    s_mov_b32 s3, s7
975; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
976; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
977; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
978; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX1064-NEXT:    buffer_gl0_inv
980; GFX1064-NEXT:  BB4_2:
981; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
982; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
983; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
984; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
985; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
986; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
987; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX1064-NEXT:    s_nop 0
989; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
990; GFX1064-NEXT:    s_endpgm
991;
992; GFX1032-LABEL: add_i32_varying_gfx1064:
993; GFX1032:       ; %bb.0: ; %entry
994; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
995; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
996; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
997; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
998; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
999; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1000; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1001; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1002; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1003; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1004; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1005; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1006; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1007; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1008; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1009; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1010; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
1011; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
1012; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1013; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1014; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1015; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1016; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
1017; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1018; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1019; GFX1032-NEXT:    s_mov_b32 s2, -1
1020; GFX1032-NEXT:    ; implicit-def: $vgpr0
1021; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1022; GFX1032-NEXT:    s_cbranch_execz BB4_2
1023; GFX1032-NEXT:  ; %bb.1:
1024; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
1025; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
1026; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1027; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1028; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
1029; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX1032-NEXT:    buffer_gl0_inv
1031; GFX1032-NEXT:  BB4_2:
1032; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1033; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1034; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
1035; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1036; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1037; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1038; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1039; GFX1032-NEXT:    s_nop 0
1040; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1041; GFX1032-NEXT:    s_endpgm
1042entry:
1043  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1044  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
1045  store i32 %old, i32 addrspace(1)* %out
1046  ret void
1047}
1048
1049define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
1050; Atomically adds the constant 5 to @local_var64 with acq_rel ordering and stores the returned old i64 value to %out.
1051; In every run checked below, ds_add_rtn_u64 sits behind a v_mbcnt, v_cmp_eq and s_and_saveexec guard, with its operand formed as s_bcnt1 of the saved exec mask times 5.
1052; GFX7LESS-LABEL: add_i64_constant:
1053; GFX7LESS:       ; %bb.0: ; %entry
1054; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
1055; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1056; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1057; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
1058; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1059; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1060; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1061; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
1062; GFX7LESS-NEXT:  ; %bb.1:
1063; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1064; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1065; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1066; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1067; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1068; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1070; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX7LESS-NEXT:  BB5_2:
1072; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1073; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1074; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1075; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
1076; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1077; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1078; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1079; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1080; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
1081; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1082; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1083; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1084; GFX7LESS-NEXT:    s_endpgm
1085;
1086; GFX8-LABEL: add_i64_constant:
1087; GFX8:       ; %bb.0: ; %entry
1088; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1089; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1090; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1091; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1092; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1093; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1094; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1095; GFX8-NEXT:    s_cbranch_execz BB5_2
1096; GFX8-NEXT:  ; %bb.1:
1097; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1098; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1099; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1100; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1101; GFX8-NEXT:    s_mov_b32 m0, -1
1102; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1104; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1105; GFX8-NEXT:  BB5_2:
1106; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1107; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1109; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
1110; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1111; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1112; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1113; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1114; GFX8-NEXT:    s_mov_b32 s2, -1
1115; GFX8-NEXT:    s_nop 2
1116; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1117; GFX8-NEXT:    s_endpgm
1118;
1119; GFX9-LABEL: add_i64_constant:
1120; GFX9:       ; %bb.0: ; %entry
1121; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1122; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1123; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1124; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1125; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1126; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1127; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1128; GFX9-NEXT:    s_cbranch_execz BB5_2
1129; GFX9-NEXT:  ; %bb.1:
1130; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1131; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1132; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1133; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1134; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1135; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1136; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX9-NEXT:  BB5_2:
1138; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1141; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
1142; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1143; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1144; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
1145; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1146; GFX9-NEXT:    s_mov_b32 s2, -1
1147; GFX9-NEXT:    s_nop 2
1148; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1149; GFX9-NEXT:    s_endpgm
1150;
1151; GFX1064-LABEL: add_i64_constant:
1152; GFX1064:       ; %bb.0: ; %entry
1153; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1154; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
1155; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1156; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
1157; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
1158; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1159; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1160; GFX1064-NEXT:    s_cbranch_execz BB5_2
1161; GFX1064-NEXT:  ; %bb.1:
1162; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
1163; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1164; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
1165; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
1166; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1168; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1169; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX1064-NEXT:    buffer_gl0_inv
1171; GFX1064-NEXT:  BB5_2:
1172; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1173; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1174; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1175; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
1176; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
1177; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1178; GFX1064-NEXT:    s_mov_b32 s2, -1
1179; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX1064-NEXT:    s_nop 1
1181; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1182; GFX1064-NEXT:    s_endpgm
1183;
1184; GFX1032-LABEL: add_i64_constant:
1185; GFX1032:       ; %bb.0: ; %entry
1186; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1187; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1188; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1189; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1190; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1191; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1192; GFX1032-NEXT:    s_cbranch_execz BB5_2
1193; GFX1032-NEXT:  ; %bb.1:
1194; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
1195; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1196; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
1197; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
1198; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1199; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1200; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1201; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1202; GFX1032-NEXT:    buffer_gl0_inv
1203; GFX1032-NEXT:  BB5_2:
1204; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1205; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1206; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1207; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
1208; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
1209; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1210; GFX1032-NEXT:    s_mov_b32 s2, -1
1211; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX1032-NEXT:    s_nop 1
1213; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1214; GFX1032-NEXT:    s_endpgm
1215entry:
1216  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
1217  store i64 %old, i64 addrspace(1)* %out
1218  ret void
1219}
1220
1221define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
1222;
1223;
1224; GFX7LESS-LABEL: add_i64_uniform:
1225; GFX7LESS:       ; %bb.0: ; %entry
1226; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1227; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1228; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1229; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1230; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1231; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1232; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1233; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1234; GFX7LESS-NEXT:  ; %bb.1:
1235; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1236; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1237; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1239; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1240; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
1241; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1242; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1243; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1244; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1245; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1247; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1248; GFX7LESS-NEXT:  BB6_2:
1249; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1250; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1251; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1252; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1253; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1254; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1255; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1256; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
1257; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
1258; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
1259; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
1260; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1261; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1262; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1263; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1264; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1265; GFX7LESS-NEXT:    s_endpgm
1266;
1267; GFX8-LABEL: add_i64_uniform:
1268; GFX8:       ; %bb.0: ; %entry
1269; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1270; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1271; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1272; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1273; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1274; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1275; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1276; GFX8-NEXT:    s_cbranch_execz BB6_2
1277; GFX8-NEXT:  ; %bb.1:
1278; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1279; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1280; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1281; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
1282; GFX8-NEXT:    s_mul_i32 s7, s3, s6
1283; GFX8-NEXT:    s_mul_i32 s6, s2, s6
1284; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1285; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1286; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1287; GFX8-NEXT:    s_mov_b32 m0, -1
1288; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1290; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX8-NEXT:  BB6_2:
1292; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1293; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX8-NEXT:    s_mov_b32 s4, s0
1295; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1296; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
1297; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
1298; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
1299; GFX8-NEXT:    s_mov_b32 s5, s1
1300; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1301; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1302; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1303; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1304; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1305; GFX8-NEXT:    s_mov_b32 s6, -1
1306; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1307; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1308; GFX8-NEXT:    s_endpgm
1309;
1310; GFX9-LABEL: add_i64_uniform:
1311; GFX9:       ; %bb.0: ; %entry
1312; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1313; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1314; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1315; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1316; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1317; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1318; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1319; GFX9-NEXT:    s_cbranch_execz BB6_2
1320; GFX9-NEXT:  ; %bb.1:
1321; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1323; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1324; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1325; GFX9-NEXT:    s_add_i32 s8, s8, s7
1326; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1327; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1328; GFX9-NEXT:    v_mov_b32_e32 v2, s8
1329; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX9-NEXT:  BB6_2:
1334; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1335; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
1337; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
1338; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1339; GFX9-NEXT:    s_mov_b32 s4, s0
1340; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1341; GFX9-NEXT:    s_mov_b32 s5, s1
1342; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
1343; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
1344; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1345; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
1346; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1347; GFX9-NEXT:    s_mov_b32 s6, -1
1348; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1349; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1350; GFX9-NEXT:    s_endpgm
1351;
1352; GFX1064-LABEL: add_i64_uniform:
1353; GFX1064:       ; %bb.0: ; %entry
1354; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1355; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1356; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
1357; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1358; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1359; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1360; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1361; GFX1064-NEXT:    s_cbranch_execz BB6_2
1362; GFX1064-NEXT:  ; %bb.1:
1363; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1364; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1365; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1366; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1367; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1368; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1369; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1370; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1371; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
1372; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1373; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1374; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1375; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX1064-NEXT:    buffer_gl0_inv
1377; GFX1064-NEXT:  BB6_2:
1378; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1379; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1380; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
1382; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
1383; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1384; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1385; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
1386; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1387; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1388; GFX1064-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
1389; GFX1064-NEXT:    s_mov_b32 s2, -1
1390; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
1391; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1392; GFX1064-NEXT:    s_endpgm
1393;
1394; GFX1032-LABEL: add_i64_uniform:
1395; GFX1032:       ; %bb.0: ; %entry
1396; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1397; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1398; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
1399; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1400; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1401; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1402; GFX1032-NEXT:    s_cbranch_execz BB6_2
1403; GFX1032-NEXT:  ; %bb.1:
1404; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1405; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
1406; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1407; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1408; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1409; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1410; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1411; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1412; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
1413; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1414; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1415; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
1416; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX1032-NEXT:    buffer_gl0_inv
1418; GFX1032-NEXT:  BB6_2:
1419; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1420; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1421; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
1423; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
1424; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1425; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1426; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
1427; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1428; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
1429; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
1430; GFX1032-NEXT:    s_mov_b32 s2, -1
1431; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
1432; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1433; GFX1032-NEXT:    s_endpgm
1434entry:
1435  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
1436  store i64 %old, i64 addrspace(1)* %out
1437  ret void
1438}
1439
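; NOTE: none of the RUN lines at the top of this file pass GCN in
; -check-prefixes, so the three GCN-NOT directives below are not evaluated by
; FileCheck. The per-target check sequences generated for add_i64_varying are
; what actually pin the absence of these instructions.
1440; GCN-NOT: v_mbcnt_lo_u32_b32
1441; GCN-NOT: v_mbcnt_hi_u32_b32
1442; GCN-NOT: s_bcnt1_i32_b64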
; add_i64_varying: every lane adds its zero-extended workitem.id.x to
; @local_var64 with an acq_rel atomicrmw and stores the returned old value to
; %out. For each check prefix the expected code below is one plain
; ds_add_rtn_u64 followed by the store, with no v_mbcnt / s_bcnt1 / saveexec
; sequence around it, i.e. no wave-level reduction is expected for this
; 64-bit per-lane operand.
1443define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
1444;
1445;
1446; GFX7LESS-LABEL: add_i64_varying:
1447; GFX7LESS:       ; %bb.0: ; %entry
1448; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1449; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1450; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1451; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1452; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1454; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1455; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1456; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1457; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1458; GFX7LESS-NEXT:    s_endpgm
1459;
1460; GFX8-LABEL: add_i64_varying:
1461; GFX8:       ; %bb.0: ; %entry
1462; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1463; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1464; GFX8-NEXT:    s_mov_b32 m0, -1
1465; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1466; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1467; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1468; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1469; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1470; GFX8-NEXT:    s_mov_b32 s2, -1
1471; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1472; GFX8-NEXT:    s_endpgm
1473;
1474; GFX9-LABEL: add_i64_varying:
1475; GFX9:       ; %bb.0: ; %entry
1476; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1477; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1478; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1479; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1481; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1483; GFX9-NEXT:    s_mov_b32 s2, -1
1484; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1485; GFX9-NEXT:    s_endpgm
1486;
1487; GFX1064-LABEL: add_i64_varying:
1488; GFX1064:       ; %bb.0: ; %entry
1489; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1490; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1491; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1492; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1493; GFX1064-NEXT:    s_mov_b32 s2, -1
1494; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1495; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1496; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1497; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX1064-NEXT:    buffer_gl0_inv
1499; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1500; GFX1064-NEXT:    s_endpgm
1501;
1502; GFX1032-LABEL: add_i64_varying:
1503; GFX1032:       ; %bb.0: ; %entry
1504; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1505; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
1506; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1507; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1508; GFX1032-NEXT:    s_mov_b32 s2, -1
1509; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1510; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1511; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
1512; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1513; GFX1032-NEXT:    buffer_gl0_inv
1514; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1515; GFX1032-NEXT:    s_endpgm
1516entry:
  ; The atomic operand is the work-item id widened to i64, so it is a
  ; per-lane value rather than a constant or a kernel argument.
1517  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1518  %zext = zext i32 %lane to i64
1519  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Store the value returned by the atomic so the result stays live.
1520  store i64 %old, i64 addrspace(1)* %out
1521  ret void
1522}
1523
; sub_i32_constant: atomically subtracts the constant 5 from @local_var32
; (acq_rel) and stores the returned old value to %out. Every check prefix
; below expects the same shape:
;   - v_mbcnt over a copy of exec, compared against 0, feeds s_and_saveexec so
;     that a single ds_sub_rtn_u32 is issued inside the guarded block;
;   - the operand of that ds_sub_rtn_u32 is s_bcnt1 of the copied exec mask
;     multiplied by 5;
;   - after exec is restored, the stored value is v_readfirstlane of the
;     atomic result minus 5 times the v_mbcnt value.
; The wave32 variant uses only v_mbcnt_lo and the 32-bit s_bcnt1 / saveexec
; forms.
1524define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
1525;
1526;
1527; GFX7LESS-LABEL: sub_i32_constant:
1528; GFX7LESS:       ; %bb.0: ; %entry
1529; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1530; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1531; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1532; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1533; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1534; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1535; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1536; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
1537; GFX7LESS-NEXT:  ; %bb.1:
1538; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1539; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1540; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v2, s2, 5
1541; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1542; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1544; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1545; GFX7LESS-NEXT:  BB8_2:
1546; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1547; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1549; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1550; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1551; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1552; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1553; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1554; GFX7LESS-NEXT:    s_endpgm
1555;
1556; GFX8-LABEL: sub_i32_constant:
1557; GFX8:       ; %bb.0: ; %entry
1558; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1559; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1560; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1561; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1562; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1563; GFX8-NEXT:    ; implicit-def: $vgpr1
1564; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1565; GFX8-NEXT:    s_cbranch_execz BB8_2
1566; GFX8-NEXT:  ; %bb.1:
1567; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1568; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1569; GFX8-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1570; GFX8-NEXT:    s_mov_b32 m0, -1
1571; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX8-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1573; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX8-NEXT:  BB8_2:
1575; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1576; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1577; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1578; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1579; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1580; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1581; GFX8-NEXT:    s_mov_b32 s2, -1
1582; GFX8-NEXT:    s_nop 0
1583; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1584; GFX8-NEXT:    s_endpgm
1585;
1586; GFX9-LABEL: sub_i32_constant:
1587; GFX9:       ; %bb.0: ; %entry
1588; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1589; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1590; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1591; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1592; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1593; GFX9-NEXT:    ; implicit-def: $vgpr1
1594; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1595; GFX9-NEXT:    s_cbranch_execz BB8_2
1596; GFX9-NEXT:  ; %bb.1:
1597; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1598; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1599; GFX9-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1600; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1602; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX9-NEXT:  BB8_2:
1604; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1605; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1606; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1607; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1608; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1609; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1610; GFX9-NEXT:    s_mov_b32 s2, -1
1611; GFX9-NEXT:    s_nop 0
1612; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1613; GFX9-NEXT:    s_endpgm
1614;
1615; GFX1064-LABEL: sub_i32_constant:
1616; GFX1064:       ; %bb.0: ; %entry
1617; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1618; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1619; GFX1064-NEXT:    ; implicit-def: $vgpr1
1620; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1621; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
1622; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1623; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1624; GFX1064-NEXT:    s_cbranch_execz BB8_2
1625; GFX1064-NEXT:  ; %bb.1:
1626; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1627; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1628; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1629; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1630; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1631; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1632; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1633; GFX1064-NEXT:    buffer_gl0_inv
1634; GFX1064-NEXT:  BB8_2:
1635; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1636; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1637; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1638; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1639; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1640; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1641; GFX1064-NEXT:    s_mov_b32 s2, -1
1642; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1643; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1644; GFX1064-NEXT:    s_endpgm
1645;
1646; GFX1032-LABEL: sub_i32_constant:
1647; GFX1032:       ; %bb.0: ; %entry
1648; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1649; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
1650; GFX1032-NEXT:    ; implicit-def: $vgpr1
1651; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1652; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1653; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
1654; GFX1032-NEXT:    s_cbranch_execz BB8_2
1655; GFX1032-NEXT:  ; %bb.1:
1656; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
1657; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
1658; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
1659; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1660; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1661; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
1662; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1663; GFX1032-NEXT:    buffer_gl0_inv
1664; GFX1032-NEXT:  BB8_2:
1665; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1666; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1667; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1668; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1669; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1670; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1671; GFX1032-NEXT:    s_mov_b32 s2, -1
1672; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1673; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1674; GFX1032-NEXT:    s_endpgm
1675entry:
  ; The operand is the immediate 5, identical for every lane; this is the
  ; literal that shows up in the v_mul_u32_u24 checks above.
1676  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  ; Store the value returned by the atomic so the result stays live.
1677  store i32 %old, i32 addrspace(1)* %out
1678  ret void
1679}
1680
; sub_i32_uniform: atomically subtracts the kernel argument %subitive from
; @local_var32 (acq_rel) and stores the returned old value to %out. Every
; check prefix below expects the same shape as the constant case, with the
; immediate replaced by the loaded argument:
;   - v_mbcnt over a copy of exec, compared against 0, feeds s_and_saveexec so
;     that a single ds_sub_rtn_u32 is issued inside the guarded block;
;   - the operand of that ds_sub_rtn_u32 is s_mul_i32 of the loaded argument
;     and s_bcnt1 of the copied exec mask;
;   - after exec is restored, the stored value is v_readfirstlane of the
;     atomic result minus v_mul_lo_u32 of the argument and the v_mbcnt value.
1681define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
1682;
1683;
1684; GFX7LESS-LABEL: sub_i32_uniform:
1685; GFX7LESS:       ; %bb.0: ; %entry
1686; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1687; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1688; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
1689; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1690; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1691; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1692; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1693; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1694; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1695; GFX7LESS-NEXT:  ; %bb.1:
1696; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1697; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1698; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1699; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1700; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
1701; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1702; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1704; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1705; GFX7LESS-NEXT:  BB9_2:
1706; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1707; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1708; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1709; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1710; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1711; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1712; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1713; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1714; GFX7LESS-NEXT:    s_endpgm
1715;
1716; GFX8-LABEL: sub_i32_uniform:
1717; GFX8:       ; %bb.0: ; %entry
1718; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1719; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
1720; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1721; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1722; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1723; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1724; GFX8-NEXT:    ; implicit-def: $vgpr1
1725; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1726; GFX8-NEXT:    s_cbranch_execz BB9_2
1727; GFX8-NEXT:  ; %bb.1:
1728; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1729; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1730; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1731; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1732; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1733; GFX8-NEXT:    s_mov_b32 m0, -1
1734; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1735; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1736; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX8-NEXT:  BB9_2:
1738; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1739; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1741; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1742; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1743; GFX8-NEXT:    s_mov_b32 s6, -1
1744; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1745; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1746; GFX8-NEXT:    s_endpgm
1747;
1748; GFX9-LABEL: sub_i32_uniform:
1749; GFX9:       ; %bb.0: ; %entry
1750; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1751; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1752; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1753; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1754; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1755; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1756; GFX9-NEXT:    ; implicit-def: $vgpr1
1757; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1758; GFX9-NEXT:    s_cbranch_execz BB9_2
1759; GFX9-NEXT:  ; %bb.1:
1760; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1762; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1763; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1764; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1765; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1767; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX9-NEXT:  BB9_2:
1769; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1770; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1771; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1772; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1773; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1774; GFX9-NEXT:    s_mov_b32 s6, -1
1775; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1776; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1777; GFX9-NEXT:    s_endpgm
1778;
1779; GFX1064-LABEL: sub_i32_uniform:
1780; GFX1064:       ; %bb.0: ; %entry
1781; GFX1064-NEXT:    s_clause 0x1
1782; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1783; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
1784; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1785; GFX1064-NEXT:    ; implicit-def: $vgpr1
1786; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1787; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1788; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1789; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1790; GFX1064-NEXT:    s_cbranch_execz BB9_2
1791; GFX1064-NEXT:  ; %bb.1:
1792; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
1793; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1794; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
1796; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
1797; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1798; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1799; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1800; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1801; GFX1064-NEXT:    buffer_gl0_inv
1802; GFX1064-NEXT:  BB9_2:
1803; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1804; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1805; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1806; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
1807; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1808; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1809; GFX1064-NEXT:    s_mov_b32 s6, -1
1810; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1811; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1812; GFX1064-NEXT:    s_endpgm
1813;
1814; GFX1032-LABEL: sub_i32_uniform:
1815; GFX1032:       ; %bb.0: ; %entry
1816; GFX1032-NEXT:    s_clause 0x1
1817; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1818; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
1819; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1820; GFX1032-NEXT:    ; implicit-def: $vgpr1
1821; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1822; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1823; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1824; GFX1032-NEXT:    s_cbranch_execz BB9_2
1825; GFX1032-NEXT:  ; %bb.1:
1826; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1827; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1828; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1830; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
1831; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1832; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1833; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
1834; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX1032-NEXT:    buffer_gl0_inv
1836; GFX1032-NEXT:  BB9_2:
1837; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1838; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1839; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1840; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1841; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1842; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1843; GFX1032-NEXT:    s_mov_b32 s6, -1
1844; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1845; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1846; GFX1032-NEXT:    s_endpgm
1847entry:
  ; The operand is the scalar kernel argument %subitive; the checks above
  ; load it with s_load_dword and keep it in an SGPR.
1848  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  ; Store the value returned by the atomic so the result stays live.
1849  store i32 %old, i32 addrspace(1)* %out
1850  ret void
1851}
1852
; sub_i32_varying: every lane atomically subtracts its workitem.id.x from
; @local_var32 (acq_rel) and stores the returned old value to %out.
;
; Expected code differs by prefix:
;   - The GFX7LESS prefix expects one plain ds_sub_rtn_u32 that takes v0
;     directly, with no v_mbcnt / saveexec sequence.
;   - The GFX8 and GFX9 prefixes expect a chain of v_add_u32_dpp steps
;     (row_shr 1/2/4/8, then row_bcast 15 and 31), a v_readlane of lane 63
;     that supplies the operand of a single guarded ds_sub_rtn_u32, and a
;     wave_shr:1 copy whose value is subtracted from v_readfirstlane of the
;     atomic result.
;   - The GFX1064 and GFX1032 prefixes expect v_add_nc_u32_dpp steps combined
;     with v_permlanex16 and v_readlane / v_writelane fix-ups; the operand of
;     the single guarded ds_sub_rtn_u32 comes from v_readlane of lane 63
;     (GFX1064) or lane 31 (GFX1032).
1853define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
1854;
1855;
1856; GFX7LESS-LABEL: sub_i32_varying:
1857; GFX7LESS:       ; %bb.0: ; %entry
1858; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1859; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
1860; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1861; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX7LESS-NEXT:    ds_sub_rtn_u32 v0, v1, v0
1863; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1864; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1865; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1866; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1867; GFX7LESS-NEXT:    s_endpgm
1868;
1869; GFX8-LABEL: sub_i32_varying:
1870; GFX8:       ; %bb.0: ; %entry
1871; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1872; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1873; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1874; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1875; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1876; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1877; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1878; GFX8-NEXT:    s_not_b64 exec, exec
1879; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1880; GFX8-NEXT:    s_not_b64 exec, exec
1881; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
1882; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1883; GFX8-NEXT:    s_nop 1
1884; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1885; GFX8-NEXT:    s_nop 1
1886; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1887; GFX8-NEXT:    s_nop 1
1888; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1889; GFX8-NEXT:    s_nop 1
1890; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1891; GFX8-NEXT:    s_nop 1
1892; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1893; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
1894; GFX8-NEXT:    s_nop 0
1895; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1896; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
1897; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1898; GFX8-NEXT:    ; implicit-def: $vgpr0
1899; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1900; GFX8-NEXT:    s_cbranch_execz BB10_2
1901; GFX8-NEXT:  ; %bb.1:
1902; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1903; GFX8-NEXT:    v_mov_b32_e32 v3, s4
1904; GFX8-NEXT:    s_mov_b32 m0, -1
1905; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1907; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1908; GFX8-NEXT:  BB10_2:
1909; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1910; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1911; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1912; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1913; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1914; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1915; GFX8-NEXT:    s_mov_b32 s2, -1
1916; GFX8-NEXT:    s_nop 0
1917; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1918; GFX8-NEXT:    s_endpgm
1919;
1920; GFX9-LABEL: sub_i32_varying:
1921; GFX9:       ; %bb.0: ; %entry
1922; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1923; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1924; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1925; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1926; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1927; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1928; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1929; GFX9-NEXT:    s_not_b64 exec, exec
1930; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1931; GFX9-NEXT:    s_not_b64 exec, exec
1932; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
1933; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1934; GFX9-NEXT:    s_nop 1
1935; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1936; GFX9-NEXT:    s_nop 1
1937; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1938; GFX9-NEXT:    s_nop 1
1939; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1940; GFX9-NEXT:    s_nop 1
1941; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1942; GFX9-NEXT:    s_nop 1
1943; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1944; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
1945; GFX9-NEXT:    s_nop 0
1946; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1947; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
1948; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1949; GFX9-NEXT:    ; implicit-def: $vgpr0
1950; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1951; GFX9-NEXT:    s_cbranch_execz BB10_2
1952; GFX9-NEXT:  ; %bb.1:
1953; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
1954; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1955; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v3
1957; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1958; GFX9-NEXT:  BB10_2:
1959; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1960; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1961; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1962; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1963; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1964; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1965; GFX9-NEXT:    s_mov_b32 s2, -1
1966; GFX9-NEXT:    s_nop 0
1967; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1968; GFX9-NEXT:    s_endpgm
1969;
1970; GFX1064-LABEL: sub_i32_varying:
1971; GFX1064:       ; %bb.0: ; %entry
1972; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1973; GFX1064-NEXT:    s_not_b64 exec, exec
1974; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1975; GFX1064-NEXT:    s_not_b64 exec, exec
1976; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1977; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
1978; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1979; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
1980; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
1981; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
1982; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1983; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1984; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1985; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1986; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1987; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1988; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
1989; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1990; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1991; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1992; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1993; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
1994; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
1995; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1996; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1997; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1998; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
1999; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2000; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2001; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2002; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2003; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2004; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2005; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2006; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2007; GFX1064-NEXT:    s_mov_b32 s2, -1
2008; GFX1064-NEXT:    ; implicit-def: $vgpr0
2009; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2010; GFX1064-NEXT:    s_cbranch_execz BB10_2
2011; GFX1064-NEXT:  ; %bb.1:
2012; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2013; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2014; GFX1064-NEXT:    s_mov_b32 s3, s7
2015; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2016; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2017; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2018; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX1064-NEXT:    buffer_gl0_inv
2020; GFX1064-NEXT:  BB10_2:
2021; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2022; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2023; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2024; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2025; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2026; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2027; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX1064-NEXT:    s_nop 0
2029; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2030; GFX1064-NEXT:    s_endpgm
2031;
2032; GFX1032-LABEL: sub_i32_varying:
2033; GFX1032:       ; %bb.0: ; %entry
2034; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2035; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2036; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2037; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2038; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2039; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2040; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2041; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2042; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2043; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2044; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2045; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2046; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2047; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2048; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2049; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2050; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2051; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2052; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2053; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2054; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2055; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2056; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2057; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2058; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2059; GFX1032-NEXT:    s_mov_b32 s2, -1
2060; GFX1032-NEXT:    ; implicit-def: $vgpr0
2061; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2062; GFX1032-NEXT:    s_cbranch_execz BB10_2
2063; GFX1032-NEXT:  ; %bb.1:
2064; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2065; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2066; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2067; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2068; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v7, v4
2069; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2070; GFX1032-NEXT:    buffer_gl0_inv
2071; GFX1032-NEXT:  BB10_2:
2072; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2073; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2074; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2075; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2076; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
2077; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2078; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX1032-NEXT:    s_nop 0
2080; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2081; GFX1032-NEXT:    s_endpgm
2082entry:
  ; The operand is the work-item id, so it is a per-lane value rather than a
  ; constant or a kernel argument.
2083  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2084  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Store the value returned by the atomic so the result stays live.
2085  store i32 %old, i32 addrspace(1)* %out
2086  ret void
2087}
2088
; sub_i64_constant - 64-bit atomic subtract of the constant 5 on the local
; (addrspace 3) variable @local_var64; the pre-operation value is stored to %out.
; In every run the expected code lets only the lane whose v_mbcnt result is 0
; issue a single ds_sub_rtn_u64 whose operand is popcount(exec) * 5 (s_bcnt1
; followed by the u24 multiplies). After the branch rejoins, each lane rebuilds
; its own result as readfirstlane(old value) - 5 * (its v_mbcnt result).
2089define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
2090;
2091;
2092; GFX7LESS-LABEL: sub_i64_constant:
2093; GFX7LESS:       ; %bb.0: ; %entry
2094; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
2095; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2096; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2097; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
2098; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2099; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2100; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2101; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
2102; GFX7LESS-NEXT:  ; %bb.1:
2103; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2104; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2105; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2106; GFX7LESS-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2107; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2108; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2109; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2110; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2111; GFX7LESS-NEXT:  BB11_2:
2112; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2113; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2114; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
2115; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
2116; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2117; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2118; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2119; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
2120; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2121; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2122; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2123; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2124; GFX7LESS-NEXT:    s_endpgm
2125;
2126; GFX8-LABEL: sub_i64_constant:
2127; GFX8:       ; %bb.0: ; %entry
2128; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2129; GFX8-NEXT:    s_mov_b64 s[4:5], exec
2130; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2131; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2132; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2133; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2134; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2135; GFX8-NEXT:    s_cbranch_execz BB11_2
2136; GFX8-NEXT:  ; %bb.1:
2137; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2138; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2139; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2140; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2141; GFX8-NEXT:    s_mov_b32 m0, -1
2142; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2143; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2144; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2145; GFX8-NEXT:  BB11_2:
2146; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2147; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
2149; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
2150; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2151; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2152; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2153; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
2154; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2155; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2156; GFX8-NEXT:    s_mov_b32 s2, -1
2157; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2158; GFX8-NEXT:    s_endpgm
2159;
2160; GFX9-LABEL: sub_i64_constant:
2161; GFX9:       ; %bb.0: ; %entry
2162; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2163; GFX9-NEXT:    s_mov_b64 s[4:5], exec
2164; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
2165; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
2166; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2167; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2168; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2169; GFX9-NEXT:    s_cbranch_execz BB11_2
2170; GFX9-NEXT:  ; %bb.1:
2171; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2172; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2173; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2174; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2175; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2176; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2177; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2178; GFX9-NEXT:  BB11_2:
2179; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
2182; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2183; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
2184; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
2185; GFX9-NEXT:    v_mov_b32_e32 v2, s3
2186; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
2187; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2188; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2189; GFX9-NEXT:    s_mov_b32 s2, -1
2190; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2191; GFX9-NEXT:    s_endpgm
2192;
2193; GFX1064-LABEL: sub_i64_constant:
2194; GFX1064:       ; %bb.0: ; %entry
2195; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2196; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
2197; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2198; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
2199; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
2200; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2201; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2202; GFX1064-NEXT:    s_cbranch_execz BB11_2
2203; GFX1064-NEXT:  ; %bb.1:
2204; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
2205; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2206; GFX1064-NEXT:    v_mul_hi_u32_u24_e64 v2, s4, 5
2207; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
2208; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2209; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2210; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2211; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2212; GFX1064-NEXT:    buffer_gl0_inv
2213; GFX1064-NEXT:  BB11_2:
2214; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2215; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2216; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2217; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2218; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
2219; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2220; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
2221; GFX1064-NEXT:    s_mov_b32 s2, -1
2222; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2223; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2224; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2225; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2226; GFX1064-NEXT:    s_endpgm
2227;
2228; GFX1032-LABEL: sub_i64_constant:
2229; GFX1032:       ; %bb.0: ; %entry
2230; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2231; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
2232; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2233; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
2234; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2235; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2236; GFX1032-NEXT:    s_cbranch_execz BB11_2
2237; GFX1032-NEXT:  ; %bb.1:
2238; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
2239; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2240; GFX1032-NEXT:    v_mul_hi_u32_u24_e64 v2, s3, 5
2241; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s3, 5
2242; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2243; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2244; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2245; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2246; GFX1032-NEXT:    buffer_gl0_inv
2247; GFX1032-NEXT:  BB11_2:
2248; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2249; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2250; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2251; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
2252; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
2253; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
2254; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
2255; GFX1032-NEXT:    s_mov_b32 s2, -1
2256; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2257; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2258; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2259; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2260; GFX1032-NEXT:    s_endpgm
2261entry:
  ; The operand is the same constant (5) in every lane, which is what allows
  ; the expected code above to fold the per-lane subtractions into one DS atomic.
2262  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Each lane stores the pre-operation value it observed.
2263  store i64 %old, i64 addrspace(1)* %out
2264  ret void
2265}
2266
; sub_i64_uniform - 64-bit atomic subtract on the local (addrspace 3) variable
; @local_var64 where the operand is the kernel argument %subitive; the
; pre-operation value is stored to %out.
; In every run the expected code lets only the lane whose v_mbcnt result is 0
; issue a single ds_sub_rtn_u64 whose operand is the 64-bit product of the
; loaded argument and popcount(exec) (s_bcnt1 plus mul / mul_hi / add). After
; the branch rejoins, each lane rebuilds its own result as
; readfirstlane(old value) - argument * (its v_mbcnt result).
2267define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
2268;
2269;
2270; GFX7LESS-LABEL: sub_i64_uniform:
2271; GFX7LESS:       ; %bb.0: ; %entry
2272; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2273; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2274; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2275; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
2276; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2277; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
2278; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2279; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
2280; GFX7LESS-NEXT:  ; %bb.1:
2281; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2282; GFX7LESS-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2283; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2284; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
2285; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2286; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
2287; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
2288; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
2289; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
2290; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2291; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2293; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2294; GFX7LESS-NEXT:  BB12_2:
2295; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2296; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2297; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2298; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2300; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2301; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
2302; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
2303; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
2304; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
2305; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
2306; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2307; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
2308; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2309; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2310; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2311; GFX7LESS-NEXT:    s_endpgm
2312;
2313; GFX8-LABEL: sub_i64_uniform:
2314; GFX8:       ; %bb.0: ; %entry
2315; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2316; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2317; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2318; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2319; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2320; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
2321; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2322; GFX8-NEXT:    s_cbranch_execz BB12_2
2323; GFX8-NEXT:  ; %bb.1:
2324; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2325; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2326; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2327; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
2328; GFX8-NEXT:    s_mul_i32 s7, s3, s6
2329; GFX8-NEXT:    s_mul_i32 s6, s2, s6
2330; GFX8-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2331; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
2332; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2333; GFX8-NEXT:    s_mov_b32 m0, -1
2334; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2335; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2336; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2337; GFX8-NEXT:  BB12_2:
2338; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2339; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2340; GFX8-NEXT:    s_mov_b32 s4, s0
2341; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
2342; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
2343; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
2344; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
2345; GFX8-NEXT:    s_mov_b32 s5, s1
2346; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
2347; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2348; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2349; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
2350; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2351; GFX8-NEXT:    s_mov_b32 s6, -1
2352; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2353; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2354; GFX8-NEXT:    s_endpgm
2355;
2356; GFX9-LABEL: sub_i64_uniform:
2357; GFX9:       ; %bb.0: ; %entry
2358; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2359; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2360; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2361; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
2362; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2363; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
2364; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2365; GFX9-NEXT:    s_cbranch_execz BB12_2
2366; GFX9-NEXT:  ; %bb.1:
2367; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2369; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2370; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2371; GFX9-NEXT:    s_add_i32 s8, s8, s7
2372; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2373; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2374; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2375; GFX9-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2376; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2377; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2379; GFX9-NEXT:  BB12_2:
2380; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2381; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2382; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2383; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2384; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2385; GFX9-NEXT:    s_mov_b32 s4, s0
2386; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2387; GFX9-NEXT:    s_mov_b32 s5, s1
2388; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2389; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2390; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2391; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2392; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2393; GFX9-NEXT:    s_mov_b32 s6, -1
2394; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2395; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2396; GFX9-NEXT:    s_endpgm
2397;
2398; GFX1064-LABEL: sub_i64_uniform:
2399; GFX1064:       ; %bb.0: ; %entry
2400; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2401; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2402; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
2403; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2404; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
2405; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2406; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2407; GFX1064-NEXT:    s_cbranch_execz BB12_2
2408; GFX1064-NEXT:  ; %bb.1:
2409; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2410; GFX1064-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2411; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2412; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
2413; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
2414; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
2415; GFX1064-NEXT:    s_add_i32 s8, s8, s7
2416; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
2417; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
2418; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2419; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2420; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2421; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2422; GFX1064-NEXT:    buffer_gl0_inv
2423; GFX1064-NEXT:  BB12_2:
2424; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2425; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2426; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2427; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
2428; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
2429; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
2430; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
2431; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
2432; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2433; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2434; GFX1064-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v0
2435; GFX1064-NEXT:    s_mov_b32 s2, -1
2436; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
2437; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2438; GFX1064-NEXT:    s_endpgm
2439;
2440; GFX1032-LABEL: sub_i64_uniform:
2441; GFX1032:       ; %bb.0: ; %entry
2442; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2443; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2444; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
2445; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
2446; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2447; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2448; GFX1032-NEXT:    s_cbranch_execz BB12_2
2449; GFX1032-NEXT:  ; %bb.1:
2450; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2451; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
2452; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2453; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
2454; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
2455; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
2456; GFX1032-NEXT:    s_add_i32 s7, s7, s6
2457; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
2458; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
2459; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2460; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2461; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
2462; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX1032-NEXT:    buffer_gl0_inv
2464; GFX1032-NEXT:  BB12_2:
2465; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2466; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2467; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2468; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
2469; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
2470; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
2471; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
2472; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
2473; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2474; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2475; GFX1032-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v0
2476; GFX1032-NEXT:    s_mov_b32 s2, -1
2477; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
2478; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2479; GFX1032-NEXT:    s_endpgm
2480entry:
  ; %subitive is a kernel argument, so every lane subtracts the same value; the
  ; expected code above multiplies it by the active-lane count instead of
  ; issuing one atomic per lane.
2481  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  ; Each lane stores the pre-operation value it observed.
2482  store i64 %old, i64 addrspace(1)* %out
2483  ret void
2484}
2485
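; sub_i64_varying is expected to contain no v_mbcnt_lo_u32_b32,
; v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. The GCN-NOT directives that used to
; state this were never evaluated, because no RUN line enables a GCN check
; prefix; the autogenerated per-target checks inside the function already
; verify the complete instruction sequence.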
; sub_i64_varying - 64-bit atomic subtract on the local (addrspace 3) variable
; @local_var64 where the operand is the zero-extended workitem id, so it
; differs from lane to lane; the pre-operation value is stored to %out.
; In every run the expected code is a plain ds_sub_rtn_u64 fed with v[0:1]
; (v1 is zeroed for the zext) followed by the store, with no lane-counting or
; single-lane branch around the atomic.
2489define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
2490;
2491;
2492; GFX7LESS-LABEL: sub_i64_varying:
2493; GFX7LESS:       ; %bb.0: ; %entry
2494; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2495; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2496; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2497; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2498; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2499; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2500; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2501; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2502; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2503; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2504; GFX7LESS-NEXT:    s_endpgm
2505;
2506; GFX8-LABEL: sub_i64_varying:
2507; GFX8:       ; %bb.0: ; %entry
2508; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2509; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2510; GFX8-NEXT:    s_mov_b32 m0, -1
2511; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2512; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2513; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2514; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2515; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2516; GFX8-NEXT:    s_mov_b32 s2, -1
2517; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2518; GFX8-NEXT:    s_endpgm
2519;
2520; GFX9-LABEL: sub_i64_varying:
2521; GFX9:       ; %bb.0: ; %entry
2522; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2523; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2524; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2525; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2526; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2527; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2528; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2529; GFX9-NEXT:    s_mov_b32 s2, -1
2530; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2531; GFX9-NEXT:    s_endpgm
2532;
2533; GFX1064-LABEL: sub_i64_varying:
2534; GFX1064:       ; %bb.0: ; %entry
2535; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2536; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2537; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2538; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2539; GFX1064-NEXT:    s_mov_b32 s2, -1
2540; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2541; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2542; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2543; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2544; GFX1064-NEXT:    buffer_gl0_inv
2545; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2546; GFX1064-NEXT:    s_endpgm
2547;
2548; GFX1032-LABEL: sub_i64_varying:
2549; GFX1032:       ; %bb.0: ; %entry
2550; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2551; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
2552; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2553; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2554; GFX1032-NEXT:    s_mov_b32 s2, -1
2555; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2556; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2557; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v2, v[0:1]
2558; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2559; GFX1032-NEXT:    buffer_gl0_inv
2560; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2561; GFX1032-NEXT:    s_endpgm
2562entry:
  ; The workitem id gives each lane a different operand (the varying case).
2563  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Widen to 64 bits; the expected code does this by zeroing the high register.
2564  %zext = zext i32 %lane to i64
2565  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  ; Each lane stores the pre-operation value it observed.
2566  store i64 %old, i64 addrspace(1)* %out
2567  ret void
2568}
2569
; and_i32_varying - 32-bit atomic AND on the local (addrspace 3) variable
; @local_var32 where the operand is the workitem id, so it differs from lane to
; lane; the pre-operation value is stored to %out.
; Expected code, as pinned by the checks in the body.
;  - The GFX7LESS run expects a plain ds_and_rtn_b32 with no reduction.
;  - The tonga and gfx900 runs write the AND identity (-1) into inactive lanes,
;    combine lanes with v_and_b32_dpp (row_shr 1/2/4/8, then row_bcast 15/31),
;    read the total from lane 63, and shift the scan by one lane (wave_shr 1).
;  - The gfx1010 runs do the row_shr steps the same way but combine rows with
;    v_permlanex16_b32 plus v_readlane / v_writelane.
;  In the reducing runs only the lane whose v_mbcnt result is 0 issues the
;  ds_and_rtn_b32 with the combined value, and every lane's result is
;  readfirstlane(old value) AND its shifted scan value.
2570define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
2571;
2572;
2573; GFX7LESS-LABEL: and_i32_varying:
2574; GFX7LESS:       ; %bb.0: ; %entry
2575; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2576; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2577; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2578; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2579; GFX7LESS-NEXT:    ds_and_rtn_b32 v0, v1, v0
2580; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2581; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2582; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2583; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2584; GFX7LESS-NEXT:    s_endpgm
2585;
2586; GFX8-LABEL: and_i32_varying:
2587; GFX8:       ; %bb.0: ; %entry
2588; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2589; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2590; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2591; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2592; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2593; GFX8-NEXT:    v_mov_b32_e32 v1, -1
2594; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2595; GFX8-NEXT:    s_not_b64 exec, exec
2596; GFX8-NEXT:    v_mov_b32_e32 v2, -1
2597; GFX8-NEXT:    s_not_b64 exec, exec
2598; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2599; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2600; GFX8-NEXT:    s_nop 1
2601; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2602; GFX8-NEXT:    s_nop 1
2603; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2604; GFX8-NEXT:    s_nop 1
2605; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2606; GFX8-NEXT:    s_nop 1
2607; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2608; GFX8-NEXT:    s_nop 1
2609; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2610; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2611; GFX8-NEXT:    s_nop 0
2612; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2613; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2614; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2615; GFX8-NEXT:    ; implicit-def: $vgpr0
2616; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2617; GFX8-NEXT:    s_cbranch_execz BB14_2
2618; GFX8-NEXT:  ; %bb.1:
2619; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2620; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2621; GFX8-NEXT:    s_mov_b32 m0, -1
2622; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2623; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
2624; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2625; GFX8-NEXT:  BB14_2:
2626; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2627; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2629; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2630; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
2631; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2632; GFX8-NEXT:    s_mov_b32 s2, -1
2633; GFX8-NEXT:    s_nop 0
2634; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2635; GFX8-NEXT:    s_endpgm
2636;
2637; GFX9-LABEL: and_i32_varying:
2638; GFX9:       ; %bb.0: ; %entry
2639; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2640; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2641; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2642; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2643; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2644; GFX9-NEXT:    v_mov_b32_e32 v1, -1
2645; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2646; GFX9-NEXT:    s_not_b64 exec, exec
2647; GFX9-NEXT:    v_mov_b32_e32 v2, -1
2648; GFX9-NEXT:    s_not_b64 exec, exec
2649; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2650; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2651; GFX9-NEXT:    s_nop 1
2652; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2653; GFX9-NEXT:    s_nop 1
2654; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2655; GFX9-NEXT:    s_nop 1
2656; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2657; GFX9-NEXT:    s_nop 1
2658; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2659; GFX9-NEXT:    s_nop 1
2660; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2661; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2662; GFX9-NEXT:    s_nop 0
2663; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2664; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2665; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2666; GFX9-NEXT:    ; implicit-def: $vgpr0
2667; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2668; GFX9-NEXT:    s_cbranch_execz BB14_2
2669; GFX9-NEXT:  ; %bb.1:
2670; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2671; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2672; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2673; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
2674; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2675; GFX9-NEXT:  BB14_2:
2676; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2677; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2678; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2679; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2680; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
2681; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2682; GFX9-NEXT:    s_mov_b32 s2, -1
2683; GFX9-NEXT:    s_nop 0
2684; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2685; GFX9-NEXT:    s_endpgm
2686;
2687; GFX1064-LABEL: and_i32_varying:
2688; GFX1064:       ; %bb.0: ; %entry
2689; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2690; GFX1064-NEXT:    s_not_b64 exec, exec
2691; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
2692; GFX1064-NEXT:    s_not_b64 exec, exec
2693; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2694; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2695; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
2696; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2697; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2698; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2699; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2700; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2701; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2702; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2703; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2704; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2705; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2706; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2707; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2708; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2709; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2710; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2711; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2712; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2713; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2714; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2715; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2716; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2717; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2718; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2719; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2720; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2721; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2722; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2723; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2724; GFX1064-NEXT:    s_mov_b32 s2, -1
2725; GFX1064-NEXT:    ; implicit-def: $vgpr0
2726; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2727; GFX1064-NEXT:    s_cbranch_execz BB14_2
2728; GFX1064-NEXT:  ; %bb.1:
2729; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2730; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2731; GFX1064-NEXT:    s_mov_b32 s3, s7
2732; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2733; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2734; GFX1064-NEXT:    ds_and_rtn_b32 v0, v7, v4
2735; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2736; GFX1064-NEXT:    buffer_gl0_inv
2737; GFX1064-NEXT:  BB14_2:
2738; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2739; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2740; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2741; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2742; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
2743; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2744; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2745; GFX1064-NEXT:    s_nop 0
2746; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2747; GFX1064-NEXT:    s_endpgm
2748;
2749; GFX1032-LABEL: and_i32_varying:
2750; GFX1032:       ; %bb.0: ; %entry
2751; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2752; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2753; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
2754; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2755; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2756; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2757; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2758; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2759; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2760; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2761; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2762; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2763; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2764; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2765; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2766; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
2767; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
2768; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
2769; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2770; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2771; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2772; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2773; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
2774; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2775; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2776; GFX1032-NEXT:    s_mov_b32 s2, -1
2777; GFX1032-NEXT:    ; implicit-def: $vgpr0
2778; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2779; GFX1032-NEXT:    s_cbranch_execz BB14_2
2780; GFX1032-NEXT:  ; %bb.1:
2781; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2782; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
2783; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2784; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2785; GFX1032-NEXT:    ds_and_rtn_b32 v0, v7, v4
2786; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2787; GFX1032-NEXT:    buffer_gl0_inv
2788; GFX1032-NEXT:  BB14_2:
2789; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2790; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2791; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
2792; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2793; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
2794; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2795; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2796; GFX1032-NEXT:    s_nop 0
2797; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2798; GFX1032-NEXT:    s_endpgm
2799entry:
  ; The workitem id gives each lane a different operand (the varying case).
2800  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2801  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ; Each lane stores the pre-operation value it observed.
2802  store i32 %old, i32 addrspace(1)* %out
2803  ret void
2804}
2805
; Test: atomicrmw `or` on the addrspace(3) global @local_var32 where the operand
; is the value returned by @llvm.amdgcn.workitem.id.x (presumably different in
; every lane, hence "varying" in the name). The pre-operation value is stored
; to %out.
;
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
;
; What the expected output shows:
;  - Under the GFX7LESS prefix the ds_or_rtn_b32 is issued directly on v0, with
;    no mbcnt/DPP sequence in front of it.
;  - Under the GFX8, GFX9, GFX1064 and GFX1032 prefixes the value is first
;    combined across lanes with a chain of v_or_b32_dpp steps (lanes toggled in
;    by the paired s_not exec flips are seeded with 0), then a single
;    ds_or_rtn_b32 is issued inside an s_and_saveexec region selected by
;    comparing the mbcnt result with 0, and finally each lane ORs the
;    v_readfirstlane result with its own shifted partial value before the store.
2806define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
2807;
2808;
2809; GFX7LESS-LABEL: or_i32_varying:
2810; GFX7LESS:       ; %bb.0: ; %entry
2811; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2812; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
2813; GFX7LESS-NEXT:    s_mov_b32 m0, -1
2814; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2815; GFX7LESS-NEXT:    ds_or_rtn_b32 v0, v1, v0
2816; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2817; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2818; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2819; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2820; GFX7LESS-NEXT:    s_endpgm
2821;
2822; GFX8-LABEL: or_i32_varying:
2823; GFX8:       ; %bb.0: ; %entry
2824; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2825; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2826; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2827; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2828; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2829; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2830; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2831; GFX8-NEXT:    s_not_b64 exec, exec
2832; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2833; GFX8-NEXT:    s_not_b64 exec, exec
2834; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
2835; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2836; GFX8-NEXT:    s_nop 1
2837; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2838; GFX8-NEXT:    s_nop 1
2839; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2840; GFX8-NEXT:    s_nop 1
2841; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2842; GFX8-NEXT:    s_nop 1
2843; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2844; GFX8-NEXT:    s_nop 1
2845; GFX8-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2846; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
2847; GFX8-NEXT:    s_nop 0
2848; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2849; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
2850; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2851; GFX8-NEXT:    ; implicit-def: $vgpr0
2852; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2853; GFX8-NEXT:    s_cbranch_execz BB15_2
2854; GFX8-NEXT:  ; %bb.1:
2855; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2856; GFX8-NEXT:    v_mov_b32_e32 v3, s4
2857; GFX8-NEXT:    s_mov_b32 m0, -1
2858; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2859; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v3
2860; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2861; GFX8-NEXT:  BB15_2:
2862; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2863; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2864; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2865; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2866; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
2867; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2868; GFX8-NEXT:    s_mov_b32 s2, -1
2869; GFX8-NEXT:    s_nop 0
2870; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2871; GFX8-NEXT:    s_endpgm
2872;
2873; GFX9-LABEL: or_i32_varying:
2874; GFX9:       ; %bb.0: ; %entry
2875; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2876; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2877; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2878; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2879; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2880; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2881; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2882; GFX9-NEXT:    s_not_b64 exec, exec
2883; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2884; GFX9-NEXT:    s_not_b64 exec, exec
2885; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
2886; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2887; GFX9-NEXT:    s_nop 1
2888; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2889; GFX9-NEXT:    s_nop 1
2890; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2891; GFX9-NEXT:    s_nop 1
2892; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2893; GFX9-NEXT:    s_nop 1
2894; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2895; GFX9-NEXT:    s_nop 1
2896; GFX9-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2897; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
2898; GFX9-NEXT:    s_nop 0
2899; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2900; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
2901; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2902; GFX9-NEXT:    ; implicit-def: $vgpr0
2903; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2904; GFX9-NEXT:    s_cbranch_execz BB15_2
2905; GFX9-NEXT:  ; %bb.1:
2906; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
2907; GFX9-NEXT:    v_mov_b32_e32 v3, s4
2908; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2909; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v3
2910; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2911; GFX9-NEXT:  BB15_2:
2912; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
2913; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2914; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2915; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2916; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
2917; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2918; GFX9-NEXT:    s_mov_b32 s2, -1
2919; GFX9-NEXT:    s_nop 0
2920; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2921; GFX9-NEXT:    s_endpgm
2922;
2923; GFX1064-LABEL: or_i32_varying:
2924; GFX1064:       ; %bb.0: ; %entry
2925; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2926; GFX1064-NEXT:    s_not_b64 exec, exec
2927; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2928; GFX1064-NEXT:    s_not_b64 exec, exec
2929; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2930; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2931; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2932; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2933; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2934; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2935; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2936; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2937; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2938; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2939; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2940; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2941; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
2942; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2943; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2944; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2945; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2946; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
2947; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
2948; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2949; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2950; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2951; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
2952; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
2953; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
2954; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2955; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
2956; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2957; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
2958; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2959; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2960; GFX1064-NEXT:    s_mov_b32 s2, -1
2961; GFX1064-NEXT:    ; implicit-def: $vgpr0
2962; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2963; GFX1064-NEXT:    s_cbranch_execz BB15_2
2964; GFX1064-NEXT:  ; %bb.1:
2965; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
2966; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
2967; GFX1064-NEXT:    s_mov_b32 s3, s7
2968; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2969; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2970; GFX1064-NEXT:    ds_or_rtn_b32 v0, v7, v4
2971; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2972; GFX1064-NEXT:    buffer_gl0_inv
2973; GFX1064-NEXT:  BB15_2:
2974; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2975; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2976; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
2977; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2978; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
2979; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2980; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2981; GFX1064-NEXT:    s_nop 0
2982; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2983; GFX1064-NEXT:    s_endpgm
2984;
2985; GFX1032-LABEL: or_i32_varying:
2986; GFX1032:       ; %bb.0: ; %entry
2987; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2988; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2989; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2990; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2991; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2992; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
2993; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
2994; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
2995; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
2996; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2997; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2998; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2999; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3000; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3001; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3002; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3003; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3004; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3005; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3006; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3007; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3008; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3009; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3010; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3011; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3012; GFX1032-NEXT:    s_mov_b32 s2, -1
3013; GFX1032-NEXT:    ; implicit-def: $vgpr0
3014; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3015; GFX1032-NEXT:    s_cbranch_execz BB15_2
3016; GFX1032-NEXT:  ; %bb.1:
3017; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3018; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3019; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3020; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3021; GFX1032-NEXT:    ds_or_rtn_b32 v0, v7, v4
3022; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3023; GFX1032-NEXT:    buffer_gl0_inv
3024; GFX1032-NEXT:  BB15_2:
3025; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3026; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3027; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3028; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3029; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
3030; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3031; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3032; GFX1032-NEXT:    s_nop 0
3033; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3034; GFX1032-NEXT:    s_endpgm
; IR under test: the operand is the x workitem id, the atomic uses acq_rel
; ordering, and the returned old value is what gets written to %out.
3035entry:
3036  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3037  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3038  store i32 %old, i32 addrspace(1)* %out
3039  ret void
3040}
3041
; Test: atomicrmw `xor` on the addrspace(3) global @local_var32 where the
; operand is the value returned by @llvm.amdgcn.workitem.id.x (presumably
; different in every lane, hence "varying" in the name). The pre-operation
; value is stored to %out.
;
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
;
; What the expected output shows:
;  - Under the GFX7LESS prefix the ds_xor_rtn_b32 is issued directly on v0,
;    with no mbcnt/DPP sequence in front of it.
;  - Under the GFX8, GFX9, GFX1064 and GFX1032 prefixes the value is first
;    combined across lanes with a chain of v_xor_b32_dpp steps (lanes toggled
;    in by the paired s_not exec flips are seeded with 0), then a single
;    ds_xor_rtn_b32 is issued inside an s_and_saveexec region selected by
;    comparing the mbcnt result with 0, and finally each lane XORs the
;    v_readfirstlane result with its own shifted partial value before the store.
3042define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
3043;
3044;
3045; GFX7LESS-LABEL: xor_i32_varying:
3046; GFX7LESS:       ; %bb.0: ; %entry
3047; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3048; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3049; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3050; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3051; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
3052; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3053; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3054; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3055; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3056; GFX7LESS-NEXT:    s_endpgm
3057;
3058; GFX8-LABEL: xor_i32_varying:
3059; GFX8:       ; %bb.0: ; %entry
3060; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3061; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3062; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3063; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3064; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3065; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3066; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3067; GFX8-NEXT:    s_not_b64 exec, exec
3068; GFX8-NEXT:    v_mov_b32_e32 v2, 0
3069; GFX8-NEXT:    s_not_b64 exec, exec
3070; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3071; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3072; GFX8-NEXT:    s_nop 1
3073; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3074; GFX8-NEXT:    s_nop 1
3075; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3076; GFX8-NEXT:    s_nop 1
3077; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3078; GFX8-NEXT:    s_nop 1
3079; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3080; GFX8-NEXT:    s_nop 1
3081; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3082; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3083; GFX8-NEXT:    s_nop 0
3084; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3085; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3086; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3087; GFX8-NEXT:    ; implicit-def: $vgpr0
3088; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3089; GFX8-NEXT:    s_cbranch_execz BB16_2
3090; GFX8-NEXT:  ; %bb.1:
3091; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3092; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3093; GFX8-NEXT:    s_mov_b32 m0, -1
3094; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3095; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3096; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3097; GFX8-NEXT:  BB16_2:
3098; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3099; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3100; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3101; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3102; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3103; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3104; GFX8-NEXT:    s_mov_b32 s2, -1
3105; GFX8-NEXT:    s_nop 0
3106; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3107; GFX8-NEXT:    s_endpgm
3108;
3109; GFX9-LABEL: xor_i32_varying:
3110; GFX9:       ; %bb.0: ; %entry
3111; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3112; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3113; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3114; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3115; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3116; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3117; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3118; GFX9-NEXT:    s_not_b64 exec, exec
3119; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3120; GFX9-NEXT:    s_not_b64 exec, exec
3121; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3122; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3123; GFX9-NEXT:    s_nop 1
3124; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3125; GFX9-NEXT:    s_nop 1
3126; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3127; GFX9-NEXT:    s_nop 1
3128; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3129; GFX9-NEXT:    s_nop 1
3130; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3131; GFX9-NEXT:    s_nop 1
3132; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3133; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3134; GFX9-NEXT:    s_nop 0
3135; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3136; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3137; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3138; GFX9-NEXT:    ; implicit-def: $vgpr0
3139; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3140; GFX9-NEXT:    s_cbranch_execz BB16_2
3141; GFX9-NEXT:  ; %bb.1:
3142; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3143; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3145; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
3146; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3147; GFX9-NEXT:  BB16_2:
3148; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3149; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3150; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3151; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3152; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
3153; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3154; GFX9-NEXT:    s_mov_b32 s2, -1
3155; GFX9-NEXT:    s_nop 0
3156; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3157; GFX9-NEXT:    s_endpgm
3158;
3159; GFX1064-LABEL: xor_i32_varying:
3160; GFX1064:       ; %bb.0: ; %entry
3161; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
3162; GFX1064-NEXT:    s_not_b64 exec, exec
3163; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3164; GFX1064-NEXT:    s_not_b64 exec, exec
3165; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3166; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3167; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
3168; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3169; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3170; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3171; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3172; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3173; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3174; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
3175; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
3176; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3177; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
3178; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3179; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3180; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3181; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3182; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
3183; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
3184; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3185; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3186; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3187; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
3188; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
3189; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
3190; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3191; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3192; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3193; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
3194; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3195; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3196; GFX1064-NEXT:    s_mov_b32 s2, -1
3197; GFX1064-NEXT:    ; implicit-def: $vgpr0
3198; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3199; GFX1064-NEXT:    s_cbranch_execz BB16_2
3200; GFX1064-NEXT:  ; %bb.1:
3201; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3202; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3203; GFX1064-NEXT:    s_mov_b32 s3, s7
3204; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3205; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3206; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3207; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3208; GFX1064-NEXT:    buffer_gl0_inv
3209; GFX1064-NEXT:  BB16_2:
3210; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3211; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3212; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3213; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
3214; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
3215; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3216; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3217; GFX1064-NEXT:    s_nop 0
3218; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3219; GFX1064-NEXT:    s_endpgm
3220;
3221; GFX1032-LABEL: xor_i32_varying:
3222; GFX1032:       ; %bb.0: ; %entry
3223; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
3224; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3225; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3226; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3227; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3228; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
3229; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
3230; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
3231; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
3232; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3233; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
3234; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3235; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3236; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3237; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3238; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
3239; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
3240; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
3241; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3242; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3243; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3244; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3245; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
3246; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3247; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3248; GFX1032-NEXT:    s_mov_b32 s2, -1
3249; GFX1032-NEXT:    ; implicit-def: $vgpr0
3250; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3251; GFX1032-NEXT:    s_cbranch_execz BB16_2
3252; GFX1032-NEXT:  ; %bb.1:
3253; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3254; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3255; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3256; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3257; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v7, v4
3258; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX1032-NEXT:    buffer_gl0_inv
3260; GFX1032-NEXT:  BB16_2:
3261; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3262; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3263; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3264; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
3265; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
3266; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3267; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3268; GFX1032-NEXT:    s_nop 0
3269; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3270; GFX1032-NEXT:    s_endpgm
; IR under test: the operand is the x workitem id, the atomic uses acq_rel
; ordering, and the returned old value is what gets written to %out.
3271entry:
3272  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3273  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3274  store i32 %old, i32 addrspace(1)* %out
3275  ret void
3276}
3277
3278define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
3279;
3280;
3281; GFX7LESS-LABEL: max_i32_varying:
3282; GFX7LESS:       ; %bb.0: ; %entry
3283; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3284; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3285; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3286; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3287; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
3288; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3289; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3290; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3291; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3292; GFX7LESS-NEXT:    s_endpgm
3293;
3294; GFX8-LABEL: max_i32_varying:
3295; GFX8:       ; %bb.0: ; %entry
3296; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3297; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3298; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3299; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3300; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3301; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
3302; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3303; GFX8-NEXT:    s_not_b64 exec, exec
3304; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3305; GFX8-NEXT:    s_not_b64 exec, exec
3306; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3307; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3308; GFX8-NEXT:    s_nop 1
3309; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3310; GFX8-NEXT:    s_nop 1
3311; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3312; GFX8-NEXT:    s_nop 1
3313; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3314; GFX8-NEXT:    s_nop 1
3315; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3316; GFX8-NEXT:    s_nop 1
3317; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3318; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3319; GFX8-NEXT:    s_nop 0
3320; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3321; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3322; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3323; GFX8-NEXT:    ; implicit-def: $vgpr0
3324; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3325; GFX8-NEXT:    s_cbranch_execz BB17_2
3326; GFX8-NEXT:  ; %bb.1:
3327; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3328; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3329; GFX8-NEXT:    s_mov_b32 m0, -1
3330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3331; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
3332; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3333; GFX8-NEXT:  BB17_2:
3334; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3336; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3337; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3338; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
3339; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3340; GFX8-NEXT:    s_mov_b32 s2, -1
3341; GFX8-NEXT:    s_nop 0
3342; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3343; GFX8-NEXT:    s_endpgm
3344;
3345; GFX9-LABEL: max_i32_varying:
3346; GFX9:       ; %bb.0: ; %entry
3347; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3348; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3349; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3350; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3351; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3352; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
3353; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3354; GFX9-NEXT:    s_not_b64 exec, exec
3355; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3356; GFX9-NEXT:    s_not_b64 exec, exec
3357; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3358; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3359; GFX9-NEXT:    s_nop 1
3360; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3361; GFX9-NEXT:    s_nop 1
3362; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3363; GFX9-NEXT:    s_nop 1
3364; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3365; GFX9-NEXT:    s_nop 1
3366; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3367; GFX9-NEXT:    s_nop 1
3368; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3369; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3370; GFX9-NEXT:    s_nop 0
3371; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3372; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3373; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3374; GFX9-NEXT:    ; implicit-def: $vgpr0
3375; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3376; GFX9-NEXT:    s_cbranch_execz BB17_2
3377; GFX9-NEXT:  ; %bb.1:
3378; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3379; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3381; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
3382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3383; GFX9-NEXT:  BB17_2:
3384; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3385; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3386; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3387; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3388; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
3389; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3390; GFX9-NEXT:    s_mov_b32 s2, -1
3391; GFX9-NEXT:    s_nop 0
3392; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3393; GFX9-NEXT:    s_endpgm
3394;
3395; GFX1064-LABEL: max_i32_varying:
3396; GFX1064:       ; %bb.0: ; %entry
3397; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3398; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3399; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
3400; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3401; GFX1064-NEXT:    s_not_b64 exec, exec
3402; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3403; GFX1064-NEXT:    s_not_b64 exec, exec
3404; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3405; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3406; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3407; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3408; GFX1064-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3409; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3410; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3411; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3412; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3413; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3414; GFX1064-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3415; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3416; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3417; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3418; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3419; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3420; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3421; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3422; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3423; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3424; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3425; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3426; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3427; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3428; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3429; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3430; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3431; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3432; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3433; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3434; GFX1064-NEXT:    s_mov_b32 s2, -1
3435; GFX1064-NEXT:    ; implicit-def: $vgpr0
3436; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3437; GFX1064-NEXT:    s_cbranch_execz BB17_2
3438; GFX1064-NEXT:  ; %bb.1:
3439; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3440; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3441; GFX1064-NEXT:    s_mov_b32 s3, s7
3442; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3443; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3444; GFX1064-NEXT:    ds_max_rtn_i32 v0, v7, v4
3445; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3446; GFX1064-NEXT:    buffer_gl0_inv
3447; GFX1064-NEXT:  BB17_2:
3448; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3449; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3450; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3451; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3452; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
3453; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3454; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3455; GFX1064-NEXT:    s_nop 0
3456; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3457; GFX1064-NEXT:    s_endpgm
3458;
3459; GFX1032-LABEL: max_i32_varying:
3460; GFX1032:       ; %bb.0: ; %entry
3461; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3462; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3463; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
3464; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3465; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3466; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3467; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3468; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3469; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3470; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3471; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3472; GFX1032-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3473; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3474; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3475; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3476; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3477; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3478; GFX1032-NEXT:    v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3479; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3480; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3481; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3482; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3483; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3484; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3485; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3486; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3487; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3488; GFX1032-NEXT:    s_mov_b32 s2, -1
3489; GFX1032-NEXT:    ; implicit-def: $vgpr0
3490; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3491; GFX1032-NEXT:    s_cbranch_execz BB17_2
3492; GFX1032-NEXT:  ; %bb.1:
3493; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3494; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3495; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3496; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3497; GFX1032-NEXT:    ds_max_rtn_i32 v0, v7, v4
3498; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3499; GFX1032-NEXT:    buffer_gl0_inv
3500; GFX1032-NEXT:  BB17_2:
3501; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3502; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3503; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3504; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3505; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
3506; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3507; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3508; GFX1032-NEXT:    s_nop 0
3509; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3510; GFX1032-NEXT:    s_endpgm
3511entry:
3512  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3513  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3514  store i32 %old, i32 addrspace(1)* %out
3515  ret void
3516}
3517
3518define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
3519; Uniform signed 64-bit max of the constant 5 on @local_var64 (addrspace 3). Every run expects one lane (mbcnt == 0) to
3520; issue ds_max_rtn_i64, then readfirstlane and a per-lane max against 5 (INT64_MIN in the issuing lane) before the store.
3521; GFX7LESS-LABEL: max_i64_constant:
3522; GFX7LESS:       ; %bb.0: ; %entry
3523; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3524; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3525; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3526; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3527; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3528; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3529; GFX7LESS-NEXT:    s_cbranch_execz BB18_2
3530; GFX7LESS-NEXT:  ; %bb.1:
3531; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3532; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3533; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3534; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3535; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3536; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3537; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3538; GFX7LESS-NEXT:  BB18_2:
3539; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3540; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3541; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3542; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3543; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
3544; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3545; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3546; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3547; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3548; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3549; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
3550; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3551; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3552; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3553; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3554; GFX7LESS-NEXT:    s_endpgm
3555;
3556; GFX8-LABEL: max_i64_constant:
3557; GFX8:       ; %bb.0: ; %entry
3558; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3559; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3560; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3561; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3562; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3563; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3564; GFX8-NEXT:    s_cbranch_execz BB18_2
3565; GFX8-NEXT:  ; %bb.1:
3566; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3567; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3568; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3569; GFX8-NEXT:    s_mov_b32 m0, -1
3570; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3571; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3572; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3573; GFX8-NEXT:  BB18_2:
3574; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3575; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3576; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3577; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
3578; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
3579; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3580; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3581; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3582; GFX8-NEXT:    v_mov_b32_e32 v2, s3
3583; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3584; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3585; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3586; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3587; GFX8-NEXT:    s_mov_b32 s2, -1
3588; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3589; GFX8-NEXT:    s_endpgm
3590;
3591; GFX9-LABEL: max_i64_constant:
3592; GFX9:       ; %bb.0: ; %entry
3593; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3594; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3595; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3596; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3597; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
3598; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3599; GFX9-NEXT:    s_cbranch_execz BB18_2
3600; GFX9-NEXT:  ; %bb.1:
3601; GFX9-NEXT:    v_mov_b32_e32 v0, 5
3602; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3603; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3604; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3605; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3606; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3607; GFX9-NEXT:  BB18_2:
3608; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3609; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3610; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3611; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
3612; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
3613; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3614; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3615; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3616; GFX9-NEXT:    v_mov_b32_e32 v2, s3
3617; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3618; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3619; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3620; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3621; GFX9-NEXT:    s_mov_b32 s2, -1
3622; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3623; GFX9-NEXT:    s_endpgm
3624;
3625; GFX1064-LABEL: max_i64_constant:
3626; GFX1064:       ; %bb.0: ; %entry
3627; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3628; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3629; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3630; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3631; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
3632; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3633; GFX1064-NEXT:    s_cbranch_execz BB18_2
3634; GFX1064-NEXT:  ; %bb.1:
3635; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
3636; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3637; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3638; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3639; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3640; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3641; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3642; GFX1064-NEXT:    buffer_gl0_inv
3643; GFX1064-NEXT:  BB18_2:
3644; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3645; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
3646; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
3647; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
3648; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
3649; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
3650; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
3651; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
3652; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
3653; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3654; GFX1064-NEXT:    s_mov_b32 s2, -1
3655; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3656; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3657; GFX1064-NEXT:    s_endpgm
3658;
3659; GFX1032-LABEL: max_i64_constant:
3660; GFX1032:       ; %bb.0: ; %entry
3661; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3662; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3663; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3664; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
3665; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3666; GFX1032-NEXT:    s_cbranch_execz BB18_2
3667; GFX1032-NEXT:  ; %bb.1:
3668; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
3669; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3670; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3671; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3672; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3673; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
3674; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3675; GFX1032-NEXT:    buffer_gl0_inv
3676; GFX1032-NEXT:  BB18_2:
3677; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3678; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3679; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
3680; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
3681; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
3682; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
3683; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
3684; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
3685; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
3686; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3687; GFX1032-NEXT:    s_mov_b32 s2, -1
3688; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3689; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3690; GFX1032-NEXT:    s_endpgm
3691entry:
3692  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
3693  store i64 %old, i64 addrspace(1)* %out
3694  ret void
3695}
3696
3697define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
3698; Signed 32-bit min of the workitem id on @local_var32 (addrspace 3). The GFX7LESS checks keep a plain per-lane
3699; ds_min_rtn_i32; the other runs expect a v_min_i32_dpp scan (inactive lanes set to 0x7fffffff) and a single-lane atomic.
3700; GFX7LESS-LABEL: min_i32_varying:
3701; GFX7LESS:       ; %bb.0: ; %entry
3702; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3703; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
3704; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3705; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3706; GFX7LESS-NEXT:    ds_min_rtn_i32 v0, v1, v0
3707; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3708; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3709; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3710; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3711; GFX7LESS-NEXT:    s_endpgm
3712;
3713; GFX8-LABEL: min_i32_varying:
3714; GFX8:       ; %bb.0: ; %entry
3715; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3716; GFX8-NEXT:    v_mov_b32_e32 v2, v0
3717; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3718; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3719; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3720; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
3721; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3722; GFX8-NEXT:    s_not_b64 exec, exec
3723; GFX8-NEXT:    v_mov_b32_e32 v2, v1
3724; GFX8-NEXT:    s_not_b64 exec, exec
3725; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
3726; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3727; GFX8-NEXT:    s_nop 1
3728; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3729; GFX8-NEXT:    s_nop 1
3730; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3731; GFX8-NEXT:    s_nop 1
3732; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3733; GFX8-NEXT:    s_nop 1
3734; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3735; GFX8-NEXT:    s_nop 1
3736; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3737; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
3738; GFX8-NEXT:    s_nop 0
3739; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3740; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
3741; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3742; GFX8-NEXT:    ; implicit-def: $vgpr0
3743; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3744; GFX8-NEXT:    s_cbranch_execz BB19_2
3745; GFX8-NEXT:  ; %bb.1:
3746; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3747; GFX8-NEXT:    v_mov_b32_e32 v3, s4
3748; GFX8-NEXT:    s_mov_b32 m0, -1
3749; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
3751; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3752; GFX8-NEXT:  BB19_2:
3753; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3754; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3755; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
3756; GFX8-NEXT:    v_mov_b32_e32 v0, v1
3757; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
3758; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3759; GFX8-NEXT:    s_mov_b32 s2, -1
3760; GFX8-NEXT:    s_nop 0
3761; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3762; GFX8-NEXT:    s_endpgm
3763;
3764; GFX9-LABEL: min_i32_varying:
3765; GFX9:       ; %bb.0: ; %entry
3766; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3767; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3768; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3769; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3770; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3771; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
3772; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3773; GFX9-NEXT:    s_not_b64 exec, exec
3774; GFX9-NEXT:    v_mov_b32_e32 v2, v1
3775; GFX9-NEXT:    s_not_b64 exec, exec
3776; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
3777; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3778; GFX9-NEXT:    s_nop 1
3779; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3780; GFX9-NEXT:    s_nop 1
3781; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3782; GFX9-NEXT:    s_nop 1
3783; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3784; GFX9-NEXT:    s_nop 1
3785; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3786; GFX9-NEXT:    s_nop 1
3787; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3788; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
3789; GFX9-NEXT:    s_nop 0
3790; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
3791; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
3792; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3793; GFX9-NEXT:    ; implicit-def: $vgpr0
3794; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3795; GFX9-NEXT:    s_cbranch_execz BB19_2
3796; GFX9-NEXT:  ; %bb.1:
3797; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
3798; GFX9-NEXT:    v_mov_b32_e32 v3, s4
3799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3800; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
3801; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3802; GFX9-NEXT:  BB19_2:
3803; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
3804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3805; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
3806; GFX9-NEXT:    v_mov_b32_e32 v0, v1
3807; GFX9-NEXT:    v_min_i32_e32 v0, s2, v0
3808; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3809; GFX9-NEXT:    s_mov_b32 s2, -1
3810; GFX9-NEXT:    s_nop 0
3811; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3812; GFX9-NEXT:    s_endpgm
3813;
3814; GFX1064-LABEL: min_i32_varying:
3815; GFX1064:       ; %bb.0: ; %entry
3816; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
3817; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3818; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
3819; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3820; GFX1064-NEXT:    s_not_b64 exec, exec
3821; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
3822; GFX1064-NEXT:    s_not_b64 exec, exec
3823; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3824; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3825; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3826; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3827; GFX1064-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3828; GFX1064-NEXT:    v_mov_b32_e32 v3, v2
3829; GFX1064-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3830; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3831; GFX1064-NEXT:    v_readlane_b32 s4, v2, 31
3832; GFX1064-NEXT:    v_mov_b32_e32 v3, s4
3833; GFX1064-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3834; GFX1064-NEXT:    v_readlane_b32 s4, v2, 15
3835; GFX1064-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3836; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3837; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3838; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3839; GFX1064-NEXT:    v_readlane_b32 s5, v2, 31
3840; GFX1064-NEXT:    v_writelane_b32 v1, s4, 16
3841; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3842; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3843; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
3844; GFX1064-NEXT:    v_readlane_b32 s7, v2, 63
3845; GFX1064-NEXT:    v_readlane_b32 s6, v2, 47
3846; GFX1064-NEXT:    v_writelane_b32 v1, s5, 32
3847; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
3848; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
3849; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
3850; GFX1064-NEXT:    v_writelane_b32 v1, s6, 48
3851; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
3852; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3853; GFX1064-NEXT:    s_mov_b32 s2, -1
3854; GFX1064-NEXT:    ; implicit-def: $vgpr0
3855; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3856; GFX1064-NEXT:    s_cbranch_execz BB19_2
3857; GFX1064-NEXT:  ; %bb.1:
3858; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3859; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
3860; GFX1064-NEXT:    s_mov_b32 s3, s7
3861; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3862; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
3863; GFX1064-NEXT:    ds_min_rtn_i32 v0, v7, v4
3864; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3865; GFX1064-NEXT:    buffer_gl0_inv
3866; GFX1064-NEXT:  BB19_2:
3867; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3868; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3869; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
3870; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
3871; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
3872; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3873; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3874; GFX1064-NEXT:    s_nop 0
3875; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3876; GFX1064-NEXT:    s_endpgm
3877;
3878; GFX1032-LABEL: min_i32_varying:
3879; GFX1032:       ; %bb.0: ; %entry
3880; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
3881; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3882; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
3883; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3884; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3885; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
3886; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
3887; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3888; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3889; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3890; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3891; GFX1032-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3892; GFX1032-NEXT:    v_mov_b32_e32 v3, v2
3893; GFX1032-NEXT:    v_permlanex16_b32 v3, v3, -1, -1
3894; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3895; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3896; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3897; GFX1032-NEXT:    v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3898; GFX1032-NEXT:    v_readlane_b32 s3, v2, 15
3899; GFX1032-NEXT:    v_readlane_b32 s4, v2, 31
3900; GFX1032-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3901; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3902; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3903; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
3904; GFX1032-NEXT:    v_writelane_b32 v1, s3, 16
3905; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
3906; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3907; GFX1032-NEXT:    s_mov_b32 s2, -1
3908; GFX1032-NEXT:    ; implicit-def: $vgpr0
3909; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
3910; GFX1032-NEXT:    s_cbranch_execz BB19_2
3911; GFX1032-NEXT:  ; %bb.1:
3912; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
3913; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
3914; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3915; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
3916; GFX1032-NEXT:    ds_min_rtn_i32 v0, v7, v4
3917; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3918; GFX1032-NEXT:    buffer_gl0_inv
3919; GFX1032-NEXT:  BB19_2:
3920; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3921; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
3922; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
3923; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
3924; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
3925; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3926; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3927; GFX1032-NEXT:    s_nop 0
3928; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3929; GFX1032-NEXT:    s_endpgm
3930entry:
3931  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3932  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
3933  store i32 %old, i32 addrspace(1)* %out
3934  ret void
3935}
3936
3937define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
3938; Uniform signed 64-bit min of the constant 5 on @local_var64 (addrspace 3). Every run expects one lane (mbcnt == 0) to
3939; issue ds_min_rtn_i64, then readfirstlane and a per-lane min against 5 (INT64_MAX in the issuing lane) before the store.
3940; GFX7LESS-LABEL: min_i64_constant:
3941; GFX7LESS:       ; %bb.0: ; %entry
3942; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3943; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3944; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3945; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3946; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
3947; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3948; GFX7LESS-NEXT:    s_cbranch_execz BB20_2
3949; GFX7LESS-NEXT:  ; %bb.1:
3950; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3951; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
3952; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3953; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3954; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3955; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3956; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3957; GFX7LESS-NEXT:  BB20_2:
3958; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
3959; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3960; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
3961; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
3962; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
3963; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3964; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
3965; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
3966; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
3967; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s4
3968; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
3969; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3970; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3971; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3972; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3973; GFX7LESS-NEXT:    s_endpgm
3974;
3975; GFX8-LABEL: min_i64_constant:
3976; GFX8:       ; %bb.0: ; %entry
3977; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3978; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3979; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3980; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3981; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3982; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3983; GFX8-NEXT:    s_cbranch_execz BB20_2
3984; GFX8-NEXT:  ; %bb.1:
3985; GFX8-NEXT:    v_mov_b32_e32 v0, 5
3986; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
3987; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3988; GFX8-NEXT:    s_mov_b32 m0, -1
3989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3990; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
3991; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3992; GFX8-NEXT:  BB20_2:
3993; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
3994; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3995; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3996; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
3997; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3998; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
3999; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4000; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4001; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4002; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4003; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4004; GFX8-NEXT:    s_mov_b32 s2, -1
4005; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4006; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4007; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4008; GFX8-NEXT:    s_endpgm
4009;
4010; GFX9-LABEL: min_i64_constant:
4011; GFX9:       ; %bb.0: ; %entry
4012; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4013; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4014; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4015; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4016; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4017; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4018; GFX9-NEXT:    s_cbranch_execz BB20_2
4019; GFX9-NEXT:  ; %bb.1:
4020; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4021; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4022; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4023; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4024; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4025; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4026; GFX9-NEXT:  BB20_2:
4027; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4028; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4029; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4030; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
4031; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4032; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
4033; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4034; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4035; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4036; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4037; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4038; GFX9-NEXT:    s_mov_b32 s2, -1
4039; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4040; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4041; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4042; GFX9-NEXT:    s_endpgm
4043;
4044; GFX1064-LABEL: min_i64_constant:
4045; GFX1064:       ; %bb.0: ; %entry
4046; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4047; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4048; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4049; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4050; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4051; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4052; GFX1064-NEXT:    s_cbranch_execz BB20_2
4053; GFX1064-NEXT:  ; %bb.1:
4054; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4055; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4056; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4057; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4058; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4059; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4060; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4061; GFX1064-NEXT:    buffer_gl0_inv
4062; GFX1064-NEXT:  BB20_2:
4063; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4064; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4065; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4066; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4067; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
4068; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4069; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
4070; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4071; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4072; GFX1064-NEXT:    s_mov_b32 s2, -1
4073; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4074; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4075; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4076; GFX1064-NEXT:    s_endpgm
4077;
4078; GFX1032-LABEL: min_i64_constant:
4079; GFX1032:       ; %bb.0: ; %entry
4080; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4081; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4082; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4083; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4084; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4085; GFX1032-NEXT:    s_cbranch_execz BB20_2
4086; GFX1032-NEXT:  ; %bb.1:
4087; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4088; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4089; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4090; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4091; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4092; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
4093; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4094; GFX1032-NEXT:    buffer_gl0_inv
4095; GFX1032-NEXT:  BB20_2:
4096; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4097; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4098; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4099; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4100; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
4101; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4102; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
4103; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4104; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4105; GFX1032-NEXT:    s_mov_b32 s2, -1
4106; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4107; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4108; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4109; GFX1032-NEXT:    s_endpgm
4110entry:
4111  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
4112  store i64 %old, i64 addrspace(1)* %out
4113  ret void
4114}
4115
4116define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
4117; Unsigned-max atomicrmw (acq_rel) on the addrspace(3) global local_var32 with a per-lane operand taken from workitem.id.x. The returned old value is stored to %out.
4118; The first (no -mcpu) run expects a plain ds_max_rtn_u32. The remaining runs expect a v_max_u32_dpp lane reduction (with 0 written under the inverted exec mask) and then one ds_max_rtn_u32 under an s_and_saveexec guard.
4119; GFX7LESS-LABEL: umax_i32_varying:
4120; GFX7LESS:       ; %bb.0: ; %entry
4121; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4122; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4123; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4124; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4125; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
4126; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4127; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4128; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4129; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4130; GFX7LESS-NEXT:    s_endpgm
4131;
4132; GFX8-LABEL: umax_i32_varying:
4133; GFX8:       ; %bb.0: ; %entry
4134; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4135; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4136; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4137; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4138; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4139; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4140; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4141; GFX8-NEXT:    s_not_b64 exec, exec
4142; GFX8-NEXT:    v_mov_b32_e32 v2, 0
4143; GFX8-NEXT:    s_not_b64 exec, exec
4144; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4145; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4146; GFX8-NEXT:    s_nop 1
4147; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4148; GFX8-NEXT:    s_nop 1
4149; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4150; GFX8-NEXT:    s_nop 1
4151; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4152; GFX8-NEXT:    s_nop 1
4153; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4154; GFX8-NEXT:    s_nop 1
4155; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4156; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4157; GFX8-NEXT:    s_nop 0
4158; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4159; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4160; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4161; GFX8-NEXT:    ; implicit-def: $vgpr0
4162; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4163; GFX8-NEXT:    s_cbranch_execz BB21_2
4164; GFX8-NEXT:  ; %bb.1:
4165; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4166; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4167; GFX8-NEXT:    s_mov_b32 m0, -1
4168; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4169; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
4170; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4171; GFX8-NEXT:  BB21_2:
4172; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4173; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4174; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4175; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4176; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
4177; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4178; GFX8-NEXT:    s_mov_b32 s2, -1
4179; GFX8-NEXT:    s_nop 0
4180; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4181; GFX8-NEXT:    s_endpgm
4182;
4183; GFX9-LABEL: umax_i32_varying:
4184; GFX9:       ; %bb.0: ; %entry
4185; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4186; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4187; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4188; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4189; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4190; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4191; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4192; GFX9-NEXT:    s_not_b64 exec, exec
4193; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4194; GFX9-NEXT:    s_not_b64 exec, exec
4195; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4196; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4197; GFX9-NEXT:    s_nop 1
4198; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4199; GFX9-NEXT:    s_nop 1
4200; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4201; GFX9-NEXT:    s_nop 1
4202; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4203; GFX9-NEXT:    s_nop 1
4204; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4205; GFX9-NEXT:    s_nop 1
4206; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4207; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4208; GFX9-NEXT:    s_nop 0
4209; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4210; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4211; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4212; GFX9-NEXT:    ; implicit-def: $vgpr0
4213; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4214; GFX9-NEXT:    s_cbranch_execz BB21_2
4215; GFX9-NEXT:  ; %bb.1:
4216; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4217; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4218; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4219; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
4220; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4221; GFX9-NEXT:  BB21_2:
4222; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4223; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4224; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4225; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4226; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
4227; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4228; GFX9-NEXT:    s_mov_b32 s2, -1
4229; GFX9-NEXT:    s_nop 0
4230; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4231; GFX9-NEXT:    s_endpgm
4232;
4233; GFX1064-LABEL: umax_i32_varying:
4234; GFX1064:       ; %bb.0: ; %entry
4235; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4236; GFX1064-NEXT:    s_not_b64 exec, exec
4237; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4238; GFX1064-NEXT:    s_not_b64 exec, exec
4239; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4240; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4241; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
4242; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4243; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4244; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4245; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4246; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4247; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4248; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4249; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4250; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4251; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4252; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4253; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4254; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4255; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4256; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4257; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4258; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4259; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4260; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4261; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4262; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4263; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4264; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4265; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4266; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4267; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4268; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4269; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4270; GFX1064-NEXT:    s_mov_b32 s2, -1
4271; GFX1064-NEXT:    ; implicit-def: $vgpr0
4272; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4273; GFX1064-NEXT:    s_cbranch_execz BB21_2
4274; GFX1064-NEXT:  ; %bb.1:
4275; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4276; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4277; GFX1064-NEXT:    s_mov_b32 s3, s7
4278; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4279; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4280; GFX1064-NEXT:    ds_max_rtn_u32 v0, v7, v4
4281; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4282; GFX1064-NEXT:    buffer_gl0_inv
4283; GFX1064-NEXT:  BB21_2:
4284; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4285; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4286; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4287; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4288; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
4289; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4290; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4291; GFX1064-NEXT:    s_nop 0
4292; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4293; GFX1064-NEXT:    s_endpgm
4294;
4295; GFX1032-LABEL: umax_i32_varying:
4296; GFX1032:       ; %bb.0: ; %entry
4297; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4298; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4299; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4300; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4301; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4302; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
4303; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
4304; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
4305; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
4306; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4307; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4308; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4309; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4310; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4311; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4312; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
4313; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4314; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4315; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4316; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4317; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4318; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4319; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4320; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4321; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4322; GFX1032-NEXT:    s_mov_b32 s2, -1
4323; GFX1032-NEXT:    ; implicit-def: $vgpr0
4324; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4325; GFX1032-NEXT:    s_cbranch_execz BB21_2
4326; GFX1032-NEXT:  ; %bb.1:
4327; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4328; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4329; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4330; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4331; GFX1032-NEXT:    ds_max_rtn_u32 v0, v7, v4
4332; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX1032-NEXT:    buffer_gl0_inv
4334; GFX1032-NEXT:  BB21_2:
4335; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4336; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4337; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4338; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4339; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
4340; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4341; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4342; GFX1032-NEXT:    s_nop 0
4343; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4344; GFX1032-NEXT:    s_endpgm
4345entry:
4346  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4347  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4348  store i32 %old, i32 addrspace(1)* %out
4349  ret void
4350}
4351
4352define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
4353; Unsigned-max atomicrmw (acq_rel) on the addrspace(3) global local_var64 with the constant operand 5. The returned old value is stored to %out.
4354; Every run expects one ds_max_rtn_u64 issued under a v_mbcnt / v_cmp_eq / s_and_saveexec guard, then v_readfirstlane plus v_cmp_gt_u64 and v_cndmask to form the stored value.
4355; GFX7LESS-LABEL: umax_i64_constant:
4356; GFX7LESS:       ; %bb.0: ; %entry
4357; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4358; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4359; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4360; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4361; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4362; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4363; GFX7LESS-NEXT:    s_cbranch_execz BB22_2
4364; GFX7LESS-NEXT:  ; %bb.1:
4365; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4366; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4367; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4368; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4369; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4370; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4371; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4372; GFX7LESS-NEXT:  BB22_2:
4373; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4374; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4375; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4376; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4377; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4378; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4379; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4380; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4381; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
4382; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4383; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
4384; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4385; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4386; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4387; GFX7LESS-NEXT:    s_endpgm
4388;
4389; GFX8-LABEL: umax_i64_constant:
4390; GFX8:       ; %bb.0: ; %entry
4391; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4392; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4393; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4394; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4395; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4396; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4397; GFX8-NEXT:    s_cbranch_execz BB22_2
4398; GFX8-NEXT:  ; %bb.1:
4399; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4400; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4401; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4402; GFX8-NEXT:    s_mov_b32 m0, -1
4403; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4404; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4405; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4406; GFX8-NEXT:  BB22_2:
4407; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4408; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4409; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4410; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4411; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4412; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4413; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4414; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4415; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4416; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4417; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4418; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4419; GFX8-NEXT:    s_mov_b32 s2, -1
4420; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4421; GFX8-NEXT:    s_endpgm
4422;
4423; GFX9-LABEL: umax_i64_constant:
4424; GFX9:       ; %bb.0: ; %entry
4425; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4426; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4427; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4428; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4429; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4430; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4431; GFX9-NEXT:    s_cbranch_execz BB22_2
4432; GFX9-NEXT:  ; %bb.1:
4433; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4434; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4435; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4437; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4438; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4439; GFX9-NEXT:  BB22_2:
4440; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4441; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4442; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4443; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4444; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4445; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4446; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4447; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4448; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4449; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4450; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
4451; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4452; GFX9-NEXT:    s_mov_b32 s2, -1
4453; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4454; GFX9-NEXT:    s_endpgm
4455;
4456; GFX1064-LABEL: umax_i64_constant:
4457; GFX1064:       ; %bb.0: ; %entry
4458; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4459; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4460; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4461; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4462; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4463; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4464; GFX1064-NEXT:    s_cbranch_execz BB22_2
4465; GFX1064-NEXT:  ; %bb.1:
4466; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4467; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4468; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4469; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4470; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4471; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4472; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4473; GFX1064-NEXT:    buffer_gl0_inv
4474; GFX1064-NEXT:  BB22_2:
4475; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4476; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4477; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4478; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4479; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4480; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
4481; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
4482; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4483; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
4484; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4485; GFX1064-NEXT:    s_mov_b32 s2, -1
4486; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4487; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4488; GFX1064-NEXT:    s_endpgm
4489;
4490; GFX1032-LABEL: umax_i64_constant:
4491; GFX1032:       ; %bb.0: ; %entry
4492; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4493; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4494; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4495; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4496; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4497; GFX1032-NEXT:    s_cbranch_execz BB22_2
4498; GFX1032-NEXT:  ; %bb.1:
4499; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4500; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4501; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4502; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4503; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4504; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
4505; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4506; GFX1032-NEXT:    buffer_gl0_inv
4507; GFX1032-NEXT:  BB22_2:
4508; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4509; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4510; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4511; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4512; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4513; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
4514; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
4515; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4516; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
4517; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4518; GFX1032-NEXT:    s_mov_b32 s2, -1
4519; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4520; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4521; GFX1032-NEXT:    s_endpgm
4522entry:
4523  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
4524  store i64 %old, i64 addrspace(1)* %out
4525  ret void
4526}
4527
4528define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
4529; Unsigned-min atomicrmw (acq_rel) on the addrspace(3) global local_var32 with a per-lane operand taken from workitem.id.x. The returned old value is stored to %out.
4530; The first (no -mcpu) run expects a plain ds_min_rtn_u32. The remaining runs expect a v_min_u32_dpp lane reduction (with -1 written under the inverted exec mask) and then one ds_min_rtn_u32 under an s_and_saveexec guard.
4531; GFX7LESS-LABEL: umin_i32_varying:
4532; GFX7LESS:       ; %bb.0: ; %entry
4533; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4534; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
4535; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4536; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4537; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
4538; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4539; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4540; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4541; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4542; GFX7LESS-NEXT:    s_endpgm
4543;
4544; GFX8-LABEL: umin_i32_varying:
4545; GFX8:       ; %bb.0: ; %entry
4546; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4547; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4548; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4549; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4550; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4551; GFX8-NEXT:    v_mov_b32_e32 v1, -1
4552; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4553; GFX8-NEXT:    s_not_b64 exec, exec
4554; GFX8-NEXT:    v_mov_b32_e32 v2, -1
4555; GFX8-NEXT:    s_not_b64 exec, exec
4556; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
4557; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4558; GFX8-NEXT:    s_nop 1
4559; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4560; GFX8-NEXT:    s_nop 1
4561; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4562; GFX8-NEXT:    s_nop 1
4563; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4564; GFX8-NEXT:    s_nop 1
4565; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4566; GFX8-NEXT:    s_nop 1
4567; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4568; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
4569; GFX8-NEXT:    s_nop 0
4570; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4571; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
4572; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4573; GFX8-NEXT:    ; implicit-def: $vgpr0
4574; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4575; GFX8-NEXT:    s_cbranch_execz BB23_2
4576; GFX8-NEXT:  ; %bb.1:
4577; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4578; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4579; GFX8-NEXT:    s_mov_b32 m0, -1
4580; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4581; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
4582; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4583; GFX8-NEXT:  BB23_2:
4584; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4585; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4586; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4587; GFX8-NEXT:    v_mov_b32_e32 v0, v1
4588; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
4589; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4590; GFX8-NEXT:    s_mov_b32 s2, -1
4591; GFX8-NEXT:    s_nop 0
4592; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4593; GFX8-NEXT:    s_endpgm
4594;
4595; GFX9-LABEL: umin_i32_varying:
4596; GFX9:       ; %bb.0: ; %entry
4597; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4598; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4599; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4600; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4601; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4602; GFX9-NEXT:    v_mov_b32_e32 v1, -1
4603; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4604; GFX9-NEXT:    s_not_b64 exec, exec
4605; GFX9-NEXT:    v_mov_b32_e32 v2, -1
4606; GFX9-NEXT:    s_not_b64 exec, exec
4607; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
4608; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
4609; GFX9-NEXT:    s_nop 1
4610; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
4611; GFX9-NEXT:    s_nop 1
4612; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
4613; GFX9-NEXT:    s_nop 1
4614; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
4615; GFX9-NEXT:    s_nop 1
4616; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4617; GFX9-NEXT:    s_nop 1
4618; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4619; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
4620; GFX9-NEXT:    s_nop 0
4621; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4622; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
4623; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4624; GFX9-NEXT:    ; implicit-def: $vgpr0
4625; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4626; GFX9-NEXT:    s_cbranch_execz BB23_2
4627; GFX9-NEXT:  ; %bb.1:
4628; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
4629; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4630; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4631; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
4632; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4633; GFX9-NEXT:  BB23_2:
4634; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4636; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4637; GFX9-NEXT:    v_mov_b32_e32 v0, v1
4638; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
4639; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4640; GFX9-NEXT:    s_mov_b32 s2, -1
4641; GFX9-NEXT:    s_nop 0
4642; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4643; GFX9-NEXT:    s_endpgm
4644;
4645; GFX1064-LABEL: umin_i32_varying:
4646; GFX1064:       ; %bb.0: ; %entry
4647; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
4648; GFX1064-NEXT:    s_not_b64 exec, exec
4649; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
4650; GFX1064-NEXT:    s_not_b64 exec, exec
4651; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4652; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4653; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
4654; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4655; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4656; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4657; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
4658; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4659; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4660; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
4661; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
4662; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4663; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
4664; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4665; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4666; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4667; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4668; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
4669; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
4670; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4671; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4672; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
4673; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
4674; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
4675; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
4676; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
4677; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4678; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
4679; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
4680; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
4681; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4682; GFX1064-NEXT:    s_mov_b32 s2, -1
4683; GFX1064-NEXT:    ; implicit-def: $vgpr0
4684; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4685; GFX1064-NEXT:    s_cbranch_execz BB23_2
4686; GFX1064-NEXT:  ; %bb.1:
4687; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4688; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
4689; GFX1064-NEXT:    s_mov_b32 s3, s7
4690; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4691; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4692; GFX1064-NEXT:    ds_min_rtn_u32 v0, v7, v4
4693; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4694; GFX1064-NEXT:    buffer_gl0_inv
4695; GFX1064-NEXT:  BB23_2:
4696; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4697; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4698; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
4699; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
4700; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
4701; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4702; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4703; GFX1064-NEXT:    s_nop 0
4704; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4705; GFX1064-NEXT:    s_endpgm
4706;
4707; GFX1032-LABEL: umin_i32_varying:
4708; GFX1032:       ; %bb.0: ; %entry
4709; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
4710; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4711; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
4712; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
4713; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4714; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4715; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
4716; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
4717; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
4718; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
4719; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
4720; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4721; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4722; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4723; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4724; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
4725; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
4726; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
4727; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4728; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4729; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4730; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
4731; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
4732; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
4733; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4734; GFX1032-NEXT:    s_mov_b32 s2, -1
4735; GFX1032-NEXT:    ; implicit-def: $vgpr0
4736; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
4737; GFX1032-NEXT:    s_cbranch_execz BB23_2
4738; GFX1032-NEXT:  ; %bb.1:
4739; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
4740; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
4741; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4742; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4743; GFX1032-NEXT:    ds_min_rtn_u32 v0, v7, v4
4744; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4745; GFX1032-NEXT:    buffer_gl0_inv
4746; GFX1032-NEXT:  BB23_2:
4747; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4748; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
4749; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
4750; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
4751; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
4752; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4753; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4754; GFX1032-NEXT:    s_nop 0
4755; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4756; GFX1032-NEXT:    s_endpgm
4757entry:
4758  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4759  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
4760  store i32 %old, i32 addrspace(1)* %out
4761  ret void
4762}
4763
4764define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
4765;
4766;
4767; GFX7LESS-LABEL: umin_i64_constant:
4768; GFX7LESS:       ; %bb.0: ; %entry
4769; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4770; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4771; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4772; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4773; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4774; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4775; GFX7LESS-NEXT:    s_cbranch_execz BB24_2
4776; GFX7LESS-NEXT:  ; %bb.1:
4777; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4778; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
4779; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4780; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4781; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4782; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4783; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4784; GFX7LESS-NEXT:  BB24_2:
4785; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
4786; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4787; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
4788; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
4789; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4790; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4791; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4792; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
4793; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4794; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4795; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4796; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4797; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4798; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4799; GFX7LESS-NEXT:    s_endpgm
4800;
4801; GFX8-LABEL: umin_i64_constant:
4802; GFX8:       ; %bb.0: ; %entry
4803; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4804; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4805; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4806; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4807; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4808; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4809; GFX8-NEXT:    s_cbranch_execz BB24_2
4810; GFX8-NEXT:  ; %bb.1:
4811; GFX8-NEXT:    v_mov_b32_e32 v0, 5
4812; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4813; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4814; GFX8-NEXT:    s_mov_b32 m0, -1
4815; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4816; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4817; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4818; GFX8-NEXT:  BB24_2:
4819; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
4820; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4821; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
4822; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
4823; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4824; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4825; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4826; GFX8-NEXT:    v_mov_b32_e32 v2, s5
4827; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4828; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4829; GFX8-NEXT:    s_mov_b32 s2, -1
4830; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4831; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4832; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4833; GFX8-NEXT:    s_endpgm
4834;
4835; GFX9-LABEL: umin_i64_constant:
4836; GFX9:       ; %bb.0: ; %entry
4837; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4838; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4839; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4840; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4841; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4842; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4843; GFX9-NEXT:    s_cbranch_execz BB24_2
4844; GFX9-NEXT:  ; %bb.1:
4845; GFX9-NEXT:    v_mov_b32_e32 v0, 5
4846; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4847; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4848; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4849; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4850; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4851; GFX9-NEXT:  BB24_2:
4852; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
4853; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4854; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
4855; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
4856; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4857; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4858; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
4859; GFX9-NEXT:    v_mov_b32_e32 v2, s5
4860; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4861; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4862; GFX9-NEXT:    s_mov_b32 s2, -1
4863; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4864; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4865; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4866; GFX9-NEXT:    s_endpgm
4867;
4868; GFX1064-LABEL: umin_i64_constant:
4869; GFX1064:       ; %bb.0: ; %entry
4870; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4871; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4872; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
4873; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4874; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4875; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4876; GFX1064-NEXT:    s_cbranch_execz BB24_2
4877; GFX1064-NEXT:  ; %bb.1:
4878; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
4879; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4880; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
4881; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4882; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
4883; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4884; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4885; GFX1064-NEXT:    buffer_gl0_inv
4886; GFX1064-NEXT:  BB24_2:
4887; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4888; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
4889; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
4890; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
4891; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
4892; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
4893; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
4894; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
4895; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
4896; GFX1064-NEXT:    s_mov_b32 s2, -1
4897; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4898; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4899; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4900; GFX1064-NEXT:    s_endpgm
4901;
4902; GFX1032-LABEL: umin_i64_constant:
4903; GFX1032:       ; %bb.0: ; %entry
4904; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4905; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4906; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4907; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
4908; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
4909; GFX1032-NEXT:    s_cbranch_execz BB24_2
4910; GFX1032-NEXT:  ; %bb.1:
4911; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
4912; GFX1032-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
4913; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
4914; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4915; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
4916; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
4917; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4918; GFX1032-NEXT:    buffer_gl0_inv
4919; GFX1032-NEXT:  BB24_2:
4920; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4921; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
4922; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
4923; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
4924; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4925; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
4926; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
4927; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
4928; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
4929; GFX1032-NEXT:    s_mov_b32 s2, -1
4930; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4931; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4932; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4933; GFX1032-NEXT:    s_endpgm
4934entry:
4935  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
4936  store i64 %old, i64 addrspace(1)* %out
4937  ret void
4938}
4939