1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11
8
9declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
10declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
11declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
12
13declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
14declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
15declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
16
17declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
18declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
19declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
20
21declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
22
; Test s_ctlz_i32 -- llvm.ctlz.i32 applied directly to the i32 kernel argument
; %val, with the result stored to %out. The second intrinsic operand is
; `i1 false`, so a zero input must still produce a defined result; every
; expected sequence below follows the count instruction with a min or a
; conditional select against the constant 32.
23define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
24; SI-LABEL: s_ctlz_i32:
25; SI:       ; %bb.0:
26; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
27; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
28; SI-NEXT:    s_mov_b32 s3, 0xf000
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    s_flbit_i32_b32 s2, s2
31; SI-NEXT:    s_min_u32 s4, s2, 32
32; SI-NEXT:    s_mov_b32 s2, -1
33; SI-NEXT:    v_mov_b32_e32 v0, s4
34; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
35; SI-NEXT:    s_endpgm
36;
37; VI-LABEL: s_ctlz_i32:
38; VI:       ; %bb.0:
39; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
40; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; VI-NEXT:    s_mov_b32 s3, 0xf000
42; VI-NEXT:    s_mov_b32 s2, -1
43; VI-NEXT:    s_waitcnt lgkmcnt(0)
44; VI-NEXT:    s_flbit_i32_b32 s4, s4
45; VI-NEXT:    s_min_u32 s4, s4, 32
46; VI-NEXT:    v_mov_b32_e32 v0, s4
47; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
48; VI-NEXT:    s_endpgm
49;
50; EG-LABEL: s_ctlz_i32:
51; EG:       ; %bb.0:
52; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
53; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
54; EG-NEXT:    CF_END
55; EG-NEXT:    PAD
56; EG-NEXT:    ALU clause starting at 4:
57; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
58; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
59; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
60; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
61;
62; GFX10-LABEL: s_ctlz_i32:
63; GFX10:       ; %bb.0:
64; GFX10-NEXT:    s_clause 0x1
65; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
66; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
67; GFX10-NEXT:    v_mov_b32_e32 v0, 0
68; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX10-NEXT:    s_flbit_i32_b32 s0, s4
70; GFX10-NEXT:    s_min_u32 s0, s0, 32
71; GFX10-NEXT:    v_mov_b32_e32 v1, s0
72; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
73; GFX10-NEXT:    s_endpgm
74;
75; GFX10-GISEL-LABEL: s_ctlz_i32:
76; GFX10-GISEL:       ; %bb.0:
77; GFX10-GISEL-NEXT:    s_clause 0x1
78; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
79; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
80; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-GISEL-NEXT:    s_flbit_i32_b32 s0, s4
83; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
84; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
85; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
86; GFX10-GISEL-NEXT:    s_endpgm
87;
88; GFX11-LABEL: s_ctlz_i32:
89; GFX11:       ; %bb.0:
90; GFX11-NEXT:    s_clause 0x1
91; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
92; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
93; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX11-NEXT:    s_clz_i32_u32 s2, s2
95; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
96; GFX11-NEXT:    s_min_u32 s2, s2, 32
97; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
98; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
99; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
100; GFX11-NEXT:    s_endpgm
101  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
102  store i32 %ctlz, i32 addrspace(1)* %out, align 4
103  ret void
104}
105
; Test v_ctlz_i32 -- llvm.ctlz.i32 on an i32 loaded from %valptr at the index
; returned by llvm.amdgcn.workitem.id.x. The result is stored to %out itself
; (the store address is not indexed by the workitem id). The `i1 false`
; operand means a zero input must yield a defined result; the expected
; sequences clamp the count with a min or conditional select against 32.
106define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
107; SI-LABEL: v_ctlz_i32:
108; SI:       ; %bb.0:
109; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
110; SI-NEXT:    s_mov_b32 s3, 0xf000
111; SI-NEXT:    s_mov_b32 s6, 0
112; SI-NEXT:    s_mov_b32 s7, s3
113; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
114; SI-NEXT:    v_mov_b32_e32 v1, 0
115; SI-NEXT:    s_waitcnt lgkmcnt(0)
116; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
117; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
118; SI-NEXT:    s_mov_b32 s2, -1
119; SI-NEXT:    s_waitcnt vmcnt(0)
120; SI-NEXT:    v_ffbh_u32_e32 v0, v0
121; SI-NEXT:    v_min_u32_e32 v0, 32, v0
122; SI-NEXT:    s_waitcnt lgkmcnt(0)
123; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
124; SI-NEXT:    s_endpgm
125;
126; VI-LABEL: v_ctlz_i32:
127; VI:       ; %bb.0:
128; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
129; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
130; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
131; VI-NEXT:    s_waitcnt lgkmcnt(0)
132; VI-NEXT:    v_mov_b32_e32 v1, s3
133; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
134; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
135; VI-NEXT:    flat_load_dword v0, v[0:1]
136; VI-NEXT:    s_mov_b32 s3, 0xf000
137; VI-NEXT:    s_mov_b32 s2, -1
138; VI-NEXT:    s_waitcnt vmcnt(0)
139; VI-NEXT:    v_ffbh_u32_e32 v0, v0
140; VI-NEXT:    v_min_u32_e32 v0, 32, v0
141; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
142; VI-NEXT:    s_endpgm
143;
144; EG-LABEL: v_ctlz_i32:
145; EG:       ; %bb.0:
146; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
147; EG-NEXT:    TEX 0 @6
148; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
149; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
150; EG-NEXT:    CF_END
151; EG-NEXT:    PAD
152; EG-NEXT:    Fetch clause starting at 6:
153; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
154; EG-NEXT:    ALU clause starting at 8:
155; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
156; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
157; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
158; EG-NEXT:    ALU clause starting at 11:
159; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
160; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
161; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
162; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
163;
164; GFX10-LABEL: v_ctlz_i32:
165; GFX10:       ; %bb.0:
166; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
167; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
168; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
169; GFX10-NEXT:    v_mov_b32_e32 v1, 0
170; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
172; GFX10-NEXT:    s_waitcnt vmcnt(0)
173; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
174; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
175; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
176; GFX10-NEXT:    s_endpgm
177;
178; GFX10-GISEL-LABEL: v_ctlz_i32:
179; GFX10-GISEL:       ; %bb.0:
180; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
181; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
182; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
183; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
184; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
186; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
187; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
188; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
189; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
190; GFX10-GISEL-NEXT:    s_endpgm
191;
192; GFX11-LABEL: v_ctlz_i32:
193; GFX11:       ; %bb.0:
194; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
195; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
196; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
197; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
199; GFX11-NEXT:    s_waitcnt vmcnt(0)
200; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
201; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
202; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
203; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
204; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
205; GFX11-NEXT:    s_endpgm
206  %tid = call i32 @llvm.amdgcn.workitem.id.x()
207  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
208  %val = load i32, i32 addrspace(1)* %in.gep, align 4
209  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
210  store i32 %ctlz, i32 addrspace(1)* %out, align 4
211  ret void
212}
213
; Test v_ctlz_v2i32 -- llvm.ctlz.v2i32 on a <2 x i32> loaded from %valptr at
; the index returned by llvm.amdgcn.workitem.id.x, with the vector result
; stored to %out (not indexed). The `i1 false` operand means zero elements must
; yield a defined result. The expected sequences contain one count instruction
; and one clamp against 32 per element, followed by a single 64-bit store.
214define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
215; SI-LABEL: v_ctlz_v2i32:
216; SI:       ; %bb.0:
217; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
218; SI-NEXT:    s_mov_b32 s3, 0xf000
219; SI-NEXT:    s_mov_b32 s6, 0
220; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
221; SI-NEXT:    v_mov_b32_e32 v1, 0
222; SI-NEXT:    s_mov_b32 s7, s3
223; SI-NEXT:    s_waitcnt lgkmcnt(0)
224; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
225; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
226; SI-NEXT:    s_mov_b32 s2, -1
227; SI-NEXT:    s_waitcnt vmcnt(0)
228; SI-NEXT:    v_ffbh_u32_e32 v1, v1
229; SI-NEXT:    v_ffbh_u32_e32 v0, v0
230; SI-NEXT:    v_min_u32_e32 v1, 32, v1
231; SI-NEXT:    v_min_u32_e32 v0, 32, v0
232; SI-NEXT:    s_waitcnt lgkmcnt(0)
233; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
234; SI-NEXT:    s_endpgm
235;
236; VI-LABEL: v_ctlz_v2i32:
237; VI:       ; %bb.0:
238; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
239; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
240; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    v_mov_b32_e32 v1, s3
243; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
244; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
245; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
246; VI-NEXT:    s_mov_b32 s3, 0xf000
247; VI-NEXT:    s_mov_b32 s2, -1
248; VI-NEXT:    s_waitcnt vmcnt(0)
249; VI-NEXT:    v_ffbh_u32_e32 v1, v1
250; VI-NEXT:    v_ffbh_u32_e32 v0, v0
251; VI-NEXT:    v_min_u32_e32 v1, 32, v1
252; VI-NEXT:    v_min_u32_e32 v0, 32, v0
253; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
254; VI-NEXT:    s_endpgm
255;
256; EG-LABEL: v_ctlz_v2i32:
257; EG:       ; %bb.0:
258; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
259; EG-NEXT:    TEX 0 @6
260; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
261; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
262; EG-NEXT:    CF_END
263; EG-NEXT:    PAD
264; EG-NEXT:    Fetch clause starting at 6:
265; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
266; EG-NEXT:    ALU clause starting at 8:
267; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
268; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
269; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
270; EG-NEXT:    ALU clause starting at 11:
271; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
272; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
273; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
274; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
275; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
276; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
277; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
278;
279; GFX10-LABEL: v_ctlz_v2i32:
280; GFX10:       ; %bb.0:
281; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
282; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
283; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
284; GFX10-NEXT:    v_mov_b32_e32 v2, 0
285; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
287; GFX10-NEXT:    s_waitcnt vmcnt(0)
288; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
289; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
290; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
291; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
292; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
293; GFX10-NEXT:    s_endpgm
294;
295; GFX10-GISEL-LABEL: v_ctlz_v2i32:
296; GFX10-GISEL:       ; %bb.0:
297; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
298; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
299; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
300; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
301; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
303; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
304; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
305; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
306; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
307; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
308; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
309; GFX10-GISEL-NEXT:    s_endpgm
310;
311; GFX11-LABEL: v_ctlz_v2i32:
312; GFX11:       ; %bb.0:
313; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
314; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
315; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
316; GFX11-NEXT:    v_mov_b32_e32 v2, 0
317; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
319; GFX11-NEXT:    s_waitcnt vmcnt(0)
320; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
321; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
322; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
323; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
324; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
325; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
326; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
327; GFX11-NEXT:    s_endpgm
328  %tid = call i32 @llvm.amdgcn.workitem.id.x()
329  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
330  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
331  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
332  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
333  ret void
334}
335
; Test v_ctlz_v4i32 -- llvm.ctlz.v4i32 on a <4 x i32> loaded from %valptr at
; the index returned by llvm.amdgcn.workitem.id.x, with the vector result
; stored to %out (not indexed). The `i1 false` operand means zero elements must
; yield a defined result. The expected sequences contain one count instruction
; and one clamp against 32 per element, followed by a single 128-bit store.
336define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
337; SI-LABEL: v_ctlz_v4i32:
338; SI:       ; %bb.0:
339; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
340; SI-NEXT:    s_mov_b32 s3, 0xf000
341; SI-NEXT:    s_mov_b32 s6, 0
342; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
343; SI-NEXT:    v_mov_b32_e32 v1, 0
344; SI-NEXT:    s_mov_b32 s7, s3
345; SI-NEXT:    s_waitcnt lgkmcnt(0)
346; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
347; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
348; SI-NEXT:    s_mov_b32 s2, -1
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    v_ffbh_u32_e32 v3, v3
351; SI-NEXT:    v_ffbh_u32_e32 v2, v2
352; SI-NEXT:    v_ffbh_u32_e32 v1, v1
353; SI-NEXT:    v_ffbh_u32_e32 v0, v0
354; SI-NEXT:    v_min_u32_e32 v3, 32, v3
355; SI-NEXT:    v_min_u32_e32 v2, 32, v2
356; SI-NEXT:    v_min_u32_e32 v1, 32, v1
357; SI-NEXT:    v_min_u32_e32 v0, 32, v0
358; SI-NEXT:    s_waitcnt lgkmcnt(0)
359; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
360; SI-NEXT:    s_endpgm
361;
362; VI-LABEL: v_ctlz_v4i32:
363; VI:       ; %bb.0:
364; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
365; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
366; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
367; VI-NEXT:    s_waitcnt lgkmcnt(0)
368; VI-NEXT:    v_mov_b32_e32 v1, s3
369; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
370; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
371; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
372; VI-NEXT:    s_mov_b32 s3, 0xf000
373; VI-NEXT:    s_mov_b32 s2, -1
374; VI-NEXT:    s_waitcnt vmcnt(0)
375; VI-NEXT:    v_ffbh_u32_e32 v3, v3
376; VI-NEXT:    v_ffbh_u32_e32 v2, v2
377; VI-NEXT:    v_ffbh_u32_e32 v1, v1
378; VI-NEXT:    v_ffbh_u32_e32 v0, v0
379; VI-NEXT:    v_min_u32_e32 v3, 32, v3
380; VI-NEXT:    v_min_u32_e32 v2, 32, v2
381; VI-NEXT:    v_min_u32_e32 v1, 32, v1
382; VI-NEXT:    v_min_u32_e32 v0, 32, v0
383; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
384; VI-NEXT:    s_endpgm
385;
386; EG-LABEL: v_ctlz_v4i32:
387; EG:       ; %bb.0:
388; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
389; EG-NEXT:    TEX 0 @6
390; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
391; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
392; EG-NEXT:    CF_END
393; EG-NEXT:    PAD
394; EG-NEXT:    Fetch clause starting at 6:
395; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
396; EG-NEXT:    ALU clause starting at 8:
397; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
398; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
399; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
400; EG-NEXT:    ALU clause starting at 11:
401; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
402; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
403; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
404; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
405; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
406; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
407; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
408; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
409; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
410; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
411; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
412; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
413; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
414;
415; GFX10-LABEL: v_ctlz_v4i32:
416; GFX10:       ; %bb.0:
417; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
418; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
419; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
420; GFX10-NEXT:    v_mov_b32_e32 v4, 0
421; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
423; GFX10-NEXT:    s_waitcnt vmcnt(0)
424; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
425; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
426; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
427; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
428; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
429; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
430; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
431; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
432; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
433; GFX10-NEXT:    s_endpgm
434;
435; GFX10-GISEL-LABEL: v_ctlz_v4i32:
436; GFX10-GISEL:       ; %bb.0:
437; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
438; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
439; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
440; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
441; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
443; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
444; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
445; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
446; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
447; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
448; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
449; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
450; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
451; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
452; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
453; GFX10-GISEL-NEXT:    s_endpgm
454;
455; GFX11-LABEL: v_ctlz_v4i32:
456; GFX11:       ; %bb.0:
457; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
458; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
459; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
460; GFX11-NEXT:    v_mov_b32_e32 v4, 0
461; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
463; GFX11-NEXT:    s_waitcnt vmcnt(0)
464; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v3
465; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
466; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
467; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
468; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
469; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
470; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
471; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
472; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
473; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
474; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
475; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
476; GFX11-NEXT:    s_endpgm
477  %tid = call i32 @llvm.amdgcn.workitem.id.x()
478  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
479  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
480  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
481  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
482  ret void
483}
484
; Test v_ctlz_i8 -- llvm.ctlz.i8 on an i8 loaded straight from %valptr (no
; workitem-id indexing), with the i8 result stored to %out. The `i1 false`
; operand means a zero input must yield a defined result. The expected
; sequences load the byte with an unsigned byte load, run the 32-bit count and
; clamp against 32, then subtract a total of 24 (either as one subtract of 24
; or as separate adds of -16 and -8) before the byte store.
485define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
486; SI-LABEL: v_ctlz_i8:
487; SI:       ; %bb.0:
488; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
489; SI-NEXT:    s_mov_b32 s3, 0xf000
490; SI-NEXT:    s_mov_b32 s2, -1
491; SI-NEXT:    s_mov_b32 s6, s2
492; SI-NEXT:    s_mov_b32 s7, s3
493; SI-NEXT:    s_waitcnt lgkmcnt(0)
494; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
495; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
496; SI-NEXT:    s_waitcnt vmcnt(0)
497; SI-NEXT:    v_ffbh_u32_e32 v0, v0
498; SI-NEXT:    v_min_u32_e32 v0, 32, v0
499; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
500; SI-NEXT:    s_waitcnt lgkmcnt(0)
501; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
502; SI-NEXT:    s_endpgm
503;
504; VI-LABEL: v_ctlz_i8:
505; VI:       ; %bb.0:
506; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
507; VI-NEXT:    s_mov_b32 s3, 0xf000
508; VI-NEXT:    s_mov_b32 s2, -1
509; VI-NEXT:    s_mov_b32 s6, s2
510; VI-NEXT:    s_mov_b32 s7, s3
511; VI-NEXT:    s_waitcnt lgkmcnt(0)
512; VI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
513; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
514; VI-NEXT:    s_waitcnt vmcnt(0)
515; VI-NEXT:    v_ffbh_u32_e32 v0, v0
516; VI-NEXT:    v_min_u32_e32 v0, 32, v0
517; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
518; VI-NEXT:    v_add_u16_e32 v0, -8, v0
519; VI-NEXT:    s_waitcnt lgkmcnt(0)
520; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
521; VI-NEXT:    s_endpgm
522;
523; EG-LABEL: v_ctlz_i8:
524; EG:       ; %bb.0:
525; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
526; EG-NEXT:    TEX 0 @6
527; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
528; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
529; EG-NEXT:    CF_END
530; EG-NEXT:    PAD
531; EG-NEXT:    Fetch clause starting at 6:
532; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
533; EG-NEXT:    ALU clause starting at 8:
534; EG-NEXT:     MOV * T0.X, KC0[2].Z,
535; EG-NEXT:    ALU clause starting at 9:
536; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
537; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
538; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
539; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
540; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
541; EG-NEXT:    -24(nan), 0(0.000000e+00)
542; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
543; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
544; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
545; EG-NEXT:     LSHL T0.X, PV.W, PS,
546; EG-NEXT:     LSHL * T0.W, literal.x, PS,
547; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
548; EG-NEXT:     MOV T0.Y, 0.0,
549; EG-NEXT:     MOV * T0.Z, 0.0,
550; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
551; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
552;
553; GFX10-LABEL: v_ctlz_i8:
554; GFX10:       ; %bb.0:
555; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
556; GFX10-NEXT:    v_mov_b32_e32 v0, 0
557; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
558; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
560; GFX10-NEXT:    s_waitcnt vmcnt(0)
561; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
562; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
563; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
564; GFX10-NEXT:    v_add_nc_u16 v1, v1, -8
565; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
566; GFX10-NEXT:    s_endpgm
567;
568; GFX10-GISEL-LABEL: v_ctlz_i8:
569; GFX10-GISEL:       ; %bb.0:
570; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
571; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
572; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
573; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
575; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
576; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
577; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
578; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
579; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
580; GFX10-GISEL-NEXT:    s_endpgm
581;
582; GFX11-LABEL: v_ctlz_i8:
583; GFX11:       ; %bb.0:
584; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
585; GFX11-NEXT:    v_mov_b32_e32 v0, 0
586; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
587; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
589; GFX11-NEXT:    s_waitcnt vmcnt(0)
590; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
591; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
592; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
593; GFX11-NEXT:    v_add_nc_u32_e32 v1, -16, v1
594; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
595; GFX11-NEXT:    v_add_nc_u16 v1, v1, -8
596; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
597; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
598; GFX11-NEXT:    s_endpgm
599  %val = load i8, i8 addrspace(1)* %valptr
600  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
601  store i8 %ctlz, i8 addrspace(1)* %out
602  ret void
603}
604
; Test s_ctlz_i64 -- llvm.ctlz.i64 applied directly to the i64 kernel argument
; %val, with the full i64 result stored to %out. An unnamed [8 x i32]
; parameter sits between %out and %val in the argument list. The `i1 false`
; operand means a zero input must yield a defined result. Most expected
; sequences build the result from two 32-bit counts (one of them offset by 32)
; combined with a clamp or select; the global-isel run instead expects a single
; 64-bit count (s_flbit_i32_b64) clamped with a min against 64.
605define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
606; SI-LABEL: s_ctlz_i64:
607; SI:       ; %bb.0:
608; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
609; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
610; SI-NEXT:    s_mov_b32 s3, 0xf000
611; SI-NEXT:    s_mov_b32 s2, -1
612; SI-NEXT:    s_waitcnt lgkmcnt(0)
613; SI-NEXT:    s_flbit_i32_b32 s4, s4
614; SI-NEXT:    s_flbit_i32_b32 s5, s5
615; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
616; SI-NEXT:    v_mov_b32_e32 v0, s5
617; SI-NEXT:    s_add_i32 s4, s4, 32
618; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
619; SI-NEXT:    v_mov_b32_e32 v1, 0
620; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
621; SI-NEXT:    s_endpgm
622;
623; VI-LABEL: s_ctlz_i64:
624; VI:       ; %bb.0:
625; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
626; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
627; VI-NEXT:    s_mov_b32 s3, 0xf000
628; VI-NEXT:    s_mov_b32 s2, -1
629; VI-NEXT:    v_mov_b32_e32 v1, 0
630; VI-NEXT:    s_waitcnt lgkmcnt(0)
631; VI-NEXT:    s_flbit_i32_b32 s4, s4
632; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s4, 32 clamp
633; VI-NEXT:    s_flbit_i32_b32 s4, s5
634; VI-NEXT:    v_min3_u32 v0, v0, s4, 64
635; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
636; VI-NEXT:    s_endpgm
637;
638; EG-LABEL: s_ctlz_i64:
639; EG:       ; %bb.0:
640; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
641; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
642; EG-NEXT:    CF_END
643; EG-NEXT:    PAD
644; EG-NEXT:    ALU clause starting at 4:
645; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
646; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
647; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
648; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
649; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
650; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
651; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
652; EG-NEXT:     MOV T0.Y, 0.0,
653; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
654; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
655;
656; GFX10-LABEL: s_ctlz_i64:
657; GFX10:       ; %bb.0:
658; GFX10-NEXT:    s_clause 0x1
659; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
660; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
661; GFX10-NEXT:    v_mov_b32_e32 v1, 0
662; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
663; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
664; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
665; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
666; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
667; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
668; GFX10-NEXT:    s_endpgm
669;
670; GFX10-GISEL-LABEL: s_ctlz_i64:
671; GFX10-GISEL:       ; %bb.0:
672; GFX10-GISEL-NEXT:    s_clause 0x1
673; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
674; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
675; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
676; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
678; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
679; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
680; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
681; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
682; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
683; GFX10-GISEL-NEXT:    s_endpgm
684;
685; GFX11-LABEL: s_ctlz_i64:
686; GFX11:       ; %bb.0:
687; GFX11-NEXT:    s_clause 0x1
688; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x4c
689; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
690; GFX11-NEXT:    v_mov_b32_e32 v1, 0
691; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX11-NEXT:    s_clz_i32_u32 s2, s2
693; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
694; GFX11-NEXT:    v_add_nc_u32_e64 v0, s2, 32 clamp
695; GFX11-NEXT:    s_clz_i32_u32 s2, s3
696; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
697; GFX11-NEXT:    v_min3_u32 v0, v0, s2, 64
698; GFX11-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
699; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
700; GFX11-NEXT:    s_endpgm
701  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
702  store i64 %ctlz, i64 addrspace(1)* %out
703  ret void
704}
705
; s_ctlz_i64_trunc - the i64 operand %val is passed directly as a kernel
; argument. The kernel calls llvm.ctlz.i64 on it with the i1 flag operand set
; to false, truncates the i64 result to i32 and stores that i32 to %out.
;
; In the expectations below, the amdgcn SelectionDAG runs count leading bits
; on the two 32-bit registers of the loaded pair separately, add 32 to one of
; the counts (saturating, via a clamp or a preceding min with 0xffffffdf) and
; combine the results with a three-way unsigned min against 64. The
; global-isel run instead expects a single 64-bit s_flbit_i32_b64 followed by
; an unsigned min with 64.
706define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
707; SI-LABEL: s_ctlz_i64_trunc:
708; SI:       ; %bb.0:
709; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
710; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
711; SI-NEXT:    s_mov_b32 s3, 0xf000
712; SI-NEXT:    s_mov_b32 s2, -1
713; SI-NEXT:    s_waitcnt lgkmcnt(0)
714; SI-NEXT:    s_flbit_i32_b32 s4, s4
715; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
716; SI-NEXT:    s_flbit_i32_b32 s5, s5
717; SI-NEXT:    s_add_i32 s4, s4, 32
718; SI-NEXT:    v_mov_b32_e32 v0, s5
719; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
720; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
721; SI-NEXT:    s_endpgm
722;
723; VI-LABEL: s_ctlz_i64_trunc:
724; VI:       ; %bb.0:
725; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
726; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
727; VI-NEXT:    s_mov_b32 s3, 0xf000
728; VI-NEXT:    s_mov_b32 s2, -1
729; VI-NEXT:    s_waitcnt lgkmcnt(0)
730; VI-NEXT:    s_flbit_i32_b32 s4, s4
731; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s4, 32 clamp
732; VI-NEXT:    s_flbit_i32_b32 s4, s5
733; VI-NEXT:    v_min3_u32 v0, v0, s4, 64
734; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
735; VI-NEXT:    s_endpgm
736;
737; EG-LABEL: s_ctlz_i64_trunc:
738; EG:       ; %bb.0:
739; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
740; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
741; EG-NEXT:    CF_END
742; EG-NEXT:    PAD
743; EG-NEXT:    ALU clause starting at 4:
744; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
745; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
746; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
747; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
748; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
749; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
750; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
751; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
752; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
753;
754; GFX10-LABEL: s_ctlz_i64_trunc:
755; GFX10:       ; %bb.0:
756; GFX10-NEXT:    s_clause 0x1
757; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
758; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
759; GFX10-NEXT:    v_mov_b32_e32 v1, 0
760; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
761; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
762; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
763; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
764; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
765; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
766; GFX10-NEXT:    s_endpgm
767;
768; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
769; GFX10-GISEL:       ; %bb.0:
770; GFX10-GISEL-NEXT:    s_clause 0x1
771; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
772; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
773; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
774; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
775; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
776; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
777; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
778; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
779; GFX10-GISEL-NEXT:    s_endpgm
780;
781; GFX11-LABEL: s_ctlz_i64_trunc:
782; GFX11:       ; %bb.0:
783; GFX11-NEXT:    s_clause 0x1
784; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
785; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
786; GFX11-NEXT:    v_mov_b32_e32 v1, 0
787; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX11-NEXT:    s_clz_i32_u32 s2, s2
789; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
790; GFX11-NEXT:    v_add_nc_u32_e64 v0, s2, 32 clamp
791; GFX11-NEXT:    s_clz_i32_u32 s2, s3
792; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
793; GFX11-NEXT:    v_min3_u32 v0, v0, s2, 64
794; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
795; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
796; GFX11-NEXT:    s_endpgm
797  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
; Only the low 32 bits of the i64 count are stored.
798  %trunc = trunc i64 %ctlz to i32
799  store i32 %trunc, i32 addrspace(1)* %out
800  ret void
801}
802
; v_ctlz_i64 - each work item uses llvm.amdgcn.workitem.id.x as an element
; index into both %in and %out. It loads one i64 from %in, calls
; llvm.ctlz.i64 on it with the i1 flag operand set to false, and stores the
; full i64 result to the matching element of %out.
;
; The amdgcn expectations below count leading bits on each 32-bit register of
; the loaded pair, add 32 (saturating) to one count, take an unsigned min of
; the two counts and 64, and write 0 into the upper register of the stored
; 64-bit pair.
803define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
804; SI-LABEL: v_ctlz_i64:
805; SI:       ; %bb.0:
806; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
807; SI-NEXT:    s_mov_b32 s7, 0xf000
808; SI-NEXT:    s_mov_b32 s6, 0
809; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
810; SI-NEXT:    v_mov_b32_e32 v1, 0
811; SI-NEXT:    s_waitcnt lgkmcnt(0)
812; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
813; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
814; SI-NEXT:    s_waitcnt vmcnt(0)
815; SI-NEXT:    v_ffbh_u32_e32 v2, v2
816; SI-NEXT:    v_min_u32_e32 v2, 0xffffffdf, v2
817; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
818; SI-NEXT:    v_ffbh_u32_e32 v3, v3
819; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
820; SI-NEXT:    v_mov_b32_e32 v3, v1
821; SI-NEXT:    s_waitcnt lgkmcnt(0)
822; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
823; SI-NEXT:    s_endpgm
824;
825; VI-LABEL: v_ctlz_i64:
826; VI:       ; %bb.0:
827; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
828; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
829; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
830; VI-NEXT:    v_mov_b32_e32 v2, 0
831; VI-NEXT:    s_waitcnt lgkmcnt(0)
832; VI-NEXT:    v_mov_b32_e32 v1, s3
833; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
834; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
835; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
836; VI-NEXT:    v_mov_b32_e32 v4, s1
837; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
838; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
839; VI-NEXT:    s_waitcnt vmcnt(0)
840; VI-NEXT:    v_ffbh_u32_e32 v0, v0
841; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
842; VI-NEXT:    v_ffbh_u32_e32 v1, v1
843; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
844; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
845; VI-NEXT:    s_endpgm
846;
847; EG-LABEL: v_ctlz_i64:
848; EG:       ; %bb.0:
849; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
850; EG-NEXT:    TEX 0 @6
851; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
852; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
853; EG-NEXT:    CF_END
854; EG-NEXT:    PAD
855; EG-NEXT:    Fetch clause starting at 6:
856; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
857; EG-NEXT:    ALU clause starting at 8:
858; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
859; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
860; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
861; EG-NEXT:    ALU clause starting at 11:
862; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
863; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
864; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
865; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
866; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
867; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
868; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
869; EG-NEXT:     MOV T0.Y, 0.0,
870; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
871; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
872; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
873;
874; GFX10-LABEL: v_ctlz_i64:
875; GFX10:       ; %bb.0:
876; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
877; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
878; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
879; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
881; GFX10-NEXT:    s_waitcnt vmcnt(0)
882; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
883; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
884; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
885; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
886; GFX10-NEXT:    v_mov_b32_e32 v1, 0
887; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
888; GFX10-NEXT:    s_endpgm
889;
890; GFX10-GISEL-LABEL: v_ctlz_i64:
891; GFX10-GISEL:       ; %bb.0:
892; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
893; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
894; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
895; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
897; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
898; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
899; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
900; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
901; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
902; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
903; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
904; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
905; GFX10-GISEL-NEXT:    s_endpgm
906;
907; GFX11-LABEL: v_ctlz_i64:
908; GFX11:       ; %bb.0:
909; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
910; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
911; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
912; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
913; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
914; GFX11-NEXT:    s_waitcnt vmcnt(0)
915; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
916; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
917; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
918; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
919; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
920; GFX11-NEXT:    v_mov_b32_e32 v1, 0
921; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
922; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
923; GFX11-NEXT:    s_endpgm
; The work-item id indexes i64 elements of both buffers.
924  %tid = call i32 @llvm.amdgcn.workitem.id.x()
925  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
926  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
927  %val = load i64, i64 addrspace(1)* %in.gep
928  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
929  store i64 %ctlz, i64 addrspace(1)* %out.gep
930  ret void
931}
932
; v_ctlz_i64_trunc - per-work-item variant that stores a truncated result.
; Each work item loads the i64 at index workitem.id.x of %in, calls
; llvm.ctlz.i64 with the i1 flag operand set to false, truncates the count to
; i32 and stores it at index workitem.id.x of the i32 buffer %out.
;
; Because %in holds i64 elements and %out holds i32 elements, the amdgcn
; expectations below shift the work-item id left by 3 for the load address
; and by 2 for the store address.
933define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
934; SI-LABEL: v_ctlz_i64_trunc:
935; SI:       ; %bb.0:
936; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
937; SI-NEXT:    s_mov_b32 s7, 0xf000
938; SI-NEXT:    s_mov_b32 s6, 0
939; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
940; SI-NEXT:    v_mov_b32_e32 v2, 0
941; SI-NEXT:    s_waitcnt lgkmcnt(0)
942; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
943; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
944; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
945; SI-NEXT:    s_waitcnt vmcnt(0)
946; SI-NEXT:    v_ffbh_u32_e32 v0, v3
947; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
948; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
949; SI-NEXT:    v_ffbh_u32_e32 v3, v4
950; SI-NEXT:    v_min3_u32 v0, v0, v3, 64
951; SI-NEXT:    s_waitcnt lgkmcnt(0)
952; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
953; SI-NEXT:    s_endpgm
954;
955; VI-LABEL: v_ctlz_i64_trunc:
956; VI:       ; %bb.0:
957; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
958; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
959; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
960; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
961; VI-NEXT:    s_waitcnt lgkmcnt(0)
962; VI-NEXT:    v_mov_b32_e32 v2, s3
963; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
964; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
965; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
966; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
967; VI-NEXT:    v_mov_b32_e32 v4, s1
968; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
969; VI-NEXT:    s_waitcnt vmcnt(0)
970; VI-NEXT:    v_ffbh_u32_e32 v0, v1
971; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
972; VI-NEXT:    v_ffbh_u32_e32 v1, v2
973; VI-NEXT:    v_min3_u32 v0, v0, v1, 64
974; VI-NEXT:    flat_store_dword v[3:4], v0
975; VI-NEXT:    s_endpgm
976;
977; EG-LABEL: v_ctlz_i64_trunc:
978; EG:       ; %bb.0:
979; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
980; EG-NEXT:    TEX 0 @6
981; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
982; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
983; EG-NEXT:    CF_END
984; EG-NEXT:    PAD
985; EG-NEXT:    Fetch clause starting at 6:
986; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
987; EG-NEXT:    ALU clause starting at 8:
988; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
989; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
990; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
991; EG-NEXT:    ALU clause starting at 11:
992; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
993; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
994; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
995; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
996; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
997; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
998; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
999; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
1000; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
1001; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
1002; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1003;
1004; GFX10-LABEL: v_ctlz_i64_trunc:
1005; GFX10:       ; %bb.0:
1006; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1007; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1008; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1009; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1010; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
1012; GFX10-NEXT:    s_waitcnt vmcnt(0)
1013; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
1014; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
1015; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
1016; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
1017; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1018; GFX10-NEXT:    s_endpgm
1019;
1020; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
1021; GFX10-GISEL:       ; %bb.0:
1022; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1023; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1024; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1025; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1026; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1027; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
1028; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1029; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
1030; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
1031; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
1032; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
1033; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
1034; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1035; GFX10-GISEL-NEXT:    s_endpgm
1036;
1037; GFX11-LABEL: v_ctlz_i64_trunc:
1038; GFX11:       ; %bb.0:
1039; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1040; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1041; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1042; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1043; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[2:3]
1045; GFX11-NEXT:    s_waitcnt vmcnt(0)
1046; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
1047; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
1048; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1049; GFX11-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
1050; GFX11-NEXT:    v_min3_u32 v1, v1, v2, 64
1051; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1052; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1053; GFX11-NEXT:    s_endpgm
1054  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Input is indexed as i64 elements, output as i32 elements.
1055  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
1056  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
1057  %val = load i64, i64 addrspace(1)* %in.gep
1058  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
1059  %trunc = trunc i64 %ctlz to i32
1060  store i32 %trunc, i32 addrspace(1)* %out.gep
1061  ret void
1062}
1063
; v_ctlz_i32_sel_eq_neg1 - each work item loads the i32 at index
; workitem.id.x of %valptr, calls llvm.ctlz.i32 with the i1 flag operand set
; to false, and then selects -1 instead of the count when the loaded value
; equals 0. The selected value is stored to %out (not indexed by work item).
;
; The amdgcn SelectionDAG expectations below contain only the find-first-bit
; instruction between the load and the store, i.e. no min, compare or select
; remains. Presumably that instruction already produces -1 for a zero input;
; confirm against the ISA manual. The global-isel and r600 expectations still
; contain an explicit conditional select.
1064define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1065; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
1066; SI:       ; %bb.0:
1067; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1068; SI-NEXT:    s_mov_b32 s3, 0xf000
1069; SI-NEXT:    s_mov_b32 s6, 0
1070; SI-NEXT:    s_mov_b32 s7, s3
1071; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1072; SI-NEXT:    v_mov_b32_e32 v1, 0
1073; SI-NEXT:    s_waitcnt lgkmcnt(0)
1074; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1075; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1076; SI-NEXT:    s_mov_b32 s2, -1
1077; SI-NEXT:    s_waitcnt vmcnt(0)
1078; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1079; SI-NEXT:    s_waitcnt lgkmcnt(0)
1080; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1081; SI-NEXT:    s_endpgm
1082;
1083; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
1084; VI:       ; %bb.0:
1085; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1086; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1087; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1088; VI-NEXT:    s_waitcnt lgkmcnt(0)
1089; VI-NEXT:    v_mov_b32_e32 v1, s3
1090; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1091; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1092; VI-NEXT:    flat_load_dword v0, v[0:1]
1093; VI-NEXT:    s_mov_b32 s3, 0xf000
1094; VI-NEXT:    s_mov_b32 s2, -1
1095; VI-NEXT:    s_waitcnt vmcnt(0)
1096; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1097; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1098; VI-NEXT:    s_endpgm
1099;
1100; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
1101; EG:       ; %bb.0:
1102; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1103; EG-NEXT:    TEX 0 @6
1104; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1105; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1106; EG-NEXT:    CF_END
1107; EG-NEXT:    PAD
1108; EG-NEXT:    Fetch clause starting at 6:
1109; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1110; EG-NEXT:    ALU clause starting at 8:
1111; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1112; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1113; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1114; EG-NEXT:    ALU clause starting at 11:
1115; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1116; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1117; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1118; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1119; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1120; EG-NEXT:    -1(nan), 2(2.802597e-45)
1121;
1122; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
1123; GFX10:       ; %bb.0:
1124; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1125; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1126; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1127; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1128; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1130; GFX10-NEXT:    s_waitcnt vmcnt(0)
1131; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1132; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1133; GFX10-NEXT:    s_endpgm
1134;
1135; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
1136; GFX10-GISEL:       ; %bb.0:
1137; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1138; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1139; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1140; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1142; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1143; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1144; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1145; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1146; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1147; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1148; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1149; GFX10-GISEL-NEXT:    s_endpgm
1150;
1151; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
1152; GFX11:       ; %bb.0:
1153; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1154; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1155; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1156; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1158; GFX11-NEXT:    s_waitcnt vmcnt(0)
1159; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1160; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1161; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1162; GFX11-NEXT:    s_endpgm
1163  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1164  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1165  %val = load i32, i32 addrspace(1)* %in.gep
1166  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
; The compare is on the loaded input value, not on the ctlz result.
1167  %cmp = icmp eq i32 %val, 0
1168  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1169  store i32 %sel, i32 addrspace(1)* %out
1170  ret void
1171}
1172
; v_ctlz_i32_sel_ne_neg1 - same computation as v_ctlz_i32_sel_eq_neg1 written
; with the inverted predicate. Each work item loads the i32 at index
; workitem.id.x of %valptr, calls llvm.ctlz.i32 with the i1 flag operand set
; to false, and selects the count when the loaded value is not 0 and -1
; otherwise. The selected value is stored to %out (not indexed by work item).
;
; The amdgcn SelectionDAG expectations below contain only the find-first-bit
; instruction between the load and the store; the global-isel and r600
; expectations still contain an explicit conditional select.
1173define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1174; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
1175; SI:       ; %bb.0:
1176; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1177; SI-NEXT:    s_mov_b32 s3, 0xf000
1178; SI-NEXT:    s_mov_b32 s6, 0
1179; SI-NEXT:    s_mov_b32 s7, s3
1180; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1181; SI-NEXT:    v_mov_b32_e32 v1, 0
1182; SI-NEXT:    s_waitcnt lgkmcnt(0)
1183; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1184; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1185; SI-NEXT:    s_mov_b32 s2, -1
1186; SI-NEXT:    s_waitcnt vmcnt(0)
1187; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1188; SI-NEXT:    s_waitcnt lgkmcnt(0)
1189; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1190; SI-NEXT:    s_endpgm
1191;
1192; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
1193; VI:       ; %bb.0:
1194; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1195; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1196; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1197; VI-NEXT:    s_waitcnt lgkmcnt(0)
1198; VI-NEXT:    v_mov_b32_e32 v1, s3
1199; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1200; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1201; VI-NEXT:    flat_load_dword v0, v[0:1]
1202; VI-NEXT:    s_mov_b32 s3, 0xf000
1203; VI-NEXT:    s_mov_b32 s2, -1
1204; VI-NEXT:    s_waitcnt vmcnt(0)
1205; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1206; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1207; VI-NEXT:    s_endpgm
1208;
1209; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
1210; EG:       ; %bb.0:
1211; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1212; EG-NEXT:    TEX 0 @6
1213; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1214; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1215; EG-NEXT:    CF_END
1216; EG-NEXT:    PAD
1217; EG-NEXT:    Fetch clause starting at 6:
1218; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1219; EG-NEXT:    ALU clause starting at 8:
1220; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1221; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1222; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1223; EG-NEXT:    ALU clause starting at 11:
1224; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1225; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1226; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1227; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1228; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1229; EG-NEXT:    -1(nan), 2(2.802597e-45)
1230;
1231; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
1232; GFX10:       ; %bb.0:
1233; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1234; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1235; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1236; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1237; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1239; GFX10-NEXT:    s_waitcnt vmcnt(0)
1240; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1241; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1242; GFX10-NEXT:    s_endpgm
1243;
1244; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
1245; GFX10-GISEL:       ; %bb.0:
1246; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1247; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1248; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1249; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1250; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1251; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1252; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1253; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1254; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1255; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1256; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1257; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1258; GFX10-GISEL-NEXT:    s_endpgm
1259;
1260; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
1261; GFX11:       ; %bb.0:
1262; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1263; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1264; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1265; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1267; GFX11-NEXT:    s_waitcnt vmcnt(0)
1268; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1269; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1270; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1271; GFX11-NEXT:    s_endpgm
1272  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1273  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1274  %val = load i32, i32 addrspace(1)* %in.gep
1275  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
; Inverted predicate with swapped select operands: nonzero input keeps the
; count, zero input yields -1.
1276  %cmp = icmp ne i32 %val, 0
1277  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1278  store i32 %sel, i32 addrspace(1)* %out
1279  ret void
1280}
1281
1282; TODO: Should be able to eliminate select here as well.
;
; v_ctlz_i32_sel_eq_bitwidth - each work item loads the i32 at index
; workitem.id.x of %valptr and calls llvm.ctlz.i32 with the i1 flag operand
; set to false. Here the compare is on the ctlz result rather than on the
; input: when the count equals 32 the kernel stores -1, otherwise it stores
; the count. The store goes to %out (not indexed by work item).
;
; As the TODO above notes, the select is not folded away: every amdgcn
; expectation below still contains the min with 32, a compare against 32 and
; a conditional move after the find-first-bit instruction.
1283define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1284; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1285; SI:       ; %bb.0:
1286; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1287; SI-NEXT:    s_mov_b32 s3, 0xf000
1288; SI-NEXT:    s_mov_b32 s6, 0
1289; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1290; SI-NEXT:    v_mov_b32_e32 v1, 0
1291; SI-NEXT:    s_mov_b32 s7, s3
1292; SI-NEXT:    s_waitcnt lgkmcnt(0)
1293; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1294; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1295; SI-NEXT:    s_mov_b32 s2, -1
1296; SI-NEXT:    s_waitcnt vmcnt(0)
1297; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1298; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1299; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1300; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1301; SI-NEXT:    s_waitcnt lgkmcnt(0)
1302; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1303; SI-NEXT:    s_endpgm
1304;
1305; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1306; VI:       ; %bb.0:
1307; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1308; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1309; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1310; VI-NEXT:    s_waitcnt lgkmcnt(0)
1311; VI-NEXT:    v_mov_b32_e32 v1, s3
1312; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1313; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1314; VI-NEXT:    flat_load_dword v0, v[0:1]
1315; VI-NEXT:    s_mov_b32 s3, 0xf000
1316; VI-NEXT:    s_mov_b32 s2, -1
1317; VI-NEXT:    s_waitcnt vmcnt(0)
1318; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1319; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1320; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1321; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1322; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1323; VI-NEXT:    s_endpgm
1324;
1325; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1326; EG:       ; %bb.0:
1327; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1328; EG-NEXT:    TEX 0 @6
1329; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1330; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1331; EG-NEXT:    CF_END
1332; EG-NEXT:    PAD
1333; EG-NEXT:    Fetch clause starting at 6:
1334; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1335; EG-NEXT:    ALU clause starting at 8:
1336; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1337; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1338; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1339; EG-NEXT:    ALU clause starting at 11:
1340; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1341; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1342; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1343; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1344; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1345; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1346; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1347; EG-NEXT:    -1(nan), 2(2.802597e-45)
1348;
1349; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1350; GFX10:       ; %bb.0:
1351; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1352; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1353; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1354; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1355; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1357; GFX10-NEXT:    s_waitcnt vmcnt(0)
1358; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1359; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1360; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1361; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1362; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1363; GFX10-NEXT:    s_endpgm
1364;
1365; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1366; GFX10-GISEL:       ; %bb.0:
1367; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1368; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1369; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1370; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1371; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1373; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1374; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1375; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1376; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
1377; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1378; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1379; GFX10-GISEL-NEXT:    s_endpgm
1380;
1381; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1382; GFX11:       ; %bb.0:
1383; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1384; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1385; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1386; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1387; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1388; GFX11-NEXT:    s_waitcnt vmcnt(0)
1389; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1390; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1391; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
1392; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1393; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1394; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1395; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1396; GFX11-NEXT:    s_endpgm
1397  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1398  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1399  %val = load i32, i32 addrspace(1)* %in.gep
1400  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
; The compare is on the ctlz result against the i32 bit width (32).
1401  %cmp = icmp eq i32 %ctlz, 32
1402  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1403  store i32 %sel, i32 addrspace(1)* %out
1404  ret void
1405}
1406
1407define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1408; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1409; SI:       ; %bb.0:
1410; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1411; SI-NEXT:    s_mov_b32 s3, 0xf000
1412; SI-NEXT:    s_mov_b32 s6, 0
1413; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1414; SI-NEXT:    v_mov_b32_e32 v1, 0
1415; SI-NEXT:    s_mov_b32 s7, s3
1416; SI-NEXT:    s_waitcnt lgkmcnt(0)
1417; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1418; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1419; SI-NEXT:    s_mov_b32 s2, -1
1420; SI-NEXT:    s_waitcnt vmcnt(0)
1421; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1422; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1423; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1424; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1425; SI-NEXT:    s_waitcnt lgkmcnt(0)
1426; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1427; SI-NEXT:    s_endpgm
1428;
1429; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1430; VI:       ; %bb.0:
1431; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1432; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1433; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1434; VI-NEXT:    s_waitcnt lgkmcnt(0)
1435; VI-NEXT:    v_mov_b32_e32 v1, s3
1436; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1437; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1438; VI-NEXT:    flat_load_dword v0, v[0:1]
1439; VI-NEXT:    s_mov_b32 s3, 0xf000
1440; VI-NEXT:    s_mov_b32 s2, -1
1441; VI-NEXT:    s_waitcnt vmcnt(0)
1442; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1443; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1444; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1445; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1446; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1447; VI-NEXT:    s_endpgm
1448;
1449; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1450; EG:       ; %bb.0:
1451; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1452; EG-NEXT:    TEX 0 @6
1453; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1454; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1455; EG-NEXT:    CF_END
1456; EG-NEXT:    PAD
1457; EG-NEXT:    Fetch clause starting at 6:
1458; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1459; EG-NEXT:    ALU clause starting at 8:
1460; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1461; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1462; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1463; EG-NEXT:    ALU clause starting at 11:
1464; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1465; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1466; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1467; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1468; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1469; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1470; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1471; EG-NEXT:    -1(nan), 2(2.802597e-45)
1472;
1473; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1474; GFX10:       ; %bb.0:
1475; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1476; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1477; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1478; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1479; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1481; GFX10-NEXT:    s_waitcnt vmcnt(0)
1482; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1483; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1484; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1485; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1486; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1487; GFX10-NEXT:    s_endpgm
1488;
1489; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1490; GFX10-GISEL:       ; %bb.0:
1491; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1492; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1493; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1494; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1495; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1496; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1497; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1498; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1499; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1500; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1501; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1502; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1503; GFX10-GISEL-NEXT:    s_endpgm
1504;
1505; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1506; GFX11:       ; %bb.0:
1507; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1508; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1509; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1510; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1511; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1512; GFX11-NEXT:    s_waitcnt vmcnt(0)
1513; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1514; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1515; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
1516; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1517; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1518; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1519; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1520; GFX11-NEXT:    s_endpgm
1521  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1522  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1523  %val = load i32, i32 addrspace(1)* %in.gep
1524  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1525  %cmp = icmp ne i32 %ctlz, 32
1526  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1527  store i32 %sel, i32 addrspace(1)* %out
1528  ret void
1529}
1530
; Codegen test for llvm.ctlz.i8 (called with i1 false) whose result is replaced
; by -1 when the loaded byte is 0. Each workitem loads its own byte through a
; gep indexed by llvm.amdgcn.workitem.id.x, and the selected value is stored
; to %out.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): in the amdgcn SelectionDAG runs the checked sequence is a bare
; ffbh/clz of the loaded byte followed directly by the byte store, while the
; GlobalISel run additionally clamps to 32, subtracts 24 and selects -1 on a
; zero input. Confirm the shorter sequence is the intended lowering for i8.
1531 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
1532; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
1533; SI:       ; %bb.0:
1534; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1535; SI-NEXT:    s_mov_b32 s3, 0xf000
1536; SI-NEXT:    v_mov_b32_e32 v1, 0
1537; SI-NEXT:    s_mov_b32 s6, 0
1538; SI-NEXT:    s_mov_b32 s7, s3
1539; SI-NEXT:    s_waitcnt lgkmcnt(0)
1540; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1541; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1542; SI-NEXT:    s_mov_b32 s2, -1
1543; SI-NEXT:    s_waitcnt vmcnt(0)
1544; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1545; SI-NEXT:    s_waitcnt lgkmcnt(0)
1546; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1547; SI-NEXT:    s_endpgm
1548;
1549; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
1550; VI:       ; %bb.0:
1551; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1552; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1553; VI-NEXT:    s_waitcnt lgkmcnt(0)
1554; VI-NEXT:    v_mov_b32_e32 v1, s3
1555; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1556; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1557; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1558; VI-NEXT:    s_mov_b32 s3, 0xf000
1559; VI-NEXT:    s_mov_b32 s2, -1
1560; VI-NEXT:    s_waitcnt vmcnt(0)
1561; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1562; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1563; VI-NEXT:    s_endpgm
1564;
1565; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1566; EG:       ; %bb.0:
1567; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1568; EG-NEXT:    TEX 0 @6
1569; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1570; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1571; EG-NEXT:    CF_END
1572; EG-NEXT:    PAD
1573; EG-NEXT:    Fetch clause starting at 6:
1574; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1575; EG-NEXT:    ALU clause starting at 8:
1576; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1577; EG-NEXT:    ALU clause starting at 9:
1578; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1579; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1580; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1581; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1582; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1583; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1584; EG-NEXT:     LSHL T0.X, PV.W, PS,
1585; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1586; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1587; EG-NEXT:     MOV T0.Y, 0.0,
1588; EG-NEXT:     MOV * T0.Z, 0.0,
1589; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1590; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1591;
1592; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
1593; GFX10:       ; %bb.0:
1594; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1595; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1596; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1597; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1599; GFX10-NEXT:    s_waitcnt vmcnt(0)
1600; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1601; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1602; GFX10-NEXT:    s_endpgm
1603;
1604; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
1605; GFX10-GISEL:       ; %bb.0:
1606; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1607; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1608; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1609; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1610; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1611; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1612; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1613; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1614; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1615; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1616; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1617; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1618; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1619; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
1620; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1621; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1622; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1623; GFX10-GISEL-NEXT:    s_endpgm
1624;
1625; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
1626; GFX11:       ; %bb.0:
1627; GFX11-NEXT:    s_clause 0x1
1628; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1629; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1630; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1631; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1633; GFX11-NEXT:    s_waitcnt vmcnt(0)
1634; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1635; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
1636; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1637; GFX11-NEXT:    s_endpgm
; Per-workitem input address: %valptr indexed by the workitem id.
1638  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1639  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1640  %val = load i8, i8 addrspace(1)* %valptr.gep
1641  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
; The select is keyed on the loaded value being zero, not on the ctlz result.
1642  %cmp = icmp eq i8 %val, 0
1643  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1644  store i8 %sel, i8 addrspace(1)* %out
1645  ret void
1646}
1647
; Codegen test for llvm.ctlz.i16 (called with i1 false) whose result is
; replaced by -1 when the loaded halfword is 0. Unlike the i8 and i7 variants
; in this file, the value is loaded straight from %valptr with no gep and no
; workitem id, and the selected value is stored to %out.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): the first amdgcn run checks only an ffbh followed by the short
; store, whereas the tonga, gfx1010 and gfx1100 runs check a clamp to 32, a
; subtract of 16 and a select of 0xffff on zero input. Confirm the shorter
; sequence is the intended lowering for i16 on that target.
1648 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1649; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1650; SI:       ; %bb.0:
1651; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1652; SI-NEXT:    s_mov_b32 s3, 0xf000
1653; SI-NEXT:    s_mov_b32 s2, -1
1654; SI-NEXT:    s_mov_b32 s6, s2
1655; SI-NEXT:    s_mov_b32 s7, s3
1656; SI-NEXT:    s_waitcnt lgkmcnt(0)
1657; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1658; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1659; SI-NEXT:    s_waitcnt vmcnt(0)
1660; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1661; SI-NEXT:    s_waitcnt lgkmcnt(0)
1662; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1663; SI-NEXT:    s_endpgm
1664;
1665; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1666; VI:       ; %bb.0:
1667; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
1668; VI-NEXT:    s_mov_b32 s3, 0xf000
1669; VI-NEXT:    s_mov_b32 s2, -1
1670; VI-NEXT:    s_mov_b32 s6, s2
1671; VI-NEXT:    s_mov_b32 s7, s3
1672; VI-NEXT:    s_waitcnt lgkmcnt(0)
1673; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1674; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1675; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1676; VI-NEXT:    s_waitcnt vmcnt(0)
1677; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1678; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1679; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
1680; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1681; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1682; VI-NEXT:    s_waitcnt lgkmcnt(0)
1683; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1684; VI-NEXT:    s_endpgm
1685;
1686; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1687; EG:       ; %bb.0:
1688; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1689; EG-NEXT:    TEX 0 @6
1690; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1691; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1692; EG-NEXT:    CF_END
1693; EG-NEXT:    PAD
1694; EG-NEXT:    Fetch clause starting at 6:
1695; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1696; EG-NEXT:    ALU clause starting at 8:
1697; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1698; EG-NEXT:    ALU clause starting at 9:
1699; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1700; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1701; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1702; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1703; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1704; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1705; EG-NEXT:     LSHL T0.X, PV.W, PS,
1706; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1707; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1708; EG-NEXT:     MOV T0.Y, 0.0,
1709; EG-NEXT:     MOV * T0.Z, 0.0,
1710; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1711; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1712;
1713; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
1714; GFX10:       ; %bb.0:
1715; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1716; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1717; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1718; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1720; GFX10-NEXT:    s_waitcnt vmcnt(0)
1721; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
1722; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1723; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
1724; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1725; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1726; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1727; GFX10-NEXT:    s_endpgm
1728;
1729; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
1730; GFX10-GISEL:       ; %bb.0:
1731; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1732; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1733; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1734; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1735; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1736; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1737; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
1738; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1739; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1740; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
1741; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1742; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
1743; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1744; GFX10-GISEL-NEXT:    s_endpgm
1745;
1746; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
1747; GFX11:       ; %bb.0:
1748; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1749; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1750; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1751; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
1753; GFX11-NEXT:    s_waitcnt vmcnt(0)
1754; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
1755; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1756; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1757; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
1758; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1759; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1760; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1761; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1762; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1763; GFX11-NEXT:    s_endpgm
; The load address is the kernel argument itself (no per-workitem offset).
1764  %val = load i16, i16 addrspace(1)* %valptr
1765  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
; The select is keyed on the loaded value being zero, not on the ctlz result.
1766  %cmp = icmp eq i16 %val, 0
1767  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1768  store i16 %sel, i16 addrspace(1)* %out
1769  ret void
1770}
1771
1772; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the FIXME above mentions a load without gep, but the load in
; this function goes through %valptr.gep, which is indexed by the workitem id.
; The gep-less load visible nearby is in v_ctlz_i16_sel_eq_neg1; confirm which
; function the FIXME is meant for, or whether it is stale.
;
; Codegen test for llvm.ctlz.i7 (called with i1 false), a non-power-of-two
; integer width, whose result is replaced by -1 when the loaded value is 0.
; Each workitem loads its own i7 through a gep indexed by
; llvm.amdgcn.workitem.id.x, and the selected value is stored to %out.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): in the amdgcn SelectionDAG runs the checked sequence is an
; ffbh/clz of the loaded byte masked with 0x7f before the byte store, while
; the GlobalISel run masks the input with 0x7f first, clamps to 32, subtracts
; 25 and selects 0x7f on a zero input. Confirm the shorter sequence is the
; intended lowering for i7.
1773define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1774; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1775; SI:       ; %bb.0:
1776; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1777; SI-NEXT:    s_mov_b32 s3, 0xf000
1778; SI-NEXT:    v_mov_b32_e32 v1, 0
1779; SI-NEXT:    s_mov_b32 s6, 0
1780; SI-NEXT:    s_mov_b32 s7, s3
1781; SI-NEXT:    s_waitcnt lgkmcnt(0)
1782; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1783; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1784; SI-NEXT:    s_mov_b32 s2, -1
1785; SI-NEXT:    s_waitcnt vmcnt(0)
1786; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1787; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1788; SI-NEXT:    s_waitcnt lgkmcnt(0)
1789; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1790; SI-NEXT:    s_endpgm
1791;
1792; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1793; VI:       ; %bb.0:
1794; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1795; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1796; VI-NEXT:    s_waitcnt lgkmcnt(0)
1797; VI-NEXT:    v_mov_b32_e32 v1, s3
1798; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1799; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1800; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1801; VI-NEXT:    s_mov_b32 s3, 0xf000
1802; VI-NEXT:    s_mov_b32 s2, -1
1803; VI-NEXT:    s_waitcnt vmcnt(0)
1804; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1805; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1806; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1807; VI-NEXT:    s_endpgm
1808;
1809; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1810; EG:       ; %bb.0:
1811; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1812; EG-NEXT:    TEX 0 @6
1813; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1814; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1815; EG-NEXT:    CF_END
1816; EG-NEXT:    PAD
1817; EG-NEXT:    Fetch clause starting at 6:
1818; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1819; EG-NEXT:    ALU clause starting at 8:
1820; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1821; EG-NEXT:    ALU clause starting at 9:
1822; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1823; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1824; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1825; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1826; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1827; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1828; EG-NEXT:     LSHL T0.X, PV.W, PS,
1829; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1830; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1831; EG-NEXT:     MOV T0.Y, 0.0,
1832; EG-NEXT:     MOV * T0.Z, 0.0,
1833; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1834; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1835;
1836; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1837; GFX10:       ; %bb.0:
1838; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1839; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1840; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1841; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1843; GFX10-NEXT:    s_waitcnt vmcnt(0)
1844; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1845; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1846; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1847; GFX10-NEXT:    s_endpgm
1848;
1849; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
1850; GFX10-GISEL:       ; %bb.0:
1851; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1852; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1853; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1854; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1856; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1857; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1858; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1859; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1860; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1861; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1862; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1863; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1864; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1865; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 25, v1
1866; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1867; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1868; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1869; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1870; GFX10-GISEL-NEXT:    s_endpgm
1871;
1872; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
1873; GFX11:       ; %bb.0:
1874; GFX11-NEXT:    s_clause 0x1
1875; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
1876; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1877; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1878; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1879; GFX11-NEXT:    s_waitcnt vmcnt(0)
1880; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1881; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1882; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0
1883; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
1884; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1885; GFX11-NEXT:    s_endpgm
; Per-workitem input address: %valptr indexed by the workitem id.
1886  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1887  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1888  %val = load i7, i7 addrspace(1)* %valptr.gep
1889  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
; The select is keyed on the loaded value being zero, not on the ctlz result.
1890  %cmp = icmp eq i7 %val, 0
1891  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1892  store i7 %sel, i7 addrspace(1)* %out
1893  ret void
1894}
1895