; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
; Intrinsic declarations used by the tests in this file.
; llvm.ctlz takes the value and an i1 flag; every call visible in this file
; passes i1 false for that flag.
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

; Work-item id intrinsic, used by the v_* tests to index their per-lane loads.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; s_ctlz_i32: ctlz.i32 (flag operand i1 false) of an i32 kernel argument, with
; the count stored to %out.
; The amdgcn checks expect s_flbit_i32_b32 followed by s_min_u32 with 32; the
; r600 checks expect FFBH_UINT followed by CNDE_INT with the literal 32.
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s2, s2
; SI-NEXT:    s_min_u32 s4, s2, 32
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s4, s4
; VI-NEXT:    s_min_u32 s4, s4, 32
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: s_ctlz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_flbit_i32_b32 s0, s4
; GFX10-NEXT:    s_min_u32 s0, s0, 32
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_ctlz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_flbit_i32_b32 s0, s4
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_endpgm
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}
; v_ctlz_i32: loads an i32 from %valptr indexed by the work-item id x, applies
; ctlz.i32 (flag operand i1 false) and stores the count to %out.
; The amdgcn checks expect v_ffbh_u32 followed by v_min_u32 with 32; the r600
; checks expect FFBH_UINT followed by CNDE_INT with the literal 32.
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}
; v_ctlz_v2i32: loads a <2 x i32> from %valptr indexed by the work-item id x,
; applies ctlz.v2i32 (flag operand i1 false) and stores the vector to %out.
; The checks expect one find-first-bit / min-with-32 pair per element on amdgcn
; and one FFBH_UINT / CNDE_INT pair per element on r600.
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_v2i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
; v_ctlz_v4i32: loads a <4 x i32> from %valptr indexed by the work-item id x,
; applies ctlz.v4i32 (flag operand i1 false) and stores the vector to %out.
; The checks expect one find-first-bit / min-with-32 pair per element on amdgcn
; and one FFBH_UINT / CNDE_INT pair per element on r600.
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v3, v3
; SI-NEXT:    v_ffbh_u32_e32 v2, v2
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v3, 32, v3
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v3, v3
; VI-NEXT:    v_ffbh_u32_e32 v2, v2
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v3, 32, v3
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_v4i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
; v_ctlz_i8: loads one i8 directly from %valptr (no work-item indexing), applies
; ctlz.i8 (flag operand i1 false) and stores the i8 count to %out.
; The checks expect an unsigned byte load, a 32-bit find-first-bit with a
; min-with-32, and then a net adjustment of -24 (emitted either as a single
; subtract/add or as a -16 add followed by a 16-bit -8 add).
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT:    v_add_u16_e32 v0, -8, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    -24(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
; GFX10-NEXT:    v_add_nc_u16 v1, v1, -8
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i8:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %val = load i8, i8 addrspace(1)* %valptr
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  store i8 %ctlz, i8 addrspace(1)* %out
  ret void
}
; s_ctlz_i64: ctlz.i64 (flag operand i1 false) of an i64 kernel argument that
; follows an unnamed [8 x i32] argument; the full i64 count is stored to %out.
; The SelectionDAG amdgcn checks expect two 32-bit s_flbit_i32_b32 results,
; one of them with 32 added, combined by v_min3_u32 with 64; the GlobalISel
; checks expect a single s_flbit_i32_b64 followed by s_min_u32 with 64.
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s4, s4
; SI-NEXT:    s_flbit_i32_b32 s5, s5
; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    s_add_i32 s4, s4, 32
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s4, s4
; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s4, 32 clamp
; VI-NEXT:    s_flbit_i32_b32 s4, s5
; VI-NEXT:    v_min3_u32 v0, v0, s4, 64
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_ctlz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_ctlz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out
  ret void
}
; s_ctlz_i64_trunc: ctlz.i64 (flag operand i1 false) of an i64 kernel argument,
; with the count truncated to i32 before it is stored to %out.
; The checks expect the same count sequence as the untruncated i64 case but a
; single-dword store, with no instruction materializing a high result half.
define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s4, s4
; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
; SI-NEXT:    s_flbit_i32_b32 s5, s5
; SI-NEXT:    s_add_i32 s4, s4, 32
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s4, s4
; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s4, 32 clamp
; VI-NEXT:    s_flbit_i32_b32 s4, s5
; VI-NEXT:    v_min3_u32 v0, v0, s4, 64
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_ctlz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}
; v_ctlz_i64: loads an i64 from %in and stores the i64 ctlz.i64 result (flag
; operand i1 false) to %out, with both pointers indexed by the work-item id x.
; The amdgcn checks expect two 32-bit v_ffbh_u32 results, one of them with 32
; added, reduced with unsigned min operations against 64.
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v2, v2
; SI-NEXT:    v_min_u32_e32 v2, 0xffffffdf, v2
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_ffbh_u32_e32 v3, v3
; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out.gep
  ret void
}
791
; Per-work-item i64 ctlz whose result is truncated to i32 before the store.
; The second ctlz operand is `i1 false`, so a zero input is a defined case.
; In the SI, VI and GFX10 checks the 64-bit count is assembled from two 32-bit
; v_ffbh_u32 results: one of them gets 32 added with saturation and the final
; value is the unsigned minimum of both counts and 64.
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v3
; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_ffbh_u32_e32 v3, v4
; SI-NEXT:    v_min3_u32 v0, v0, v3, 64
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v1
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
; VI-NEXT:    v_ffbh_u32_e32 v1, v2
; VI-NEXT:    v_min3_u32 v0, v0, v1, 64
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Each work-item reads in[tid] (i64 elements) and writes out[tid] (i32
  ; elements), hence the two different shift amounts in the checks above.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  ; Only the low 32 bits of the count are kept; the checks use a dword store.
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}
904
; select(val == 0, -1, ctlz(val)) on an i32 loaded per work-item.
; The SI, VI and GFX10 SelectionDAG checks contain a lone v_ffbh_u32 with no
; compare or select, i.e. the whole pattern is folded into that instruction
; (presumably because it already yields -1 for a zero input; confirm against
; the ISA documentation). The GlobalISel and r600 checks still carry the
; compare / conditional-select sequence.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; The input is read from valptr[tid]; the result is stored to %out itself
  ; (no per-work-item offset on the store).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; Zero input selects -1, anything else selects the leading-zero count.
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1000
; Inverted form of the eq_neg1 pattern: select(val != 0, ctlz(val), -1).
; The compare predicate is `ne` and the select arms are swapped, so the value
; computed is the same. The SI, VI and GFX10 SelectionDAG checks again reduce
; to a lone v_ffbh_u32; the GlobalISel and r600 checks keep the compare /
; conditional-select sequence.
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; The input is read from valptr[tid]; the result is stored to %out itself.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; Non-zero input selects the leading-zero count, zero selects -1.
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1096
; TODO: Should be able to eliminate select here as well.
; select(ctlz(val) == 32, -1, ctlz(val)): the zero-input case is detected by
; comparing the ctlz result against the bit width rather than comparing the
; input against 0. Every check block below still contains an explicit
; min / compare / conditional-select sequence after the ffbh, which is the
; missed optimisation the TODO refers to.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; The input is read from valptr[tid]; the result is stored to %out itself.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is on the ctlz result, not on %val.
  %cmp = icmp eq i32 %ctlz, 32
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1204
; Inverted form of the eq_bitwidth pattern: select(ctlz(val) != 32, ctlz(val), -1).
; The predicate is `ne` and the select arms are swapped, so the value computed
; is the same. As in the eq variant, every check block below still contains an
; explicit compare / conditional-select after the ffbh; nothing is folded.
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; The input is read from valptr[tid]; the result is stored to %out itself.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is on the ctlz result, not on %val.
  %cmp = icmp ne i32 %ctlz, 32
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1311
 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBH_UINT T0.W, T0.X,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; i8 version of select(val == 0, -1, ctlz(val)). The byte is read from
  ; valptr[tid] and the result is stored to %out itself.
  ; NOTE(review): the SI, VI and GFX10 SelectionDAG checks apply v_ffbh_u32
  ; directly to the value produced by the ubyte load and store its low byte,
  ; whereas the GlobalISel checks subtract 24 from the 32-bit count first.
  ; Confirm that the folded form is the intended output for an 8-bit input.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  ; Zero input selects -1, anything else selects the leading-zero count.
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %ctlz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}
1414
 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBH_UINT T0.W, T0.X,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; i16 version of select(val == 0, -1, ctlz(val)). Unlike the i8 and i32
  ; variants, %valptr is loaded directly with no workitem-indexed gep, so all
  ; work-items read the same address.
  ; NOTE(review): only the SI checks collapse to a lone v_ffbh_u32; the VI and
  ; GFX10 checks keep the min / add -16 / compare / cndmask sequence. Confirm
  ; that the SI output is intended for a 16-bit input.
  %val = load i16, i16 addrspace(1)* %valptr
  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
  ; Zero input selects -1, anything else selects the leading-zero count.
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %ctlz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}
1519
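1520; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the kernel below loads through a workitem-id GEP, so the
; "(load without gep)" wording does not describe it; confirm whether this
; FIXME still applies here.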
1521define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1522; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1523; SI:       ; %bb.0:
1524; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1525; SI-NEXT:    s_mov_b32 s3, 0xf000
1526; SI-NEXT:    v_mov_b32_e32 v1, 0
1527; SI-NEXT:    s_mov_b32 s6, 0
1528; SI-NEXT:    s_mov_b32 s7, s3
1529; SI-NEXT:    s_waitcnt lgkmcnt(0)
1530; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1531; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1532; SI-NEXT:    s_mov_b32 s2, -1
1533; SI-NEXT:    s_waitcnt vmcnt(0)
1534; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1535; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1536; SI-NEXT:    s_waitcnt lgkmcnt(0)
1537; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1538; SI-NEXT:    s_endpgm
1539;
1540; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1541; VI:       ; %bb.0:
1542; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1543; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1544; VI-NEXT:    s_waitcnt lgkmcnt(0)
1545; VI-NEXT:    v_mov_b32_e32 v1, s3
1546; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1547; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1548; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1549; VI-NEXT:    s_mov_b32 s3, 0xf000
1550; VI-NEXT:    s_mov_b32 s2, -1
1551; VI-NEXT:    s_waitcnt vmcnt(0)
1552; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1553; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1554; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1555; VI-NEXT:    s_endpgm
1556;
1557; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1558; EG:       ; %bb.0:
1559; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1560; EG-NEXT:    TEX 0 @6
1561; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1562; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1563; EG-NEXT:    CF_END
1564; EG-NEXT:    PAD
1565; EG-NEXT:    Fetch clause starting at 6:
1566; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1567; EG-NEXT:    ALU clause starting at 8:
1568; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1569; EG-NEXT:    ALU clause starting at 9:
1570; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1571; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1572; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1573; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1574; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1575; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1576; EG-NEXT:     LSHL T0.X, PV.W, PS,
1577; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1578; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1579; EG-NEXT:     MOV T0.Y, 0.0,
1580; EG-NEXT:     MOV * T0.Z, 0.0,
1581; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1582; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1583;
1584; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1585; GFX10:       ; %bb.0:
1586; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1587; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1588; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1589; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1590; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1591; GFX10-NEXT:    s_waitcnt vmcnt(0)
1592; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1593; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1594; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1595; GFX10-NEXT:    s_endpgm
1596;
1597; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
1598; GFX10-GISEL:       ; %bb.0:
1599; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1600; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1601; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1602; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1604; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1605; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1606; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1607; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1608; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1609; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1610; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1611; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1612; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1613; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 25, v1
1614; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1615; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1616; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1617; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1618; GFX10-GISEL-NEXT:    s_endpgm
; IR under test: each work-item loads one i7 from %valptr at index
; workitem.id.x, takes llvm.ctlz.i7 of it, and substitutes -1 when the
; loaded value is zero. The result is stored to %out itself; the store
; address carries no per-work-item index.
; NOTE(review): the SI, VI, EG and GFX10 check blocks above show a
; find-first-bit followed by a 0x7f mask with no subtract, compare or
; select, whereas the GFX10-GISEL block also shows the subtract of 25 and
; the compare/select. Confirm the shorter sequences are the intended
; lowering for a 7-bit count.
1619  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1620  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1621  %val = load i7, i7 addrspace(1)* %valptr.gep
; The i1 flag is false here; see the LLVM LangRef entry for llvm.ctlz for
; how that flag governs the result on a zero input.
1622  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
; Zero is the only input for which the select overrides the ctlz result.
1623  %cmp = icmp eq i7 %val, 0
1624  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1625  store i7 %sel, i7 addrspace(1)* %out
1626  ret void
1627}
1628