1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7
; Declarations of the llvm.ctlz overloads for i7, i8, i16, i32 and i64 scalars
; and for 2- and 4-element i32 and i64 vectors, plus the amdgcn workitem id
; query. Every declaration carries the nounwind and readnone attributes.
8declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
9declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
10declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
11
12declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
13declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
14declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
15
16declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
17declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
18declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
19
20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
21
; s_ctlz_i32 - ctlz of an i32 passed directly as a kernel argument, stored to
; %out. The second operand of llvm.ctlz is false, so per the LLVM LangRef a zero
; input has a defined result (presumably the reason for the clamp against 32 in
; the expected code). The amdgcn runs expect s_flbit_i32_b32 followed by
; s_min_u32 with 32. The r600 run expects FFBH_UINT followed by a CNDE_INT that
; takes the input and the literal 32 as operands.
22define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
23; SI-LABEL: s_ctlz_i32:
24; SI:       ; %bb.0:
25; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    s_flbit_i32_b32 s2, s2
30; SI-NEXT:    s_min_u32 s4, s2, 32
31; SI-NEXT:    s_mov_b32 s2, -1
32; SI-NEXT:    v_mov_b32_e32 v0, s4
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: s_ctlz_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
39; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
40; VI-NEXT:    s_mov_b32 s7, 0xf000
41; VI-NEXT:    s_mov_b32 s6, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_flbit_i32_b32 s0, s0
44; VI-NEXT:    s_min_u32 s0, s0, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_ctlz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60;
61; GFX10-LABEL: s_ctlz_i32:
62; GFX10:       ; %bb.0:
63; GFX10-NEXT:    s_clause 0x1
64; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
65; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
66; GFX10-NEXT:    v_mov_b32_e32 v0, 0
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    s_flbit_i32_b32 s0, s4
69; GFX10-NEXT:    s_min_u32 s0, s0, 32
70; GFX10-NEXT:    v_mov_b32_e32 v1, s0
71; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
72; GFX10-NEXT:    s_endpgm
73;
74; GFX10-GISEL-LABEL: s_ctlz_i32:
75; GFX10-GISEL:       ; %bb.0:
76; GFX10-GISEL-NEXT:    s_clause 0x1
77; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
78; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
79; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
80; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX10-GISEL-NEXT:    s_flbit_i32_b32 s0, s4
82; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
83; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
84; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
85; GFX10-GISEL-NEXT:    s_endpgm
86  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
87  store i32 %ctlz, i32 addrspace(1)* %out, align 4
88  ret void
89}
90
; v_ctlz_i32 - ctlz of an i32 loaded from %valptr at the index returned by
; llvm.amdgcn.workitem.id.x, stored to %out (the store itself is not indexed).
; The second operand of llvm.ctlz is false. The amdgcn runs expect v_ffbh_u32
; followed by v_min_u32 with 32. The r600 run expects FFBH_UINT followed by a
; CNDE_INT that takes the loaded value and the literal 32 as operands.
91define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
92; SI-LABEL: v_ctlz_i32:
93; SI:       ; %bb.0:
94; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
95; SI-NEXT:    s_mov_b32 s3, 0xf000
96; SI-NEXT:    s_mov_b32 s6, 0
97; SI-NEXT:    s_mov_b32 s7, s3
98; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
99; SI-NEXT:    v_mov_b32_e32 v1, 0
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
102; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt vmcnt(0)
105; SI-NEXT:    v_ffbh_u32_e32 v0, v0
106; SI-NEXT:    v_min_u32_e32 v0, 32, v0
107; SI-NEXT:    s_waitcnt lgkmcnt(0)
108; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
109; SI-NEXT:    s_endpgm
110;
111; VI-LABEL: v_ctlz_i32:
112; VI:       ; %bb.0:
113; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
114; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
115; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
116; VI-NEXT:    s_mov_b32 s7, 0xf000
117; VI-NEXT:    s_mov_b32 s6, -1
118; VI-NEXT:    s_waitcnt lgkmcnt(0)
119; VI-NEXT:    v_mov_b32_e32 v1, s1
120; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
121; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
122; VI-NEXT:    flat_load_dword v0, v[0:1]
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_ffbh_u32_e32 v0, v0
125; VI-NEXT:    v_min_u32_e32 v0, 32, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; VI-NEXT:    s_endpgm
128;
129; EG-LABEL: v_ctlz_i32:
130; EG:       ; %bb.0:
131; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
132; EG-NEXT:    TEX 0 @6
133; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
134; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
135; EG-NEXT:    CF_END
136; EG-NEXT:    PAD
137; EG-NEXT:    Fetch clause starting at 6:
138; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
139; EG-NEXT:    ALU clause starting at 8:
140; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
141; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
143; EG-NEXT:    ALU clause starting at 11:
144; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
145; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
146; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
147; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
148;
149; GFX10-LABEL: v_ctlz_i32:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
152; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
153; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
154; GFX10-NEXT:    v_mov_b32_e32 v1, 0
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
157; GFX10-NEXT:    s_waitcnt vmcnt(0)
158; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
159; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
160; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
161; GFX10-NEXT:    s_endpgm
162;
163; GFX10-GISEL-LABEL: v_ctlz_i32:
164; GFX10-GISEL:       ; %bb.0:
165; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
166; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
168; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
169; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
171; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
172; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
173; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
174; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
175; GFX10-GISEL-NEXT:    s_endpgm
176  %tid = call i32 @llvm.amdgcn.workitem.id.x()
177  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
178  %val = load i32, i32 addrspace(1)* %in.gep, align 4
179  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
180  store i32 %ctlz, i32 addrspace(1)* %out, align 4
181  ret void
182}
183
; v_ctlz_v2i32 - ctlz of a <2 x i32> loaded from %valptr at the index returned
; by llvm.amdgcn.workitem.id.x, stored to %out (the store is not indexed).
; The second operand of llvm.ctlz is false. The amdgcn runs expect one
; v_ffbh_u32 and one v_min_u32 with 32 per element. The r600 run expects one
; FFBH_UINT and one CNDE_INT per element. The -global-isel run lists the
; elements in v0, v1 order while the other amdgcn runs list v1 before v0.
184define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
185; SI-LABEL: v_ctlz_v2i32:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
188; SI-NEXT:    s_mov_b32 s3, 0xf000
189; SI-NEXT:    s_mov_b32 s6, 0
190; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
191; SI-NEXT:    v_mov_b32_e32 v1, 0
192; SI-NEXT:    s_mov_b32 s7, s3
193; SI-NEXT:    s_waitcnt lgkmcnt(0)
194; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
195; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
196; SI-NEXT:    s_mov_b32 s2, -1
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    v_ffbh_u32_e32 v1, v1
199; SI-NEXT:    v_ffbh_u32_e32 v0, v0
200; SI-NEXT:    v_min_u32_e32 v1, 32, v1
201; SI-NEXT:    v_min_u32_e32 v0, 32, v0
202; SI-NEXT:    s_waitcnt lgkmcnt(0)
203; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
204; SI-NEXT:    s_endpgm
205;
206; VI-LABEL: v_ctlz_v2i32:
207; VI:       ; %bb.0:
208; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
209; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
210; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
211; VI-NEXT:    s_mov_b32 s7, 0xf000
212; VI-NEXT:    s_mov_b32 s6, -1
213; VI-NEXT:    s_waitcnt lgkmcnt(0)
214; VI-NEXT:    v_mov_b32_e32 v1, s1
215; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
216; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
217; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
218; VI-NEXT:    s_waitcnt vmcnt(0)
219; VI-NEXT:    v_ffbh_u32_e32 v1, v1
220; VI-NEXT:    v_ffbh_u32_e32 v0, v0
221; VI-NEXT:    v_min_u32_e32 v1, 32, v1
222; VI-NEXT:    v_min_u32_e32 v0, 32, v0
223; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
224; VI-NEXT:    s_endpgm
225;
226; EG-LABEL: v_ctlz_v2i32:
227; EG:       ; %bb.0:
228; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
229; EG-NEXT:    TEX 0 @6
230; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
231; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
232; EG-NEXT:    CF_END
233; EG-NEXT:    PAD
234; EG-NEXT:    Fetch clause starting at 6:
235; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
236; EG-NEXT:    ALU clause starting at 8:
237; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
238; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
239; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
240; EG-NEXT:    ALU clause starting at 11:
241; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
242; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
243; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
244; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
245; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
246; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
247; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
248;
249; GFX10-LABEL: v_ctlz_v2i32:
250; GFX10:       ; %bb.0:
251; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
252; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
253; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
254; GFX10-NEXT:    v_mov_b32_e32 v2, 0
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
257; GFX10-NEXT:    s_waitcnt vmcnt(0)
258; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
259; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
260; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
261; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
262; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
263; GFX10-NEXT:    s_endpgm
264;
265; GFX10-GISEL-LABEL: v_ctlz_v2i32:
266; GFX10-GISEL:       ; %bb.0:
267; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
268; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
269; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
270; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
271; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
273; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
274; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
275; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
276; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
277; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
278; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
279; GFX10-GISEL-NEXT:    s_endpgm
280  %tid = call i32 @llvm.amdgcn.workitem.id.x()
281  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
282  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
283  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
284  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
285  ret void
286}
287
; v_ctlz_v4i32 - ctlz of a <4 x i32> loaded from %valptr at the index returned
; by llvm.amdgcn.workitem.id.x, stored to %out (the store is not indexed).
; The second operand of llvm.ctlz is false. The amdgcn runs expect four
; v_ffbh_u32 and four v_min_u32 with 32, one pair per element. The r600 run
; expects four FFBH_UINT and four CNDE_INT. The -global-isel run lists the
; elements from v0 up to v3 while the other amdgcn runs list v3 down to v0.
288define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
289; SI-LABEL: v_ctlz_v4i32:
290; SI:       ; %bb.0:
291; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
292; SI-NEXT:    s_mov_b32 s3, 0xf000
293; SI-NEXT:    s_mov_b32 s6, 0
294; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
295; SI-NEXT:    v_mov_b32_e32 v1, 0
296; SI-NEXT:    s_mov_b32 s7, s3
297; SI-NEXT:    s_waitcnt lgkmcnt(0)
298; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
299; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
300; SI-NEXT:    s_mov_b32 s2, -1
301; SI-NEXT:    s_waitcnt vmcnt(0)
302; SI-NEXT:    v_ffbh_u32_e32 v3, v3
303; SI-NEXT:    v_ffbh_u32_e32 v2, v2
304; SI-NEXT:    v_ffbh_u32_e32 v1, v1
305; SI-NEXT:    v_ffbh_u32_e32 v0, v0
306; SI-NEXT:    v_min_u32_e32 v3, 32, v3
307; SI-NEXT:    v_min_u32_e32 v2, 32, v2
308; SI-NEXT:    v_min_u32_e32 v1, 32, v1
309; SI-NEXT:    v_min_u32_e32 v0, 32, v0
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
312; SI-NEXT:    s_endpgm
313;
314; VI-LABEL: v_ctlz_v4i32:
315; VI:       ; %bb.0:
316; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
317; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
318; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
319; VI-NEXT:    s_mov_b32 s7, 0xf000
320; VI-NEXT:    s_mov_b32 s6, -1
321; VI-NEXT:    s_waitcnt lgkmcnt(0)
322; VI-NEXT:    v_mov_b32_e32 v1, s1
323; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
324; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
325; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
326; VI-NEXT:    s_waitcnt vmcnt(0)
327; VI-NEXT:    v_ffbh_u32_e32 v3, v3
328; VI-NEXT:    v_ffbh_u32_e32 v2, v2
329; VI-NEXT:    v_ffbh_u32_e32 v1, v1
330; VI-NEXT:    v_ffbh_u32_e32 v0, v0
331; VI-NEXT:    v_min_u32_e32 v3, 32, v3
332; VI-NEXT:    v_min_u32_e32 v2, 32, v2
333; VI-NEXT:    v_min_u32_e32 v1, 32, v1
334; VI-NEXT:    v_min_u32_e32 v0, 32, v0
335; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
336; VI-NEXT:    s_endpgm
337;
338; EG-LABEL: v_ctlz_v4i32:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
350; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
351; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
352; EG-NEXT:    ALU clause starting at 11:
353; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
354; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
355; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
356; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
357; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
358; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
359; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
360; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
361; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
362; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
363; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
365; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
366;
367; GFX10-LABEL: v_ctlz_v4i32:
368; GFX10:       ; %bb.0:
369; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
370; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
371; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
372; GFX10-NEXT:    v_mov_b32_e32 v4, 0
373; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
375; GFX10-NEXT:    s_waitcnt vmcnt(0)
376; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
377; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
378; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
379; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
380; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
381; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
382; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
383; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
384; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
385; GFX10-NEXT:    s_endpgm
386;
387; GFX10-GISEL-LABEL: v_ctlz_v4i32:
388; GFX10-GISEL:       ; %bb.0:
389; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
390; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
391; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
392; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
393; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
395; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
396; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
397; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
398; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
399; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
400; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
401; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
402; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
403; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
404; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
405; GFX10-GISEL-NEXT:    s_endpgm
406  %tid = call i32 @llvm.amdgcn.workitem.id.x()
407  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
408  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
409  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
410  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
411  ret void
412}
413
; v_ctlz_i8 - ctlz of an i8 loaded straight from %valptr (no workitem index)
; and stored as an i8 to %out. The second operand of llvm.ctlz is false.
; The amdgcn runs expect a byte load, a 32-bit v_ffbh_u32 and v_min_u32 with
; 32, and then 24 removed from the count. That is a single subtract of 24 on
; SI and in the -global-isel run, and an add of -16 followed by a 16-bit add
; of -8 on VI and GFX10. The r600 run expects FFBH_UINT, CNDE_INT, an ADD_INT
; with the literal -24 and a MEM_RAT MSKOR store.
414define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
415; SI-LABEL: v_ctlz_i8:
416; SI:       ; %bb.0:
417; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
418; SI-NEXT:    s_mov_b32 s3, 0xf000
419; SI-NEXT:    s_mov_b32 s2, -1
420; SI-NEXT:    s_mov_b32 s6, s2
421; SI-NEXT:    s_mov_b32 s7, s3
422; SI-NEXT:    s_waitcnt lgkmcnt(0)
423; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
424; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
425; SI-NEXT:    s_waitcnt vmcnt(0)
426; SI-NEXT:    v_ffbh_u32_e32 v0, v0
427; SI-NEXT:    v_min_u32_e32 v0, 32, v0
428; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
429; SI-NEXT:    s_waitcnt lgkmcnt(0)
430; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
431; SI-NEXT:    s_endpgm
432;
433; VI-LABEL: v_ctlz_i8:
434; VI:       ; %bb.0:
435; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
436; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
437; VI-NEXT:    s_mov_b32 s7, 0xf000
438; VI-NEXT:    s_mov_b32 s6, -1
439; VI-NEXT:    s_mov_b32 s2, s6
440; VI-NEXT:    s_mov_b32 s3, s7
441; VI-NEXT:    s_waitcnt lgkmcnt(0)
442; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
443; VI-NEXT:    s_waitcnt vmcnt(0)
444; VI-NEXT:    v_ffbh_u32_e32 v0, v0
445; VI-NEXT:    v_min_u32_e32 v0, 32, v0
446; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
447; VI-NEXT:    v_add_u16_e32 v0, -8, v0
448; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
449; VI-NEXT:    s_endpgm
450;
451; EG-LABEL: v_ctlz_i8:
452; EG:       ; %bb.0:
453; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
454; EG-NEXT:    TEX 0 @6
455; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
456; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
457; EG-NEXT:    CF_END
458; EG-NEXT:    PAD
459; EG-NEXT:    Fetch clause starting at 6:
460; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
461; EG-NEXT:    ALU clause starting at 8:
462; EG-NEXT:     MOV * T0.X, KC0[2].Z,
463; EG-NEXT:    ALU clause starting at 9:
464; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
465; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
466; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
467; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
468; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
469; EG-NEXT:    -24(nan), 0(0.000000e+00)
470; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
471; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
472; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
473; EG-NEXT:     LSHL T0.X, PV.W, PS,
474; EG-NEXT:     LSHL * T0.W, literal.x, PS,
475; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
476; EG-NEXT:     MOV T0.Y, 0.0,
477; EG-NEXT:     MOV * T0.Z, 0.0,
478; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
479; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
480;
481; GFX10-LABEL: v_ctlz_i8:
482; GFX10:       ; %bb.0:
483; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
484; GFX10-NEXT:    v_mov_b32_e32 v0, 0
485; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
486; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
488; GFX10-NEXT:    s_waitcnt vmcnt(0)
489; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
490; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
491; GFX10-NEXT:    v_add_nc_u32_e32 v1, -16, v1
492; GFX10-NEXT:    v_add_nc_u16 v1, v1, -8
493; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
494; GFX10-NEXT:    s_endpgm
495;
496; GFX10-GISEL-LABEL: v_ctlz_i8:
497; GFX10-GISEL:       ; %bb.0:
498; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
499; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
500; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
501; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
503; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
504; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
505; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
506; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
507; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
508; GFX10-GISEL-NEXT:    s_endpgm
509  %val = load i8, i8 addrspace(1)* %valptr
510  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
511  store i8 %ctlz, i8 addrspace(1)* %out
512  ret void
513}
514
; s_ctlz_i64 - ctlz of an i64 kernel argument, stored as an i64 to %out. The
; second operand of llvm.ctlz is false. An unnamed [8 x i32] argument sits
; between %out and %val, and the checks load %val from offset 0x13 on SI and
; 0x4c on VI and GFX10 (the reason for the padding is not stated here;
; presumably it keeps the two argument loads apart - TODO confirm).
; The amdgcn runs without -global-isel expect s_flbit_i32_b32 on each 32-bit
; half, 32 added to one of the results (after a min against 0xffffffdf on SI,
; with clamp on VI and GFX10) and v_min3_u32 with 64. The -global-isel run
; expects a single s_flbit_i32_b64 followed by s_min_u32 with 64. The r600
; run expects FFBH_UINT on both halves combined through CNDE_INT.
515define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
516; SI-LABEL: s_ctlz_i64:
517; SI:       ; %bb.0:
518; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
519; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
520; SI-NEXT:    s_mov_b32 s3, 0xf000
521; SI-NEXT:    s_mov_b32 s2, -1
522; SI-NEXT:    s_waitcnt lgkmcnt(0)
523; SI-NEXT:    s_flbit_i32_b32 s4, s4
524; SI-NEXT:    s_flbit_i32_b32 s5, s5
525; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
526; SI-NEXT:    v_mov_b32_e32 v0, s5
527; SI-NEXT:    s_add_i32 s4, s4, 32
528; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
529; SI-NEXT:    v_mov_b32_e32 v1, 0
530; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
531; SI-NEXT:    s_endpgm
532;
533; VI-LABEL: s_ctlz_i64:
534; VI:       ; %bb.0:
535; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
536; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
537; VI-NEXT:    s_mov_b32 s7, 0xf000
538; VI-NEXT:    s_mov_b32 s6, -1
539; VI-NEXT:    v_mov_b32_e32 v1, 0
540; VI-NEXT:    s_waitcnt lgkmcnt(0)
541; VI-NEXT:    s_flbit_i32_b32 s0, s0
542; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s0, 32 clamp
543; VI-NEXT:    s_flbit_i32_b32 s0, s1
544; VI-NEXT:    v_min3_u32 v0, v0, s0, 64
545; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
546; VI-NEXT:    s_endpgm
547;
548; EG-LABEL: s_ctlz_i64:
549; EG:       ; %bb.0:
550; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
551; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
552; EG-NEXT:    CF_END
553; EG-NEXT:    PAD
554; EG-NEXT:    ALU clause starting at 4:
555; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
556; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
557; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
558; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
559; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
560; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
561; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
562; EG-NEXT:     MOV T0.Y, 0.0,
563; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
564; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
565;
566; GFX10-LABEL: s_ctlz_i64:
567; GFX10:       ; %bb.0:
568; GFX10-NEXT:    s_clause 0x1
569; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
570; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
571; GFX10-NEXT:    v_mov_b32_e32 v1, 0
572; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
574; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
575; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
576; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
577; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
578; GFX10-NEXT:    s_endpgm
579;
580; GFX10-GISEL-LABEL: s_ctlz_i64:
581; GFX10-GISEL:       ; %bb.0:
582; GFX10-GISEL-NEXT:    s_clause 0x1
583; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
584; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
585; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
586; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
588; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
589; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
590; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
591; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
592; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
593; GFX10-GISEL-NEXT:    s_endpgm
594  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
595  store i64 %ctlz, i64 addrspace(1)* %out
596  ret void
597}
598
; s_ctlz_i64_trunc - ctlz of an i64 kernel argument whose result is truncated
; to i32 before being stored to %out. The second operand of llvm.ctlz is false.
; The amdgcn runs without -global-isel expect s_flbit_i32_b32 on each 32-bit
; half, 32 added to one of the results and v_min3_u32 with 64. The
; -global-isel run expects s_flbit_i32_b64 followed by s_min_u32 with 64.
; Every run expects a single-dword store of the result.
599define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
600; SI-LABEL: s_ctlz_i64_trunc:
601; SI:       ; %bb.0:
602; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
603; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
604; SI-NEXT:    s_mov_b32 s3, 0xf000
605; SI-NEXT:    s_mov_b32 s2, -1
606; SI-NEXT:    s_waitcnt lgkmcnt(0)
607; SI-NEXT:    s_flbit_i32_b32 s4, s4
608; SI-NEXT:    s_min_u32 s4, s4, 0xffffffdf
609; SI-NEXT:    s_flbit_i32_b32 s5, s5
610; SI-NEXT:    s_add_i32 s4, s4, 32
611; SI-NEXT:    v_mov_b32_e32 v0, s5
612; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
613; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
614; SI-NEXT:    s_endpgm
615;
616; VI-LABEL: s_ctlz_i64_trunc:
617; VI:       ; %bb.0:
618; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
619; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
620; VI-NEXT:    s_mov_b32 s7, 0xf000
621; VI-NEXT:    s_mov_b32 s6, -1
622; VI-NEXT:    s_waitcnt lgkmcnt(0)
623; VI-NEXT:    s_flbit_i32_b32 s0, s0
624; VI-NEXT:    v_add_u32_e64 v0, s[2:3], s0, 32 clamp
625; VI-NEXT:    s_flbit_i32_b32 s0, s1
626; VI-NEXT:    v_min3_u32 v0, v0, s0, 64
627; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
628; VI-NEXT:    s_endpgm
629;
630; EG-LABEL: s_ctlz_i64_trunc:
631; EG:       ; %bb.0:
632; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
633; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
634; EG-NEXT:    CF_END
635; EG-NEXT:    PAD
636; EG-NEXT:    ALU clause starting at 4:
637; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
638; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
639; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
640; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
641; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
642; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
643; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
644; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
645; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
646;
647; GFX10-LABEL: s_ctlz_i64_trunc:
648; GFX10:       ; %bb.0:
649; GFX10-NEXT:    s_clause 0x1
650; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
651; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
652; GFX10-NEXT:    v_mov_b32_e32 v1, 0
653; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX10-NEXT:    s_flbit_i32_b32 s0, s2
655; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
656; GFX10-NEXT:    s_flbit_i32_b32 s0, s3
657; GFX10-NEXT:    v_min3_u32 v0, v0, s0, 64
658; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
659; GFX10-NEXT:    s_endpgm
660;
661; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
662; GFX10-GISEL:       ; %bb.0:
663; GFX10-GISEL-NEXT:    s_clause 0x1
664; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
665; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
666; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
667; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
668; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
669; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
670; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
671; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
672; GFX10-GISEL-NEXT:    s_endpgm
673  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
674  %trunc = trunc i64 %ctlz to i32
675  store i32 %trunc, i32 addrspace(1)* %out
676  ret void
677}
678
; v_ctlz_i64 - ctlz of an i64 loaded from %in and stored to %out, with both
; pointers indexed by llvm.amdgcn.workitem.id.x. The second operand of
; llvm.ctlz is false. The amdgcn runs expect v_ffbh_u32 on each 32-bit half,
; 32 added to one of the results (after a min against 0xffffffdf on SI, with
; clamp elsewhere), then a min with the other result and with 64. That min is
; one v_min3_u32, or two v_min_u32 in the -global-isel run. The second dword
; of the stored pair comes from a register that was set to 0. The r600 run
; expects FFBH_UINT on both halves combined through CNDE_INT.
679define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
680; SI-LABEL: v_ctlz_i64:
681; SI:       ; %bb.0:
682; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
683; SI-NEXT:    s_mov_b32 s7, 0xf000
684; SI-NEXT:    s_mov_b32 s6, 0
685; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
686; SI-NEXT:    v_mov_b32_e32 v1, 0
687; SI-NEXT:    s_waitcnt lgkmcnt(0)
688; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
689; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
690; SI-NEXT:    s_waitcnt vmcnt(0)
691; SI-NEXT:    v_ffbh_u32_e32 v2, v2
692; SI-NEXT:    v_min_u32_e32 v2, 0xffffffdf, v2
693; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
694; SI-NEXT:    v_ffbh_u32_e32 v3, v3
695; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
696; SI-NEXT:    v_mov_b32_e32 v3, v1
697; SI-NEXT:    s_waitcnt lgkmcnt(0)
698; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
699; SI-NEXT:    s_endpgm
700;
701; VI-LABEL: v_ctlz_i64:
702; VI:       ; %bb.0:
703; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
704; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
705; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
706; VI-NEXT:    v_mov_b32_e32 v2, 0
707; VI-NEXT:    s_waitcnt lgkmcnt(0)
708; VI-NEXT:    v_mov_b32_e32 v4, s3
709; VI-NEXT:    v_mov_b32_e32 v1, s1
710; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
711; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
712; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
713; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
714; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
715; VI-NEXT:    s_waitcnt vmcnt(0)
716; VI-NEXT:    v_ffbh_u32_e32 v0, v0
717; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
718; VI-NEXT:    v_ffbh_u32_e32 v1, v1
719; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
720; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
721; VI-NEXT:    s_endpgm
722;
723; EG-LABEL: v_ctlz_i64:
724; EG:       ; %bb.0:
725; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
726; EG-NEXT:    TEX 0 @6
727; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
728; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
729; EG-NEXT:    CF_END
730; EG-NEXT:    PAD
731; EG-NEXT:    Fetch clause starting at 6:
732; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
733; EG-NEXT:    ALU clause starting at 8:
734; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
735; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
736; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
737; EG-NEXT:    ALU clause starting at 11:
738; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
739; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
740; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
741; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
742; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
743; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
744; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
745; EG-NEXT:     MOV T0.Y, 0.0,
746; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
747; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
748; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
749;
750; GFX10-LABEL: v_ctlz_i64:
751; GFX10:       ; %bb.0:
752; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
753; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
754; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
755; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
756; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
757; GFX10-NEXT:    s_waitcnt vmcnt(0)
758; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
759; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
760; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
761; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
762; GFX10-NEXT:    v_mov_b32_e32 v1, 0
763; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
764; GFX10-NEXT:    s_endpgm
765;
766; GFX10-GISEL-LABEL: v_ctlz_i64:
767; GFX10-GISEL:       ; %bb.0:
768; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
769; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
770; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
771; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
773; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
774; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
775; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
776; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
777; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
778; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
779; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
780; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
781; GFX10-GISEL-NEXT:    s_endpgm
782  %tid = call i32 @llvm.amdgcn.workitem.id.x()
783  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
784  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
785  %val = load i64, i64 addrspace(1)* %in.gep
786  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
787  store i64 %ctlz, i64 addrspace(1)* %out.gep
788  ret void
789}
790
; Per-workitem 64-bit ctlz whose result is narrowed to 32 bits: reads an i64
; from %in[tid], calls llvm.ctlz.i64 with is_zero_undef = false, truncates the
; count to i32 and writes it to %out[tid].
; The AMDGCN checks expect one v_ffbh_u32 per loaded dword, 32 added to the
; count of the first (low) dword, and the combined result limited to 64
; (a single v_min3_u32 for SelectionDAG, two v_min_u32 for GlobalISel).
791define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
792; SI-LABEL: v_ctlz_i64_trunc:
793; SI:       ; %bb.0:
794; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
795; SI-NEXT:    s_mov_b32 s7, 0xf000
796; SI-NEXT:    s_mov_b32 s6, 0
797; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
798; SI-NEXT:    v_mov_b32_e32 v2, 0
799; SI-NEXT:    s_waitcnt lgkmcnt(0)
800; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
801; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
802; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
803; SI-NEXT:    s_waitcnt vmcnt(0)
804; SI-NEXT:    v_ffbh_u32_e32 v0, v3
805; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
806; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
807; SI-NEXT:    v_ffbh_u32_e32 v3, v4
808; SI-NEXT:    v_min3_u32 v0, v0, v3, 64
809; SI-NEXT:    s_waitcnt lgkmcnt(0)
810; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
811; SI-NEXT:    s_endpgm
812;
813; VI-LABEL: v_ctlz_i64_trunc:
814; VI:       ; %bb.0:
815; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
816; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
817; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
818; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
819; VI-NEXT:    s_waitcnt lgkmcnt(0)
820; VI-NEXT:    v_mov_b32_e32 v4, s3
821; VI-NEXT:    v_mov_b32_e32 v2, s1
822; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
823; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
824; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
825; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
826; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
827; VI-NEXT:    s_waitcnt vmcnt(0)
828; VI-NEXT:    v_ffbh_u32_e32 v0, v1
829; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
830; VI-NEXT:    v_ffbh_u32_e32 v1, v2
831; VI-NEXT:    v_min3_u32 v0, v0, v1, 64
832; VI-NEXT:    flat_store_dword v[3:4], v0
833; VI-NEXT:    s_endpgm
834;
835; EG-LABEL: v_ctlz_i64_trunc:
836; EG:       ; %bb.0:
837; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
838; EG-NEXT:    TEX 0 @6
839; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
840; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
841; EG-NEXT:    CF_END
842; EG-NEXT:    PAD
843; EG-NEXT:    Fetch clause starting at 6:
844; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
845; EG-NEXT:    ALU clause starting at 8:
846; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
847; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
848; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
849; EG-NEXT:    ALU clause starting at 11:
850; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
851; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
852; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
853; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
854; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
855; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
856; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
857; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
858; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
859; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
860; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
861;
862; GFX10-LABEL: v_ctlz_i64_trunc:
863; GFX10:       ; %bb.0:
864; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
865; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
866; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
867; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
868; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
870; GFX10-NEXT:    s_waitcnt vmcnt(0)
871; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
872; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
873; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
874; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
875; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
876; GFX10-NEXT:    s_endpgm
877;
878; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
879; GFX10-GISEL:       ; %bb.0:
880; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
881; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
882; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
883; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
884; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
886; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
887; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
888; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
889; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
890; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
891; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
892; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
893; GFX10-GISEL-NEXT:    s_endpgm
  ; Both pointers are indexed by the workitem id, but with different element
  ; types (i64 in, i32 out); this is why the checks shift the id by 3 for the
  ; load address and by 2 for the store address.
894  %tid = call i32 @llvm.amdgcn.workitem.id.x()
895  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
896  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
897  %val = load i64, i64 addrspace(1)* %in.gep
  ; i1 false: is_zero_undef is off, so a zero input must give a defined count.
898  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  ; Only the low 32 bits of the 64-bit count are stored.
899  %trunc = trunc i64 %ctlz to i32
900  store i32 %trunc, i32 addrspace(1)* %out.gep
901  ret void
902}
903
; 32-bit ctlz combined with the pattern "input == 0 ? -1 : ctlz(input)".
; Loads an i32 from %valptr[tid] and stores the selected value to %out
; (the store address is not indexed by the workitem id).
; The SelectionDAG checks for the AMDGCN targets expect a bare v_ffbh_u32
; feeding the store, i.e. no compare/select survives; the GlobalISel checks
; still contain v_cmp_eq_u32, v_min_u32 and v_cndmask_b32.
904define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
905; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
906; SI:       ; %bb.0:
907; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
908; SI-NEXT:    s_mov_b32 s3, 0xf000
909; SI-NEXT:    s_mov_b32 s6, 0
910; SI-NEXT:    s_mov_b32 s7, s3
911; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
912; SI-NEXT:    v_mov_b32_e32 v1, 0
913; SI-NEXT:    s_waitcnt lgkmcnt(0)
914; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
915; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
916; SI-NEXT:    s_mov_b32 s2, -1
917; SI-NEXT:    s_waitcnt vmcnt(0)
918; SI-NEXT:    v_ffbh_u32_e32 v0, v0
919; SI-NEXT:    s_waitcnt lgkmcnt(0)
920; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
921; SI-NEXT:    s_endpgm
922;
923; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
924; VI:       ; %bb.0:
925; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
926; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
927; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
928; VI-NEXT:    s_mov_b32 s7, 0xf000
929; VI-NEXT:    s_mov_b32 s6, -1
930; VI-NEXT:    s_waitcnt lgkmcnt(0)
931; VI-NEXT:    v_mov_b32_e32 v1, s1
932; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
933; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
934; VI-NEXT:    flat_load_dword v0, v[0:1]
935; VI-NEXT:    s_waitcnt vmcnt(0)
936; VI-NEXT:    v_ffbh_u32_e32 v0, v0
937; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
938; VI-NEXT:    s_endpgm
939;
940; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
941; EG:       ; %bb.0:
942; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
943; EG-NEXT:    TEX 0 @6
944; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
945; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
946; EG-NEXT:    CF_END
947; EG-NEXT:    PAD
948; EG-NEXT:    Fetch clause starting at 6:
949; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
950; EG-NEXT:    ALU clause starting at 8:
951; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
952; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
953; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
954; EG-NEXT:    ALU clause starting at 11:
955; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
956; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
957; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
958; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
959; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
960; EG-NEXT:    -1(nan), 2(2.802597e-45)
961;
962; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
963; GFX10:       ; %bb.0:
964; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
965; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
966; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
967; GFX10-NEXT:    v_mov_b32_e32 v1, 0
968; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
969; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
970; GFX10-NEXT:    s_waitcnt vmcnt(0)
971; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
972; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
973; GFX10-NEXT:    s_endpgm
974;
975; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
976; GFX10-GISEL:       ; %bb.0:
977; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
978; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
979; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
980; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
981; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
982; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
983; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
984; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
985; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
986; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
987; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
988; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
989; GFX10-GISEL-NEXT:    s_endpgm
  ; Per-workitem load so the ctlz operand is a vector (non-uniform) value.
990  %tid = call i32 @llvm.amdgcn.workitem.id.x()
991  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
992  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
993  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; Replace the zero-input result with -1; the compare is on the INPUT value.
  ; Presumably foldable because v_ffbh_u32 itself yields -1 for a zero
  ; operand (confirm against the ISA manual).
994  %cmp = icmp eq i32 %val, 0
995  %sel = select i1 %cmp, i32 -1, i32 %ctlz
996  store i32 %sel, i32 addrspace(1)* %out
997  ret void
998}
999
; Same computation as the "input == 0 ? -1 : ctlz" test, but written with the
; inverted predicate and swapped select arms: "input != 0 ? ctlz(input) : -1".
; Loads an i32 from %valptr[tid] and stores the selected value to %out
; (the store address is not indexed by the workitem id).
; The SelectionDAG checks for the AMDGCN targets expect a bare v_ffbh_u32
; feeding the store; the GlobalISel checks still contain v_cmp_ne_u32,
; v_min_u32 and v_cndmask_b32.
1000define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1001; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
1002; SI:       ; %bb.0:
1003; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1004; SI-NEXT:    s_mov_b32 s3, 0xf000
1005; SI-NEXT:    s_mov_b32 s6, 0
1006; SI-NEXT:    s_mov_b32 s7, s3
1007; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1008; SI-NEXT:    v_mov_b32_e32 v1, 0
1009; SI-NEXT:    s_waitcnt lgkmcnt(0)
1010; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1011; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1012; SI-NEXT:    s_mov_b32 s2, -1
1013; SI-NEXT:    s_waitcnt vmcnt(0)
1014; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1015; SI-NEXT:    s_waitcnt lgkmcnt(0)
1016; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1017; SI-NEXT:    s_endpgm
1018;
1019; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
1020; VI:       ; %bb.0:
1021; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1022; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1023; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1024; VI-NEXT:    s_mov_b32 s7, 0xf000
1025; VI-NEXT:    s_mov_b32 s6, -1
1026; VI-NEXT:    s_waitcnt lgkmcnt(0)
1027; VI-NEXT:    v_mov_b32_e32 v1, s1
1028; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1029; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1030; VI-NEXT:    flat_load_dword v0, v[0:1]
1031; VI-NEXT:    s_waitcnt vmcnt(0)
1032; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1033; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1034; VI-NEXT:    s_endpgm
1035;
1036; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
1037; EG:       ; %bb.0:
1038; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1039; EG-NEXT:    TEX 0 @6
1040; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1041; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1042; EG-NEXT:    CF_END
1043; EG-NEXT:    PAD
1044; EG-NEXT:    Fetch clause starting at 6:
1045; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1046; EG-NEXT:    ALU clause starting at 8:
1047; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1048; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1049; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1050; EG-NEXT:    ALU clause starting at 11:
1051; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1052; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1053; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1054; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1055; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1056; EG-NEXT:    -1(nan), 2(2.802597e-45)
1057;
1058; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
1059; GFX10:       ; %bb.0:
1060; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1061; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1062; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1063; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1064; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1066; GFX10-NEXT:    s_waitcnt vmcnt(0)
1067; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1068; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1069; GFX10-NEXT:    s_endpgm
1070;
1071; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
1072; GFX10-GISEL:       ; %bb.0:
1073; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1074; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1075; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1076; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1078; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1079; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1080; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1081; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1082; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1083; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1084; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1085; GFX10-GISEL-NEXT:    s_endpgm
  ; Per-workitem load so the ctlz operand is a vector (non-uniform) value.
1086  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1087  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1088  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
1089  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; "ne" predicate with the ctlz result on the true arm and -1 on the false
  ; arm; the compare is on the INPUT value.
1090  %cmp = icmp ne i32 %val, 0
1091  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1092  store i32 %sel, i32 addrspace(1)* %out
1093  ret void
1094}
1095
1096; TODO: Should be able to eliminate select here as well.
; 32-bit ctlz combined with "ctlz(input) == 32 ? -1 : ctlz(input)": the zero
; case is detected by comparing the RESULT against the bit width instead of
; comparing the input against zero.
; Loads an i32 from %valptr[tid] and stores the selected value to %out
; (the store address is not indexed by the workitem id).
; Every AMDGCN check block still expects v_min_u32, a compare against 32 and
; v_cndmask_b32 after v_ffbh_u32, i.e. the select is not folded away here.
1097define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1098; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1099; SI:       ; %bb.0:
1100; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1101; SI-NEXT:    s_mov_b32 s3, 0xf000
1102; SI-NEXT:    s_mov_b32 s6, 0
1103; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1104; SI-NEXT:    v_mov_b32_e32 v1, 0
1105; SI-NEXT:    s_mov_b32 s7, s3
1106; SI-NEXT:    s_waitcnt lgkmcnt(0)
1107; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1108; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1109; SI-NEXT:    s_mov_b32 s2, -1
1110; SI-NEXT:    s_waitcnt vmcnt(0)
1111; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1112; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1113; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1114; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1115; SI-NEXT:    s_waitcnt lgkmcnt(0)
1116; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1117; SI-NEXT:    s_endpgm
1118;
1119; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1120; VI:       ; %bb.0:
1121; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1122; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1123; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1124; VI-NEXT:    s_mov_b32 s7, 0xf000
1125; VI-NEXT:    s_mov_b32 s6, -1
1126; VI-NEXT:    s_waitcnt lgkmcnt(0)
1127; VI-NEXT:    v_mov_b32_e32 v1, s1
1128; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1129; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1130; VI-NEXT:    flat_load_dword v0, v[0:1]
1131; VI-NEXT:    s_waitcnt vmcnt(0)
1132; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1133; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1134; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1135; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1136; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1137; VI-NEXT:    s_endpgm
1138;
1139; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1140; EG:       ; %bb.0:
1141; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1142; EG-NEXT:    TEX 0 @6
1143; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1144; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1145; EG-NEXT:    CF_END
1146; EG-NEXT:    PAD
1147; EG-NEXT:    Fetch clause starting at 6:
1148; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1149; EG-NEXT:    ALU clause starting at 8:
1150; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1151; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1152; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1153; EG-NEXT:    ALU clause starting at 11:
1154; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1155; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1156; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1157; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1158; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1159; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1160; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1161; EG-NEXT:    -1(nan), 2(2.802597e-45)
1162;
1163; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1164; GFX10:       ; %bb.0:
1165; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1166; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1167; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1168; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1169; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1171; GFX10-NEXT:    s_waitcnt vmcnt(0)
1172; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1173; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1174; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1175; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1176; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1177; GFX10-NEXT:    s_endpgm
1178;
1179; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1180; GFX10-GISEL:       ; %bb.0:
1181; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1182; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1183; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1184; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1185; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1186; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1187; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1188; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1189; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1190; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
1191; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1192; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1193; GFX10-GISEL-NEXT:    s_endpgm
  ; Per-workitem load so the ctlz operand is a vector (non-uniform) value.
1194  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1195  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1196  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
1197  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is on the ctlz RESULT (against the bit width, 32), not on the
  ; input value.
1198  %cmp = icmp eq i32 %ctlz, 32
1199  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1200  store i32 %sel, i32 addrspace(1)* %out
1201  ret void
1202}
1203
; 32-bit ctlz combined with "ctlz(input) != 32 ? ctlz(input) : -1": the
; inverted-predicate form of the compare-result-against-bit-width pattern.
; Loads an i32 from %valptr[tid] and stores the selected value to %out
; (the store address is not indexed by the workitem id).
; Every AMDGCN check block expects v_ffbh_u32, v_min_u32 with 32,
; v_cmp_ne_u32 against 32 and v_cndmask_b32, i.e. the select is not folded.
1204define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1205; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1206; SI:       ; %bb.0:
1207; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1208; SI-NEXT:    s_mov_b32 s3, 0xf000
1209; SI-NEXT:    s_mov_b32 s6, 0
1210; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1211; SI-NEXT:    v_mov_b32_e32 v1, 0
1212; SI-NEXT:    s_mov_b32 s7, s3
1213; SI-NEXT:    s_waitcnt lgkmcnt(0)
1214; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1215; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1216; SI-NEXT:    s_mov_b32 s2, -1
1217; SI-NEXT:    s_waitcnt vmcnt(0)
1218; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1219; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1220; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1221; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1222; SI-NEXT:    s_waitcnt lgkmcnt(0)
1223; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1224; SI-NEXT:    s_endpgm
1225;
1226; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1227; VI:       ; %bb.0:
1228; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1229; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1230; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1231; VI-NEXT:    s_mov_b32 s7, 0xf000
1232; VI-NEXT:    s_mov_b32 s6, -1
1233; VI-NEXT:    s_waitcnt lgkmcnt(0)
1234; VI-NEXT:    v_mov_b32_e32 v1, s1
1235; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1236; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1237; VI-NEXT:    flat_load_dword v0, v[0:1]
1238; VI-NEXT:    s_waitcnt vmcnt(0)
1239; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1240; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1241; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1242; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1243; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1244; VI-NEXT:    s_endpgm
1245;
1246; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1247; EG:       ; %bb.0:
1248; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1249; EG-NEXT:    TEX 0 @6
1250; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1251; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1252; EG-NEXT:    CF_END
1253; EG-NEXT:    PAD
1254; EG-NEXT:    Fetch clause starting at 6:
1255; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1256; EG-NEXT:    ALU clause starting at 8:
1257; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1258; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1259; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1260; EG-NEXT:    ALU clause starting at 11:
1261; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1262; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1263; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1264; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1265; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1266; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1267; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1268; EG-NEXT:    -1(nan), 2(2.802597e-45)
1269;
1270; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1271; GFX10:       ; %bb.0:
1272; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1273; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1274; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1275; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1276; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1278; GFX10-NEXT:    s_waitcnt vmcnt(0)
1279; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1280; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1281; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1282; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1283; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1284; GFX10-NEXT:    s_endpgm
1285;
1286; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1287; GFX10-GISEL:       ; %bb.0:
1288; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1289; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1290; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1291; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1292; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1294; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1295; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1296; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1297; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1298; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1299; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1300; GFX10-GISEL-NEXT:    s_endpgm
  ; Per-workitem load so the ctlz operand is a vector (non-uniform) value.
1301  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1302  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1303  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
1304  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is on the ctlz RESULT (against the bit width, 32); the ctlz
  ; value sits on the true arm and -1 on the false arm.
1305  %cmp = icmp ne i32 %ctlz, 32
1306  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1307  store i32 %sel, i32 addrspace(1)* %out
1308  ret void
1309}
1310
; 8-bit variant of the "input == 0 ? -1 : ctlz(input)" pattern.
; Loads an i8 from %valptr[tid], calls llvm.ctlz.i8 with is_zero_undef = false
; and stores the selected i8 to %out (store address not indexed by the id).
; The SelectionDAG checks for the AMDGCN targets expect an unsigned byte load,
; a bare v_ffbh_u32 and a byte store; the GlobalISel checks keep v_min_u32,
; a subtract of 24 and a compare/select.
; (review) The SelectionDAG expectations show no visible -24 adjustment after
; the 32-bit v_ffbh_u32 on the zero-extended byte, unlike the GlobalISel
; output -- confirm that this expectation is intended.
1311 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
1312; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
1313; SI:       ; %bb.0:
1314; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1315; SI-NEXT:    s_mov_b32 s3, 0xf000
1316; SI-NEXT:    v_mov_b32_e32 v1, 0
1317; SI-NEXT:    s_mov_b32 s6, 0
1318; SI-NEXT:    s_mov_b32 s7, s3
1319; SI-NEXT:    s_waitcnt lgkmcnt(0)
1320; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1321; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1322; SI-NEXT:    s_mov_b32 s2, -1
1323; SI-NEXT:    s_waitcnt vmcnt(0)
1324; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1325; SI-NEXT:    s_waitcnt lgkmcnt(0)
1326; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1327; SI-NEXT:    s_endpgm
1328;
1329; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
1330; VI:       ; %bb.0:
1331; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1332; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1333; VI-NEXT:    s_mov_b32 s7, 0xf000
1334; VI-NEXT:    s_mov_b32 s6, -1
1335; VI-NEXT:    s_waitcnt lgkmcnt(0)
1336; VI-NEXT:    v_mov_b32_e32 v1, s1
1337; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1338; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1339; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1340; VI-NEXT:    s_waitcnt vmcnt(0)
1341; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1342; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1343; VI-NEXT:    s_endpgm
1344;
1345; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1346; EG:       ; %bb.0:
1347; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1348; EG-NEXT:    TEX 0 @6
1349; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1350; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1351; EG-NEXT:    CF_END
1352; EG-NEXT:    PAD
1353; EG-NEXT:    Fetch clause starting at 6:
1354; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1355; EG-NEXT:    ALU clause starting at 8:
1356; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1357; EG-NEXT:    ALU clause starting at 9:
1358; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1359; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1360; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1361; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1362; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1363; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1364; EG-NEXT:     LSHL T0.X, PV.W, PS,
1365; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1366; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1367; EG-NEXT:     MOV T0.Y, 0.0,
1368; EG-NEXT:     MOV * T0.Z, 0.0,
1369; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1370; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1371;
1372; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
1373; GFX10:       ; %bb.0:
1374; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1375; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1376; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1377; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1378; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1379; GFX10-NEXT:    s_waitcnt vmcnt(0)
1380; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1381; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1382; GFX10-NEXT:    s_endpgm
1383;
1384; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
1385; GFX10-GISEL:       ; %bb.0:
1386; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1387; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1388; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1389; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1391; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1392; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1393; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1394; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1395; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1396; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1397; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1398; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1399; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
1400; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1401; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1402; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1403; GFX10-GISEL-NEXT:    s_endpgm
  ; Byte-indexed per-workitem load (element type i8, so the id is used as a
  ; byte offset without any shift in the checks).
1404  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1405  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1406  %val = load i8, i8 addrspace(1)* %valptr.gep
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
1407  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  ; Replace the zero-input result with -1; the compare is on the INPUT value.
1408  %cmp = icmp eq i8 %val, 0
1409  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1410  store i8 %sel, i8 addrspace(1)* %out
1411  ret void
1412}
1413
; 16-bit variant of the "input == 0 ? -1 : ctlz(input)" pattern.
; Unlike the neighbouring tests, the i16 is loaded directly from %valptr with
; no workitem-id indexing; the selected i16 is stored to %out.
; Expected code differs per configuration: the default amdgcn checks expect a
; bare v_ffbh_u32 between the ushort load and the short store, while the
; tonga and gfx1010 checks keep v_min_u32 with 32, an adjustment by 16 and a
; compare/select against 0xffff (GlobalISel additionally masks with 0xffff).
; (review) The default amdgcn expectation shows no visible -16 adjustment
; after the 32-bit v_ffbh_u32, unlike the other configurations -- confirm
; that this expectation is intended.
1414 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1415; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1416; SI:       ; %bb.0:
1417; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1418; SI-NEXT:    s_mov_b32 s3, 0xf000
1419; SI-NEXT:    s_mov_b32 s2, -1
1420; SI-NEXT:    s_mov_b32 s6, s2
1421; SI-NEXT:    s_mov_b32 s7, s3
1422; SI-NEXT:    s_waitcnt lgkmcnt(0)
1423; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1424; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1425; SI-NEXT:    s_waitcnt vmcnt(0)
1426; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1427; SI-NEXT:    s_waitcnt lgkmcnt(0)
1428; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1429; SI-NEXT:    s_endpgm
1430;
1431; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1432; VI:       ; %bb.0:
1433; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1434; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1435; VI-NEXT:    s_mov_b32 s7, 0xf000
1436; VI-NEXT:    s_mov_b32 s6, -1
1437; VI-NEXT:    s_mov_b32 s2, s6
1438; VI-NEXT:    s_mov_b32 s3, s7
1439; VI-NEXT:    s_waitcnt lgkmcnt(0)
1440; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1441; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1442; VI-NEXT:    s_waitcnt vmcnt(0)
1443; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1444; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1445; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
1446; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1447; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1448; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1449; VI-NEXT:    s_endpgm
1450;
1451; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1452; EG:       ; %bb.0:
1453; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1454; EG-NEXT:    TEX 0 @6
1455; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1456; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1457; EG-NEXT:    CF_END
1458; EG-NEXT:    PAD
1459; EG-NEXT:    Fetch clause starting at 6:
1460; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1461; EG-NEXT:    ALU clause starting at 8:
1462; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1463; EG-NEXT:    ALU clause starting at 9:
1464; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1465; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1466; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1467; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1468; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1469; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1470; EG-NEXT:     LSHL T0.X, PV.W, PS,
1471; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1472; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1473; EG-NEXT:     MOV T0.Y, 0.0,
1474; EG-NEXT:     MOV * T0.Z, 0.0,
1475; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1476; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1477;
1478; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
1479; GFX10:       ; %bb.0:
1480; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1481; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1482; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1483; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1485; GFX10-NEXT:    s_waitcnt vmcnt(0)
1486; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
1487; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1488; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
1489; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1490; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1491; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1492; GFX10-NEXT:    s_endpgm
1493;
1494; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
1495; GFX10-GISEL:       ; %bb.0:
1496; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1497; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1498; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1499; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1500; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1501; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
1502; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0xffff
1503; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1504; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
1505; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1506; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1507; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
1508; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1509; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, s2, vcc_lo
1510; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1511; GFX10-GISEL-NEXT:    s_endpgm
  ; Direct load through the kernel argument pointer: no workitem id and no
  ; getelementptr, so every workitem reads the same address.
1512  %val = load i16, i16 addrspace(1)* %valptr
  ; i1 false: is_zero_undef is off, so ctlz(0) is a defined value.
1513  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
  ; Replace the zero-input result with -1; the compare is on the INPUT value.
1514  %cmp = icmp eq i16 %val, 0
1515  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1516  store i16 %sel, i16 addrspace(1)* %out
1517  ret void
1518}
1519
1520; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test: llvm.ctlz on a non-byte-sized integer (i7) whose result is replaced
; by -1 through a select when the input is zero. Each work-item loads its own
; i7 element (%valptr indexed by workitem.id.x); the result is stored to %out.
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; NOTE(review): the SI, VI and GFX10 sequences consist only of v_ffbh_u32
; followed by a 0x7f mask, while GFX10-GISEL additionally emits v_min_u32 32,
; subtracts 25 (presumably 32 - 7) and keeps the compare/select. Confirm that
; the shorter sequences yield the 7-bit count for non-zero inputs.
1521define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1522; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1523; SI:       ; %bb.0:
1524; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1525; SI-NEXT:    s_mov_b32 s3, 0xf000
1526; SI-NEXT:    v_mov_b32_e32 v1, 0
1527; SI-NEXT:    s_mov_b32 s6, 0
1528; SI-NEXT:    s_mov_b32 s7, s3
1529; SI-NEXT:    s_waitcnt lgkmcnt(0)
1530; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1531; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1532; SI-NEXT:    s_mov_b32 s2, -1
1533; SI-NEXT:    s_waitcnt vmcnt(0)
1534; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1535; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1536; SI-NEXT:    s_waitcnt lgkmcnt(0)
1537; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1538; SI-NEXT:    s_endpgm
1539;
1540; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1541; VI:       ; %bb.0:
1542; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1543; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1544; VI-NEXT:    s_mov_b32 s7, 0xf000
1545; VI-NEXT:    s_mov_b32 s6, -1
1546; VI-NEXT:    s_waitcnt lgkmcnt(0)
1547; VI-NEXT:    v_mov_b32_e32 v1, s1
1548; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1549; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1550; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1551; VI-NEXT:    s_waitcnt vmcnt(0)
1552; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1553; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1554; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1555; VI-NEXT:    s_endpgm
1556;
1557; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1558; EG:       ; %bb.0:
1559; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1560; EG-NEXT:    TEX 0 @6
1561; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1562; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1563; EG-NEXT:    CF_END
1564; EG-NEXT:    PAD
1565; EG-NEXT:    Fetch clause starting at 6:
1566; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1567; EG-NEXT:    ALU clause starting at 8:
1568; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1569; EG-NEXT:    ALU clause starting at 9:
1570; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1571; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1572; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1573; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1574; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1575; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1576; EG-NEXT:     LSHL T0.X, PV.W, PS,
1577; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1578; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1579; EG-NEXT:     MOV T0.Y, 0.0,
1580; EG-NEXT:     MOV * T0.Z, 0.0,
1581; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1582; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1583;
1584; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1585; GFX10:       ; %bb.0:
1586; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1587; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1588; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1589; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1590; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1591; GFX10-NEXT:    s_waitcnt vmcnt(0)
1592; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1593; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1594; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1595; GFX10-NEXT:    s_endpgm
1596;
1597; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
1598; GFX10-GISEL:       ; %bb.0:
1599; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1600; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1601; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1602; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1604; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1605; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x7f
1606; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1607; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1608; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1609; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1610; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1611; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1612; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1613; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1614; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 25, v1
1615; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1616; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1617; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1618; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1619; GFX10-GISEL-NEXT:    s_endpgm
; Address of this work-item's element: %valptr indexed by workitem.id.x.
1620  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1621  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1622  %val = load i7, i7 addrspace(1)* %valptr.gep
; The i1 false flag requests a defined ctlz result for a zero input (per the
; LLVM LangRef description of llvm.ctlz.*).
1623  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
; Override the count with -1 when the loaded value is zero.
1624  %cmp = icmp eq i7 %val, 0
1625  %sel = select i1 %cmp, i7 -1, i7 %ctlz
; The store targets %out directly; it is not indexed by %tid.
1626  store i7 %sel, i7 addrspace(1)* %out
1627  ret void
1628}
1629