1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,EG
5
6declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
7declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
8declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
9
10declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
11declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
12declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
13
14declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
15declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
16declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
17
18declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
19
; s_ctlz_i32: count the leading zeros of the i32 kernel argument %val and
; store the i32 result to %out.
;
; The intrinsic is called with its i1 operand set to false. The SI and VI
; check lines expect s_flbit_i32_b32 on the loaded argument followed by an
; s_cmp_lg_u32 / s_cselect_b32 pair that substitutes the constant 32 when the
; argument compares equal to zero. The EG check lines expect FFBH_UINT
; followed by a CNDE_INT that uses the literal 32.
20define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
21; SI-LABEL: s_ctlz_i32:
22; SI:       ; %bb.0:
23; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
24; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_waitcnt lgkmcnt(0)
27; SI-NEXT:    s_flbit_i32_b32 s0, s2
28; SI-NEXT:    s_cmp_lg_u32 s2, 0
29; SI-NEXT:    s_cselect_b32 s0, s0, 32
30; SI-NEXT:    s_mov_b32 s6, -1
31; SI-NEXT:    v_mov_b32_e32 v0, s0
32; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
33; SI-NEXT:    s_endpgm
34;
35; VI-LABEL: s_ctlz_i32:
36; VI:       ; %bb.0:
37; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
38; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
39; VI-NEXT:    s_mov_b32 s7, 0xf000
40; VI-NEXT:    s_mov_b32 s6, -1
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    s_flbit_i32_b32 s1, s0
43; VI-NEXT:    s_cmp_lg_u32 s0, 0
44; VI-NEXT:    s_cselect_b32 s0, s1, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_ctlz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
61  store i32 %ctlz, i32 addrspace(1)* %out, align 4
62  ret void
63}
64
; v_ctlz_i32: load one i32 from %valptr at the index returned by
; llvm.amdgcn.workitem.id.x, count its leading zeros (i1 operand false), and
; store the result to %out. Note that the store address is %out itself; only
; the load is indexed by the work-item id.
;
; The SI and VI check lines expect v_ffbh_u32 on the loaded value plus a
; v_cmp_ne_u32 / v_cndmask_b32 pair using the constant 32. The EG check lines
; expect FFBH_UINT followed by CNDE_INT with the literal 32.
65define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
66; SI-LABEL: v_ctlz_i32:
67; SI:       ; %bb.0:
68; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
69; SI-NEXT:    s_mov_b32 s3, 0xf000
70; SI-NEXT:    s_mov_b32 s6, 0
71; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
72; SI-NEXT:    v_mov_b32_e32 v1, 0
73; SI-NEXT:    s_mov_b32 s7, s3
74; SI-NEXT:    s_waitcnt lgkmcnt(0)
75; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
76; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
77; SI-NEXT:    s_mov_b32 s2, -1
78; SI-NEXT:    s_waitcnt vmcnt(0)
79; SI-NEXT:    v_ffbh_u32_e32 v1, v0
80; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
81; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
82; SI-NEXT:    s_waitcnt lgkmcnt(0)
83; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
84; SI-NEXT:    s_endpgm
85;
86; VI-LABEL: v_ctlz_i32:
87; VI:       ; %bb.0:
88; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
89; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
90; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
91; VI-NEXT:    s_mov_b32 s7, 0xf000
92; VI-NEXT:    s_mov_b32 s6, -1
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v1, s1
95; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
96; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
97; VI-NEXT:    flat_load_dword v0, v[0:1]
98; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
99; VI-NEXT:    v_ffbh_u32_e32 v1, v0
100; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
101; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
102; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
103; VI-NEXT:    s_endpgm
104;
105; EG-LABEL: v_ctlz_i32:
106; EG:       ; %bb.0:
107; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
108; EG-NEXT:    TEX 0 @6
109; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
110; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
111; EG-NEXT:    CF_END
112; EG-NEXT:    PAD
113; EG-NEXT:    Fetch clause starting at 6:
114; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
115; EG-NEXT:    ALU clause starting at 8:
116; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
117; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
118; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
119; EG-NEXT:    ALU clause starting at 11:
120; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
121; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
122; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
123; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
124  %tid = call i32 @llvm.amdgcn.workitem.id.x()
125  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
126  %val = load i32, i32 addrspace(1)* %in.gep, align 4
127  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
128  store i32 %ctlz, i32 addrspace(1)* %out, align 4
129  ret void
130}
131
; v_ctlz_v2i32: <2 x i32> variant of the per-work-item test. Loads a vector
; from %valptr indexed by llvm.amdgcn.workitem.id.x, applies llvm.ctlz.v2i32
; with the i1 operand false, and stores the vector to %out (unindexed).
;
; All three check blocks expect the operation to be carried out once per
; element: two v_ffbh_u32 / v_cmp_ne_u32 / v_cndmask_b32 groups in the SI and
; VI lines, and two FFBH_UINT / CNDE_INT pairs in the EG lines, each using
; the constant 32.
132define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
133; SI-LABEL: v_ctlz_v2i32:
134; SI:       ; %bb.0:
135; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
136; SI-NEXT:    s_mov_b32 s3, 0xf000
137; SI-NEXT:    s_mov_b32 s6, 0
138; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
139; SI-NEXT:    v_mov_b32_e32 v1, 0
140; SI-NEXT:    s_mov_b32 s7, s3
141; SI-NEXT:    s_waitcnt lgkmcnt(0)
142; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
143; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
144; SI-NEXT:    s_mov_b32 s2, -1
145; SI-NEXT:    s_waitcnt vmcnt(0)
146; SI-NEXT:    v_ffbh_u32_e32 v2, v1
147; SI-NEXT:    v_ffbh_u32_e32 v3, v0
148; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
149; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
150; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
151; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
152; SI-NEXT:    s_waitcnt lgkmcnt(0)
153; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
154; SI-NEXT:    s_endpgm
155;
156; VI-LABEL: v_ctlz_v2i32:
157; VI:       ; %bb.0:
158; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
159; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
160; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
161; VI-NEXT:    s_mov_b32 s7, 0xf000
162; VI-NEXT:    s_mov_b32 s6, -1
163; VI-NEXT:    s_waitcnt lgkmcnt(0)
164; VI-NEXT:    v_mov_b32_e32 v1, s1
165; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
166; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
167; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
168; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
169; VI-NEXT:    v_ffbh_u32_e32 v2, v1
170; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
171; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
172; VI-NEXT:    v_ffbh_u32_e32 v3, v0
173; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
174; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
175; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
176; VI-NEXT:    s_endpgm
177;
178; EG-LABEL: v_ctlz_v2i32:
179; EG:       ; %bb.0:
180; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
181; EG-NEXT:    TEX 0 @6
182; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
184; EG-NEXT:    CF_END
185; EG-NEXT:    PAD
186; EG-NEXT:    Fetch clause starting at 6:
187; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
188; EG-NEXT:    ALU clause starting at 8:
189; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
190; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
191; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
192; EG-NEXT:    ALU clause starting at 11:
193; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
194; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
195; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
196; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
197; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
198; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
199; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
200  %tid = call i32 @llvm.amdgcn.workitem.id.x()
201  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
202  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
203  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
204  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
205  ret void
206}
207
; v_ctlz_v4i32: <4 x i32> variant of the per-work-item test. Loads a vector
; from %valptr indexed by llvm.amdgcn.workitem.id.x, applies llvm.ctlz.v4i32
; with the i1 operand false, and stores the vector to %out (unindexed).
;
; The check lines expect one leading-zero count and one zero-input select
; (constant 32) per element: four v_ffbh_u32 / v_cmp_ne_u32 / v_cndmask_b32
; groups for SI and VI, four FFBH_UINT / CNDE_INT pairs for EG.
208define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
209; SI-LABEL: v_ctlz_v4i32:
210; SI:       ; %bb.0:
211; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
212; SI-NEXT:    s_mov_b32 s3, 0xf000
213; SI-NEXT:    s_mov_b32 s6, 0
214; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
215; SI-NEXT:    v_mov_b32_e32 v1, 0
216; SI-NEXT:    s_mov_b32 s7, s3
217; SI-NEXT:    s_waitcnt lgkmcnt(0)
218; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
219; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
220; SI-NEXT:    s_mov_b32 s2, -1
221; SI-NEXT:    s_waitcnt vmcnt(0)
222; SI-NEXT:    v_ffbh_u32_e32 v4, v3
223; SI-NEXT:    v_ffbh_u32_e32 v5, v2
224; SI-NEXT:    v_ffbh_u32_e32 v6, v1
225; SI-NEXT:    v_ffbh_u32_e32 v7, v0
226; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
227; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
228; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
229; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
230; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
231; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
232; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
233; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
234; SI-NEXT:    s_waitcnt lgkmcnt(0)
235; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
236; SI-NEXT:    s_endpgm
237;
238; VI-LABEL: v_ctlz_v4i32:
239; VI:       ; %bb.0:
240; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
241; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
242; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
243; VI-NEXT:    s_mov_b32 s7, 0xf000
244; VI-NEXT:    s_mov_b32 s6, -1
245; VI-NEXT:    s_waitcnt lgkmcnt(0)
246; VI-NEXT:    v_mov_b32_e32 v1, s1
247; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
248; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
249; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
250; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
251; VI-NEXT:    v_ffbh_u32_e32 v4, v3
252; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
253; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
254; VI-NEXT:    v_ffbh_u32_e32 v5, v2
255; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
256; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
257; VI-NEXT:    v_ffbh_u32_e32 v6, v1
258; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
259; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
260; VI-NEXT:    v_ffbh_u32_e32 v7, v0
261; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
262; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
263; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
264; VI-NEXT:    s_endpgm
265;
266; EG-LABEL: v_ctlz_v4i32:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
278; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
279; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
280; EG-NEXT:    ALU clause starting at 11:
281; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
282; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
283; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
284; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
285; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
286; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
287; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
288; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
289; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
290; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
291; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
292; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
293; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
294  %tid = call i32 @llvm.amdgcn.workitem.id.x()
295  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
296  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
297  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
298  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
299  ret void
300}
301
; v_ctlz_i8: load a single i8 directly from %valptr (no work-item indexing),
; apply llvm.ctlz.i8 with the i1 operand false, and store the i8 result to
; %out.
;
; The check lines expect the count to be computed at 32 bits and then
; reduced by 24: the SI lines use v_subrev_i32 with 24, the VI lines add -16
; and then -8, and the EG lines add the literal -24.
; NOTE(review): 24 equals 32 - 8, so this presumably discounts the upper bits
; of the zero-extended byte; confirm against the lowering code.
; The EG lines store the byte through MEM_RAT MSKOR.
302define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
303; SI-LABEL: v_ctlz_i8:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
306; SI-NEXT:    s_mov_b32 s3, 0xf000
307; SI-NEXT:    s_mov_b32 s2, -1
308; SI-NEXT:    s_mov_b32 s6, s2
309; SI-NEXT:    s_mov_b32 s7, s3
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
312; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
313; SI-NEXT:    s_waitcnt vmcnt(0)
314; SI-NEXT:    v_ffbh_u32_e32 v1, v0
315; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
316; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
317; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
318; SI-NEXT:    s_waitcnt lgkmcnt(0)
319; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
320; SI-NEXT:    s_endpgm
321;
322; VI-LABEL: v_ctlz_i8:
323; VI:       ; %bb.0:
324; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
325; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
326; VI-NEXT:    s_mov_b32 s7, 0xf000
327; VI-NEXT:    s_mov_b32 s6, -1
328; VI-NEXT:    s_mov_b32 s2, s6
329; VI-NEXT:    s_mov_b32 s3, s7
330; VI-NEXT:    s_waitcnt lgkmcnt(0)
331; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
332; VI-NEXT:    s_waitcnt vmcnt(0)
333; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
334; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
335; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
336; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
337; VI-NEXT:    v_add_u16_e32 v0, -8, v0
338; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
339; VI-NEXT:    s_endpgm
340;
341; EG-LABEL: v_ctlz_i8:
342; EG:       ; %bb.0:
343; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
344; EG-NEXT:    TEX 0 @6
345; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
346; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
347; EG-NEXT:    CF_END
348; EG-NEXT:    PAD
349; EG-NEXT:    Fetch clause starting at 6:
350; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
351; EG-NEXT:    ALU clause starting at 8:
352; EG-NEXT:     MOV * T0.X, KC0[2].Z,
353; EG-NEXT:    ALU clause starting at 9:
354; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
355; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
356; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
357; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
358; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
359; EG-NEXT:    -24(nan), 0(0.000000e+00)
360; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
361; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
362; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
363; EG-NEXT:     LSHL T0.X, PV.W, PS,
364; EG-NEXT:     LSHL * T0.W, literal.x, PS,
365; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
366; EG-NEXT:     MOV T0.Y, 0.0,
367; EG-NEXT:     MOV * T0.Z, 0.0,
368; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
369; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
370  %val = load i8, i8 addrspace(1)* %valptr
371  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
372  store i8 %ctlz, i8 addrspace(1)* %out
373  ret void
374}
375
; s_ctlz_i64: count the leading zeros of the i64 kernel argument %val (i1
; operand false) and store the i64 result to %out. An unnamed [8 x i32]
; argument sits between %out and %val; the SI and VI check lines accordingly
; load %val from offsets 0x13 and 0x4c.
; NOTE(review): the purpose of that padding argument is not stated in this
; file; presumably it keeps %val in a separate load from %out - confirm.
;
; The SI and VI check lines expect the 64-bit count to be assembled from two
; s_flbit_i32_b32 results: 32 is added to the low-dword count, that sum is
; chosen when the high dword equals zero, and the constant 64 is chosen when
; the OR of both dwords is zero. The upper result dword is written as 0.
376define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
377; SI-LABEL: s_ctlz_i64:
378; SI:       ; %bb.0:
379; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
380; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
381; SI-NEXT:    s_mov_b32 s7, 0xf000
382; SI-NEXT:    s_mov_b32 s6, -1
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_flbit_i32_b32 s0, s2
385; SI-NEXT:    s_flbit_i32_b32 s1, s3
386; SI-NEXT:    s_add_i32 s0, s0, 32
387; SI-NEXT:    s_cmp_eq_u32 s3, 0
388; SI-NEXT:    s_cselect_b32 s0, s0, s1
389; SI-NEXT:    s_or_b32 s1, s2, s3
390; SI-NEXT:    s_cmp_lg_u32 s1, 0
391; SI-NEXT:    s_cselect_b32 s0, s0, 64
392; SI-NEXT:    v_mov_b32_e32 v1, 0
393; SI-NEXT:    v_mov_b32_e32 v0, s0
394; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
395; SI-NEXT:    s_endpgm
396;
397; VI-LABEL: s_ctlz_i64:
398; VI:       ; %bb.0:
399; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
400; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
401; VI-NEXT:    s_mov_b32 s7, 0xf000
402; VI-NEXT:    s_mov_b32 s6, -1
403; VI-NEXT:    v_mov_b32_e32 v1, 0
404; VI-NEXT:    s_waitcnt lgkmcnt(0)
405; VI-NEXT:    s_flbit_i32_b32 s2, s0
406; VI-NEXT:    s_add_i32 s2, s2, 32
407; VI-NEXT:    s_flbit_i32_b32 s3, s1
408; VI-NEXT:    s_cmp_eq_u32 s1, 0
409; VI-NEXT:    s_cselect_b32 s2, s2, s3
410; VI-NEXT:    s_or_b32 s0, s0, s1
411; VI-NEXT:    s_cmp_lg_u32 s0, 0
412; VI-NEXT:    s_cselect_b32 s0, s2, 64
413; VI-NEXT:    v_mov_b32_e32 v0, s0
414; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
415; VI-NEXT:    s_endpgm
416;
417; EG-LABEL: s_ctlz_i64:
418; EG:       ; %bb.0:
419; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
420; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
421; EG-NEXT:    CF_END
422; EG-NEXT:    PAD
423; EG-NEXT:    ALU clause starting at 4:
424; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
425; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
426; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
427; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
428; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
429; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
430; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
431; EG-NEXT:     MOV T0.Y, 0.0,
432; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
433; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
434  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
435  store i64 %ctlz, i64 addrspace(1)* %out
436  ret void
437}
438
; s_ctlz_i64_trunc: count the leading zeros of the i64 kernel argument %val
; (i1 operand false), truncate the i64 result to i32, and store that i32 to
; %out.
;
; The SI and VI check lines expect the same two-s_flbit_i32_b32 sequence as
; the untruncated i64 kernel argument case (add 32 to the low-dword count,
; select on the high dword being zero, select 64 when both dwords are zero),
; but ending in a single-dword store with no separate upper dword.
439define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
440; SI-LABEL: s_ctlz_i64_trunc:
441; SI:       ; %bb.0:
442; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
443; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
444; SI-NEXT:    s_mov_b32 s7, 0xf000
445; SI-NEXT:    s_waitcnt lgkmcnt(0)
446; SI-NEXT:    s_flbit_i32_b32 s0, s2
447; SI-NEXT:    s_flbit_i32_b32 s1, s3
448; SI-NEXT:    s_add_i32 s0, s0, 32
449; SI-NEXT:    s_cmp_eq_u32 s3, 0
450; SI-NEXT:    s_cselect_b32 s0, s0, s1
451; SI-NEXT:    s_or_b32 s1, s2, s3
452; SI-NEXT:    s_cmp_lg_u32 s1, 0
453; SI-NEXT:    s_cselect_b32 s0, s0, 64
454; SI-NEXT:    s_mov_b32 s6, -1
455; SI-NEXT:    v_mov_b32_e32 v0, s0
456; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
457; SI-NEXT:    s_endpgm
458;
459; VI-LABEL: s_ctlz_i64_trunc:
460; VI:       ; %bb.0:
461; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
462; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
463; VI-NEXT:    s_mov_b32 s7, 0xf000
464; VI-NEXT:    s_mov_b32 s6, -1
465; VI-NEXT:    s_waitcnt lgkmcnt(0)
466; VI-NEXT:    s_flbit_i32_b32 s2, s0
467; VI-NEXT:    s_add_i32 s2, s2, 32
468; VI-NEXT:    s_flbit_i32_b32 s3, s1
469; VI-NEXT:    s_cmp_eq_u32 s1, 0
470; VI-NEXT:    s_cselect_b32 s2, s2, s3
471; VI-NEXT:    s_or_b32 s0, s0, s1
472; VI-NEXT:    s_cmp_lg_u32 s0, 0
473; VI-NEXT:    s_cselect_b32 s0, s2, 64
474; VI-NEXT:    v_mov_b32_e32 v0, s0
475; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
476; VI-NEXT:    s_endpgm
477;
478; EG-LABEL: s_ctlz_i64_trunc:
479; EG:       ; %bb.0:
480; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
481; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
482; EG-NEXT:    CF_END
483; EG-NEXT:    PAD
484; EG-NEXT:    ALU clause starting at 4:
485; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
486; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
487; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
488; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
489; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
490; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
491; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
492; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
493; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
494  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
495  %trunc = trunc i64 %ctlz to i32
496  store i32 %trunc, i32 addrspace(1)* %out
497  ret void
498}
499
; v_ctlz_i64: per-work-item i64 test. Both %in and %out are indexed by
; llvm.amdgcn.workitem.id.x: the i64 at %in[tid] is loaded, passed to
; llvm.ctlz.i64 with the i1 operand false, and the i64 result is stored to
; %out[tid].
;
; The SI and VI check lines expect two v_ffbh_u32 results to be combined:
; 32 is added to the low-dword count, a v_cndmask_b32 picks that sum when the
; high dword equals zero, and a second v_cndmask_b32 supplies the constant 64
; when the OR of both dwords is zero. The upper result dword comes from a
; register holding 0.
500define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
501; SI-LABEL: v_ctlz_i64:
502; SI:       ; %bb.0:
503; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
504; SI-NEXT:    s_mov_b32 s7, 0xf000
505; SI-NEXT:    s_mov_b32 s6, 0
506; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
507; SI-NEXT:    v_mov_b32_e32 v1, 0
508; SI-NEXT:    s_waitcnt lgkmcnt(0)
509; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
510; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
511; SI-NEXT:    s_waitcnt vmcnt(0)
512; SI-NEXT:    v_ffbh_u32_e32 v4, v2
513; SI-NEXT:    v_ffbh_u32_e32 v5, v3
514; SI-NEXT:    v_or_b32_e32 v2, v2, v3
515; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
516; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
517; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
518; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
519; SI-NEXT:    v_cndmask_b32_e32 v2, 64, v3, vcc
520; SI-NEXT:    v_mov_b32_e32 v3, v1
521; SI-NEXT:    s_waitcnt lgkmcnt(0)
522; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
523; SI-NEXT:    s_endpgm
524;
525; VI-LABEL: v_ctlz_i64:
526; VI:       ; %bb.0:
527; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
528; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
529; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
530; VI-NEXT:    v_mov_b32_e32 v4, 0
531; VI-NEXT:    v_mov_b32_e32 v2, 0
532; VI-NEXT:    s_waitcnt lgkmcnt(0)
533; VI-NEXT:    v_mov_b32_e32 v5, s3
534; VI-NEXT:    v_mov_b32_e32 v1, s1
535; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
536; VI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
537; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
538; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
539; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
540; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
541; VI-NEXT:    v_ffbh_u32_e32 v5, v0
542; VI-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
543; VI-NEXT:    v_ffbh_u32_e32 v6, v1
544; VI-NEXT:    v_or_b32_e32 v0, v0, v1
545; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
546; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
547; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
548; VI-NEXT:    v_cndmask_b32_e32 v1, 64, v1, vcc
549; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
550; VI-NEXT:    s_endpgm
551;
552; EG-LABEL: v_ctlz_i64:
553; EG:       ; %bb.0:
554; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
555; EG-NEXT:    TEX 0 @6
556; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
557; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
558; EG-NEXT:    CF_END
559; EG-NEXT:    PAD
560; EG-NEXT:    Fetch clause starting at 6:
561; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
562; EG-NEXT:    ALU clause starting at 8:
563; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
564; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
565; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
566; EG-NEXT:    ALU clause starting at 11:
567; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
568; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
569; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
570; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
571; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
572; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
573; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
574; EG-NEXT:     MOV T0.Y, 0.0,
575; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
576; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
577; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
578  %tid = call i32 @llvm.amdgcn.workitem.id.x()
579  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
580  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
581  %val = load i64, i64 addrspace(1)* %in.gep
582  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
583  store i64 %ctlz, i64 addrspace(1)* %out.gep
584  ret void
585}
586
; v_ctlz_i64_trunc: per-work-item i64 test with a truncated result. The i64
; at %in[tid] is loaded, passed to llvm.ctlz.i64 with the i1 operand false,
; truncated to i32, and stored to %out[tid], where tid comes from
; llvm.amdgcn.workitem.id.x.
;
; Because the input elements are 8 bytes and the output elements 4 bytes, the
; SI and VI check lines expect the work-item id to be shifted left by 3 for
; the load address and by 2 for the store address. The count itself uses two
; v_ffbh_u32 results combined with v_cndmask_b32 (constant 64 for an all-zero
; input), finishing with a single-dword store.
587define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
588; SI-LABEL: v_ctlz_i64_trunc:
589; SI:       ; %bb.0:
590; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
591; SI-NEXT:    s_mov_b32 s7, 0xf000
592; SI-NEXT:    s_mov_b32 s6, 0
593; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
594; SI-NEXT:    v_mov_b32_e32 v2, 0
595; SI-NEXT:    s_waitcnt lgkmcnt(0)
596; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
597; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
598; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
599; SI-NEXT:    s_waitcnt vmcnt(0)
600; SI-NEXT:    v_ffbh_u32_e32 v0, v3
601; SI-NEXT:    v_ffbh_u32_e32 v5, v4
602; SI-NEXT:    v_or_b32_e32 v3, v3, v4
603; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
604; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
605; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
606; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
607; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
608; SI-NEXT:    s_waitcnt lgkmcnt(0)
609; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
610; SI-NEXT:    s_endpgm
611;
612; VI-LABEL: v_ctlz_i64_trunc:
613; VI:       ; %bb.0:
614; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
615; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
616; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
617; VI-NEXT:    v_mov_b32_e32 v4, 0
618; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
619; VI-NEXT:    s_waitcnt lgkmcnt(0)
620; VI-NEXT:    v_mov_b32_e32 v5, s3
621; VI-NEXT:    v_mov_b32_e32 v2, s1
622; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
623; VI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
624; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
625; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
626; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
627; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
628; VI-NEXT:    v_ffbh_u32_e32 v0, v1
629; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
630; VI-NEXT:    v_ffbh_u32_e32 v5, v2
631; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
632; VI-NEXT:    v_or_b32_e32 v1, v1, v2
633; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
634; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
635; VI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
636; VI-NEXT:    flat_store_dword v[3:4], v0
637; VI-NEXT:    s_endpgm
638;
639; EG-LABEL: v_ctlz_i64_trunc:
640; EG:       ; %bb.0:
641; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
642; EG-NEXT:    TEX 0 @6
643; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
644; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
645; EG-NEXT:    CF_END
646; EG-NEXT:    PAD
647; EG-NEXT:    Fetch clause starting at 6:
648; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
649; EG-NEXT:    ALU clause starting at 8:
650; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
651; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
652; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
653; EG-NEXT:    ALU clause starting at 11:
654; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
655; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
656; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
657; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
658; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
659; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
660; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
661; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
662; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
663; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
664; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
665  %tid = call i32 @llvm.amdgcn.workitem.id.x()
666  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
667  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
668  %val = load i64, i64 addrspace(1)* %in.gep
669  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
670  %trunc = trunc i64 %ctlz to i32
671  store i32 %trunc, i32 addrspace(1)* %out.gep
672  ret void
673}
674
; v_ctlz_i32_sel_eq_neg1: load an i32 from %valptr indexed by
; llvm.amdgcn.workitem.id.x, compute llvm.ctlz.i32 with the i1 operand false,
; then replace the result with -1 when the loaded value equals zero
; (icmp eq + select), and store to %out (unindexed).
;
; The SI and VI check lines expect the compare and select to disappear
; entirely, leaving a bare v_ffbh_u32 between the load and the store.
; NOTE(review): this presumably relies on v_ffbh_u32 itself producing -1 for
; a zero input; confirm against the ISA manual.
; The EG check lines still expect two CNDE_INT instructions, one with the
; literal 32 and one with the literal -1.
675define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
676; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
677; SI:       ; %bb.0:
678; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
679; SI-NEXT:    s_mov_b32 s3, 0xf000
680; SI-NEXT:    s_mov_b32 s6, 0
681; SI-NEXT:    s_mov_b32 s7, s3
682; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
683; SI-NEXT:    v_mov_b32_e32 v1, 0
684; SI-NEXT:    s_waitcnt lgkmcnt(0)
685; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
686; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
687; SI-NEXT:    s_mov_b32 s2, -1
688; SI-NEXT:    s_waitcnt vmcnt(0)
689; SI-NEXT:    v_ffbh_u32_e32 v0, v0
690; SI-NEXT:    s_waitcnt lgkmcnt(0)
691; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
692; SI-NEXT:    s_endpgm
693;
694; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
695; VI:       ; %bb.0:
696; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
697; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
698; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
699; VI-NEXT:    s_mov_b32 s7, 0xf000
700; VI-NEXT:    s_mov_b32 s6, -1
701; VI-NEXT:    s_waitcnt lgkmcnt(0)
702; VI-NEXT:    v_mov_b32_e32 v1, s1
703; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
704; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
705; VI-NEXT:    flat_load_dword v0, v[0:1]
706; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
707; VI-NEXT:    v_ffbh_u32_e32 v0, v0
708; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
709; VI-NEXT:    s_endpgm
710;
711; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
712; EG:       ; %bb.0:
713; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
714; EG-NEXT:    TEX 0 @6
715; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
716; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
717; EG-NEXT:    CF_END
718; EG-NEXT:    PAD
719; EG-NEXT:    Fetch clause starting at 6:
720; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
721; EG-NEXT:    ALU clause starting at 8:
722; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
723; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
724; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
725; EG-NEXT:    ALU clause starting at 11:
726; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
727; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
728; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
729; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
730; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
731; EG-NEXT:    -1(nan), 2(2.802597e-45)
732  %tid = call i32 @llvm.amdgcn.workitem.id.x()
733  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
734  %val = load i32, i32 addrspace(1)* %in.gep
735  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
736  %cmp = icmp eq i32 %val, 0
737  %sel = select i1 %cmp, i32 -1, i32 %ctlz
738  store i32 %sel, i32 addrspace(1)* %out
739  ret void
740}
741
; Each workitem loads one i32 from %valptr indexed by workitem id x and stores
; select(%val != 0, ctlz(%val), -1) to %out.
; The SI and VI assertions expect a bare v_ffbh_u32 with no compare or cndmask
; around it; the EG assertions still expect two CNDE_INT after FFBH_UINT.
742define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
743; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
744; SI:       ; %bb.0:
745; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
746; SI-NEXT:    s_mov_b32 s3, 0xf000
747; SI-NEXT:    s_mov_b32 s6, 0
748; SI-NEXT:    s_mov_b32 s7, s3
749; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
750; SI-NEXT:    v_mov_b32_e32 v1, 0
751; SI-NEXT:    s_waitcnt lgkmcnt(0)
752; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
753; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
754; SI-NEXT:    s_mov_b32 s2, -1
755; SI-NEXT:    s_waitcnt vmcnt(0)
756; SI-NEXT:    v_ffbh_u32_e32 v0, v0
757; SI-NEXT:    s_waitcnt lgkmcnt(0)
758; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
759; SI-NEXT:    s_endpgm
760;
761; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
762; VI:       ; %bb.0:
763; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
764; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
765; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
766; VI-NEXT:    s_mov_b32 s7, 0xf000
767; VI-NEXT:    s_mov_b32 s6, -1
768; VI-NEXT:    s_waitcnt lgkmcnt(0)
769; VI-NEXT:    v_mov_b32_e32 v1, s1
770; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
771; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
772; VI-NEXT:    flat_load_dword v0, v[0:1]
773; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
774; VI-NEXT:    v_ffbh_u32_e32 v0, v0
775; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
776; VI-NEXT:    s_endpgm
777;
778; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
779; EG:       ; %bb.0:
780; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
781; EG-NEXT:    TEX 0 @6
782; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
783; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
784; EG-NEXT:    CF_END
785; EG-NEXT:    PAD
786; EG-NEXT:    Fetch clause starting at 6:
787; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
788; EG-NEXT:    ALU clause starting at 8:
789; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
790; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
791; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
792; EG-NEXT:    ALU clause starting at 11:
793; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
794; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
795; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
796; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
797; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
798; EG-NEXT:    -1(nan), 2(2.802597e-45)
799  %tid = call i32 @llvm.amdgcn.workitem.id.x()
800  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
801  %val = load i32, i32 addrspace(1)* %in.gep
  ; i1 false asks for a defined ctlz result on a zero input (see the LangRef
  ; entry for llvm.ctlz).
802  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; A zero input is remapped to -1. Presumably this matches what v_ffbh_u32
  ; returns for 0, which would explain why the select folds away on SI and VI
  ; -- confirm against the ISA documentation.
803  %cmp = icmp ne i32 %val, 0
804  %sel = select i1 %cmp, i32 %ctlz, i32 -1
805  store i32 %sel, i32 addrspace(1)* %out
806  ret void
807}
808
809; TODO: Should be able to eliminate select here as well.
; Same per-workitem load and ctlz as the sel_*_neg1 tests, but the select tests
; the ctlz result against the bit width (32) instead of testing %val against 0.
; The SI and VI assertions keep two compare/cndmask pairs (the zero check that
; yields 32, then the remap of 32 to -1); the EG assertions keep a CNDE_INT,
; a SETE_INT against 32 and a second CNDE_INT.
810define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
811; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
812; SI:       ; %bb.0:
813; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
814; SI-NEXT:    s_mov_b32 s3, 0xf000
815; SI-NEXT:    s_mov_b32 s6, 0
816; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
817; SI-NEXT:    v_mov_b32_e32 v1, 0
818; SI-NEXT:    s_mov_b32 s7, s3
819; SI-NEXT:    s_waitcnt lgkmcnt(0)
820; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
821; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
822; SI-NEXT:    s_mov_b32 s2, -1
823; SI-NEXT:    s_waitcnt vmcnt(0)
824; SI-NEXT:    v_ffbh_u32_e32 v1, v0
825; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
826; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
827; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
828; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
829; SI-NEXT:    s_waitcnt lgkmcnt(0)
830; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
831; SI-NEXT:    s_endpgm
832;
833; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
834; VI:       ; %bb.0:
835; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
836; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
837; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
838; VI-NEXT:    s_mov_b32 s7, 0xf000
839; VI-NEXT:    s_mov_b32 s6, -1
840; VI-NEXT:    s_waitcnt lgkmcnt(0)
841; VI-NEXT:    v_mov_b32_e32 v1, s1
842; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
843; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
844; VI-NEXT:    flat_load_dword v0, v[0:1]
845; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
846; VI-NEXT:    v_ffbh_u32_e32 v1, v0
847; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
848; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
849; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
850; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
851; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
852; VI-NEXT:    s_endpgm
853;
854; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
855; EG:       ; %bb.0:
856; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
857; EG-NEXT:    TEX 0 @6
858; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
859; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
860; EG-NEXT:    CF_END
861; EG-NEXT:    PAD
862; EG-NEXT:    Fetch clause starting at 6:
863; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
864; EG-NEXT:    ALU clause starting at 8:
865; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
866; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
867; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
868; EG-NEXT:    ALU clause starting at 11:
869; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
870; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
871; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
872; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
873; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
874; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
875; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
876; EG-NEXT:    -1(nan), 2(2.802597e-45)
877  %tid = call i32 @llvm.amdgcn.workitem.id.x()
878  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
879  %val = load i32, i32 addrspace(1)* %in.gep
880  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; The condition is on the ctlz result (== 32), not on %val (== 0); this is
  ; the form the TODO above refers to.
881  %cmp = icmp eq i32 %ctlz, 32
882  %sel = select i1 %cmp, i32 -1, i32 %ctlz
883  store i32 %sel, i32 addrspace(1)* %out
884  ret void
885}
886
; Inverted form of v_ctlz_i32_sel_eq_bitwidth: select(ctlz != 32, ctlz, -1).
; The assertions show that the select is not eliminated here either. SI and VI
; keep both compare/cndmask pairs, and EG uses SETNE_INT with the operands of
; the final CNDE_INT swapped relative to the eq form.
887define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
888; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
889; SI:       ; %bb.0:
890; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
891; SI-NEXT:    s_mov_b32 s3, 0xf000
892; SI-NEXT:    s_mov_b32 s6, 0
893; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
894; SI-NEXT:    v_mov_b32_e32 v1, 0
895; SI-NEXT:    s_mov_b32 s7, s3
896; SI-NEXT:    s_waitcnt lgkmcnt(0)
897; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
898; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
899; SI-NEXT:    s_mov_b32 s2, -1
900; SI-NEXT:    s_waitcnt vmcnt(0)
901; SI-NEXT:    v_ffbh_u32_e32 v1, v0
902; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
903; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
904; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
905; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
906; SI-NEXT:    s_waitcnt lgkmcnt(0)
907; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
908; SI-NEXT:    s_endpgm
909;
910; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
911; VI:       ; %bb.0:
912; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
913; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
914; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
915; VI-NEXT:    s_mov_b32 s7, 0xf000
916; VI-NEXT:    s_mov_b32 s6, -1
917; VI-NEXT:    s_waitcnt lgkmcnt(0)
918; VI-NEXT:    v_mov_b32_e32 v1, s1
919; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
920; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
921; VI-NEXT:    flat_load_dword v0, v[0:1]
922; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
923; VI-NEXT:    v_ffbh_u32_e32 v1, v0
924; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
925; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
926; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
927; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
928; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
929; VI-NEXT:    s_endpgm
930;
931; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
932; EG:       ; %bb.0:
933; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
934; EG-NEXT:    TEX 0 @6
935; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
936; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
937; EG-NEXT:    CF_END
938; EG-NEXT:    PAD
939; EG-NEXT:    Fetch clause starting at 6:
940; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
941; EG-NEXT:    ALU clause starting at 8:
942; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
943; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
944; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
945; EG-NEXT:    ALU clause starting at 11:
946; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
947; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
948; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
949; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
950; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
951; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
952; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
953; EG-NEXT:    -1(nan), 2(2.802597e-45)
954  %tid = call i32 @llvm.amdgcn.workitem.id.x()
955  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
956  %val = load i32, i32 addrspace(1)* %in.gep
957  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  ; ne-predicate with swapped select arms; same value as the eq_bitwidth form.
958  %cmp = icmp ne i32 %ctlz, 32
959  %sel = select i1 %cmp, i32 %ctlz, i32 -1
960  store i32 %sel, i32 addrspace(1)* %out
961  ret void
962}
963
; i8 variant of the sel_eq_neg1 pattern: each workitem loads one byte from
; %valptr indexed by workitem id x and stores
; select(%val == 0, -1, ctlz.i8(%val)) to %out.
; On all three targets the assertions contain a single 32-bit find-first-bit
; (v_ffbh_u32 / FFBH_UINT) with no compare or select, followed by a byte-sized
; store (buffer_store_byte, or a 255-masked MEM_RAT MSKOR on EG).
; NOTE(review): no assertion shows a -24 correction between the 32-bit
; find-first-bit on the loaded byte and the store. If the byte load
; zero-extends, the stored count for a nonzero input would be 24 too large --
; confirm the expected output is correct.
964 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
965; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
966; SI:       ; %bb.0:
967; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
968; SI-NEXT:    s_mov_b32 s3, 0xf000
969; SI-NEXT:    v_mov_b32_e32 v1, 0
970; SI-NEXT:    s_mov_b32 s6, 0
971; SI-NEXT:    s_mov_b32 s7, s3
972; SI-NEXT:    s_waitcnt lgkmcnt(0)
973; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
974; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
975; SI-NEXT:    s_mov_b32 s2, -1
976; SI-NEXT:    s_waitcnt vmcnt(0)
977; SI-NEXT:    v_ffbh_u32_e32 v0, v0
978; SI-NEXT:    s_waitcnt lgkmcnt(0)
979; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
980; SI-NEXT:    s_endpgm
981;
982; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
983; VI:       ; %bb.0:
984; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
985; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
986; VI-NEXT:    s_mov_b32 s7, 0xf000
987; VI-NEXT:    s_mov_b32 s6, -1
988; VI-NEXT:    s_waitcnt lgkmcnt(0)
989; VI-NEXT:    v_mov_b32_e32 v1, s1
990; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
991; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
992; VI-NEXT:    flat_load_ubyte v0, v[0:1]
993; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
994; VI-NEXT:    v_ffbh_u32_e32 v0, v0
995; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
996; VI-NEXT:    s_endpgm
997;
998; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
999; EG:       ; %bb.0:
1000; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1001; EG-NEXT:    TEX 0 @6
1002; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1003; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1004; EG-NEXT:    CF_END
1005; EG-NEXT:    PAD
1006; EG-NEXT:    Fetch clause starting at 6:
1007; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1008; EG-NEXT:    ALU clause starting at 8:
1009; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1010; EG-NEXT:    ALU clause starting at 9:
1011; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1012; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1013; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1014; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1015; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1016; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1017; EG-NEXT:     LSHL T0.X, PV.W, PS,
1018; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1019; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1020; EG-NEXT:     MOV T0.Y, 0.0,
1021; EG-NEXT:     MOV * T0.Z, 0.0,
1022; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1023; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1024  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Byte-stride gep, so the workitem id is used as the offset unscaled (the
  ; assertions have no shift of v0 before the address computation).
1025  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1026  %val = load i8, i8 addrspace(1)* %valptr.gep
1027  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
1028  %cmp = icmp eq i8 %val, 0
1029  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1030  store i8 %sel, i8 addrspace(1)* %out
1031  ret void
1032}
1033
; i16 variant of the sel_eq_neg1 pattern. Unlike the neighbouring tests, the
; load is taken straight from %valptr with no workitem-id gep; the SI and VI
; assertions accordingly use buffer_load_ushort with an "off" vaddr.
; The SI assertions are a bare v_ffbh_u32 followed by the short store. The VI
; assertions keep the zero compare, a cndmask to 32, a -16 add and a final
; cndmask to 0xffff, i.e. the select is not folded on VI.
; NOTE(review): the SI sequence has no -16 correction after the 32-bit
; v_ffbh_u32, while the VI sequence does. Confirm the SI output is correct
; for nonzero 16-bit inputs.
1034 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1035; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1036; SI:       ; %bb.0:
1037; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1038; SI-NEXT:    s_mov_b32 s3, 0xf000
1039; SI-NEXT:    s_mov_b32 s2, -1
1040; SI-NEXT:    s_mov_b32 s6, s2
1041; SI-NEXT:    s_mov_b32 s7, s3
1042; SI-NEXT:    s_waitcnt lgkmcnt(0)
1043; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
1044; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1045; SI-NEXT:    s_waitcnt vmcnt(0)
1046; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1047; SI-NEXT:    s_waitcnt lgkmcnt(0)
1048; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1049; SI-NEXT:    s_endpgm
1050;
1051; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1052; VI:       ; %bb.0:
1053; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1054; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1055; VI-NEXT:    s_mov_b32 s7, 0xf000
1056; VI-NEXT:    s_mov_b32 s6, -1
1057; VI-NEXT:    s_mov_b32 s2, s6
1058; VI-NEXT:    s_mov_b32 s3, s7
1059; VI-NEXT:    s_waitcnt lgkmcnt(0)
1060; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1061; VI-NEXT:    s_waitcnt vmcnt(0)
1062; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1063; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
1064; VI-NEXT:    v_cndmask_b32_e64 v0, 32, v1, s[0:1]
1065; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
1066; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1067; VI-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[0:1]
1068; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1069; VI-NEXT:    s_endpgm
1070;
1071; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1072; EG:       ; %bb.0:
1073; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1074; EG-NEXT:    TEX 0 @6
1075; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1076; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1077; EG-NEXT:    CF_END
1078; EG-NEXT:    PAD
1079; EG-NEXT:    Fetch clause starting at 6:
1080; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1081; EG-NEXT:    ALU clause starting at 8:
1082; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1083; EG-NEXT:    ALU clause starting at 9:
1084; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1085; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1086; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1087; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1088; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1089; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1090; EG-NEXT:     LSHL T0.X, PV.W, PS,
1091; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1092; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1093; EG-NEXT:     MOV T0.Y, 0.0,
1094; EG-NEXT:     MOV * T0.Z, 0.0,
1095; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1096; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; No workitem-id gep here: every workitem loads from the same address.
1097  %val = load i16, i16 addrspace(1)* %valptr
1098  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
1099  %cmp = icmp eq i16 %val, 0
1100  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1101  store i16 %sel, i16 addrspace(1)* %out
1102  ret void
1103}
1104
1105; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the function below does index %valptr by the workitem id. The
; load without a gep is in v_ctlz_i16_sel_eq_neg1 above, so this FIXME
; presumably belongs to that test -- confirm and move it if so.
;
; i7 (non-power-of-two width) variant of the sel_eq_neg1 pattern: each workitem
; loads one i7 from %valptr indexed by workitem id x and stores
; select(%val == 0, -1, ctlz.i7(%val)) to %out.
; The SI and VI assertions are a byte load, a bare v_ffbh_u32, an AND with
; 0x7f and a byte store. The EG assertions mask the FFBH_UINT result with 127
; and use a 255 write mask for the MSKOR store.
; NOTE(review): no assertion shows a correction from the 32-bit count to a
; 7-bit count before the 0x7f mask -- confirm the expected output is correct
; for nonzero inputs.
1106define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1107; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1108; SI:       ; %bb.0:
1109; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1110; SI-NEXT:    s_mov_b32 s3, 0xf000
1111; SI-NEXT:    v_mov_b32_e32 v1, 0
1112; SI-NEXT:    s_mov_b32 s6, 0
1113; SI-NEXT:    s_mov_b32 s7, s3
1114; SI-NEXT:    s_waitcnt lgkmcnt(0)
1115; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1116; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1117; SI-NEXT:    s_mov_b32 s2, -1
1118; SI-NEXT:    s_waitcnt vmcnt(0)
1119; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1120; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1121; SI-NEXT:    s_waitcnt lgkmcnt(0)
1122; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1123; SI-NEXT:    s_endpgm
1124;
1125; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1126; VI:       ; %bb.0:
1127; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1128; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1129; VI-NEXT:    s_mov_b32 s7, 0xf000
1130; VI-NEXT:    s_mov_b32 s6, -1
1131; VI-NEXT:    s_waitcnt lgkmcnt(0)
1132; VI-NEXT:    v_mov_b32_e32 v1, s1
1133; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1134; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1135; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1136; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1137; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1138; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1139; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1140; VI-NEXT:    s_endpgm
1141;
1142; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1143; EG:       ; %bb.0:
1144; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1145; EG-NEXT:    TEX 0 @6
1146; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1147; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1148; EG-NEXT:    CF_END
1149; EG-NEXT:    PAD
1150; EG-NEXT:    Fetch clause starting at 6:
1151; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1152; EG-NEXT:    ALU clause starting at 8:
1153; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1154; EG-NEXT:    ALU clause starting at 9:
1155; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1156; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1157; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1158; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1159; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1160; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1161; EG-NEXT:     LSHL T0.X, PV.W, PS,
1162; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1163; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1164; EG-NEXT:     MOV T0.Y, 0.0,
1165; EG-NEXT:     MOV * T0.Z, 0.0,
1166; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1167; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1168  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Per-workitem address: this test does use a gep on the workitem id.
1169  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1170  %val = load i7, i7 addrspace(1)* %valptr.gep
1171  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
1172  %cmp = icmp eq i7 %val, 0
1173  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1174  store i7 %sel, i7 addrspace(1)* %out
1175  ret void
1176}
1177