1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7
8declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
9declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
10declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
11
12declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
13declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
14declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
15
16declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
17declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
18declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone
19
20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
21
; Kernel test: count trailing zeros of the 32-bit kernel argument %val and
; store the i32 result through %out. The intrinsic is called with `i1 false`
; as its second operand; in the expected code below every checked target
; bounds the find-first-set-bit result with the constant 32 (unsigned min, or
; a conditional select on the r600 target).
22define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
23; SI-LABEL: s_cttz_i32:
24; SI:       ; %bb.0:
25; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    s_ff1_i32_b32 s2, s2
30; SI-NEXT:    s_min_u32 s4, s2, 32
31; SI-NEXT:    s_mov_b32 s2, -1
32; SI-NEXT:    v_mov_b32_e32 v0, s4
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: s_cttz_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
39; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
40; VI-NEXT:    s_mov_b32 s3, 0xf000
41; VI-NEXT:    s_mov_b32 s2, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_ff1_i32_b32 s4, s4
44; VI-NEXT:    s_min_u32 s4, s4, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s4
46; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_cttz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBL_INT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60;
61; GFX10-LABEL: s_cttz_i32:
62; GFX10:       ; %bb.0:
63; GFX10-NEXT:    s_clause 0x1
64; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
65; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
66; GFX10-NEXT:    v_mov_b32_e32 v0, 0
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    s_ff1_i32_b32 s0, s4
69; GFX10-NEXT:    s_min_u32 s0, s0, 32
70; GFX10-NEXT:    v_mov_b32_e32 v1, s0
71; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
72; GFX10-NEXT:    s_endpgm
73;
74; GFX10-GISEL-LABEL: s_cttz_i32:
75; GFX10-GISEL:       ; %bb.0:
76; GFX10-GISEL-NEXT:    s_clause 0x1
77; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
78; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
79; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
80; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX10-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
82; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
83; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
84; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
85; GFX10-GISEL-NEXT:    s_endpgm
86  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
87  store i32 %cttz, i32 addrspace(1)* %out, align 4
88  ret void
89}
90
; Kernel test: each work-item loads one i32 from %valptr, indexed by its
; workitem id x, computes cttz on it (second intrinsic operand `i1 false`),
; and stores the result through %out. Note the store address is %out itself;
; it is not indexed by the work-item id. The expected code uses the vector
; find-first-bit instruction followed by a bound against 32.
91define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
92; SI-LABEL: v_cttz_i32:
93; SI:       ; %bb.0:
94; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
95; SI-NEXT:    s_mov_b32 s3, 0xf000
96; SI-NEXT:    s_mov_b32 s6, 0
97; SI-NEXT:    s_mov_b32 s7, s3
98; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
99; SI-NEXT:    v_mov_b32_e32 v1, 0
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
102; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt vmcnt(0)
105; SI-NEXT:    v_ffbl_b32_e32 v0, v0
106; SI-NEXT:    v_min_u32_e32 v0, 32, v0
107; SI-NEXT:    s_waitcnt lgkmcnt(0)
108; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
109; SI-NEXT:    s_endpgm
110;
111; VI-LABEL: v_cttz_i32:
112; VI:       ; %bb.0:
113; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
114; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
115; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v1, s3
118; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
119; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
120; VI-NEXT:    flat_load_dword v0, v[0:1]
121; VI-NEXT:    s_mov_b32 s3, 0xf000
122; VI-NEXT:    s_mov_b32 s2, -1
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_ffbl_b32_e32 v0, v0
125; VI-NEXT:    v_min_u32_e32 v0, 32, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
127; VI-NEXT:    s_endpgm
128;
129; EG-LABEL: v_cttz_i32:
130; EG:       ; %bb.0:
131; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
132; EG-NEXT:    TEX 0 @6
133; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
134; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
135; EG-NEXT:    CF_END
136; EG-NEXT:    PAD
137; EG-NEXT:    Fetch clause starting at 6:
138; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
139; EG-NEXT:    ALU clause starting at 8:
140; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
141; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
143; EG-NEXT:    ALU clause starting at 11:
144; EG-NEXT:     FFBL_INT * T0.W, T0.X,
145; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
146; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
147; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
148;
149; GFX10-LABEL: v_cttz_i32:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
152; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
153; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
154; GFX10-NEXT:    v_mov_b32_e32 v1, 0
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
157; GFX10-NEXT:    s_waitcnt vmcnt(0)
158; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
159; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
160; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
161; GFX10-NEXT:    s_endpgm
162;
163; GFX10-GISEL-LABEL: v_cttz_i32:
164; GFX10-GISEL:       ; %bb.0:
165; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
166; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
168; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
169; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
171; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
172; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
173; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
174; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
175; GFX10-GISEL-NEXT:    s_endpgm
176  %tid = call i32 @llvm.amdgcn.workitem.id.x()
177  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
178  %val = load i32, i32 addrspace(1)* %in.gep, align 4
179  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
180  store i32 %cttz, i32 addrspace(1)* %out, align 4
181  ret void
182}
183
; Kernel test: <2 x i32> variant of the per-work-item cttz test. Each
; work-item loads a two-element vector from %valptr (indexed by workitem id x),
; applies llvm.cttz.v2i32 with `i1 false`, and stores the vector through %out
; (not indexed). The expected code handles each element separately with a
; find-first-bit instruction and a bound against 32.
184define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
185; SI-LABEL: v_cttz_v2i32:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
188; SI-NEXT:    s_mov_b32 s3, 0xf000
189; SI-NEXT:    s_mov_b32 s6, 0
190; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
191; SI-NEXT:    v_mov_b32_e32 v1, 0
192; SI-NEXT:    s_mov_b32 s7, s3
193; SI-NEXT:    s_waitcnt lgkmcnt(0)
194; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
195; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
196; SI-NEXT:    s_mov_b32 s2, -1
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    v_ffbl_b32_e32 v1, v1
199; SI-NEXT:    v_ffbl_b32_e32 v0, v0
200; SI-NEXT:    v_min_u32_e32 v1, 32, v1
201; SI-NEXT:    v_min_u32_e32 v0, 32, v0
202; SI-NEXT:    s_waitcnt lgkmcnt(0)
203; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
204; SI-NEXT:    s_endpgm
205;
206; VI-LABEL: v_cttz_v2i32:
207; VI:       ; %bb.0:
208; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
209; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
210; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
211; VI-NEXT:    s_waitcnt lgkmcnt(0)
212; VI-NEXT:    v_mov_b32_e32 v1, s3
213; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
214; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
215; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
216; VI-NEXT:    s_mov_b32 s3, 0xf000
217; VI-NEXT:    s_mov_b32 s2, -1
218; VI-NEXT:    s_waitcnt vmcnt(0)
219; VI-NEXT:    v_ffbl_b32_e32 v1, v1
220; VI-NEXT:    v_ffbl_b32_e32 v0, v0
221; VI-NEXT:    v_min_u32_e32 v1, 32, v1
222; VI-NEXT:    v_min_u32_e32 v0, 32, v0
223; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
224; VI-NEXT:    s_endpgm
225;
226; EG-LABEL: v_cttz_v2i32:
227; EG:       ; %bb.0:
228; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
229; EG-NEXT:    TEX 0 @6
230; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
231; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
232; EG-NEXT:    CF_END
233; EG-NEXT:    PAD
234; EG-NEXT:    Fetch clause starting at 6:
235; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
236; EG-NEXT:    ALU clause starting at 8:
237; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
238; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
239; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
240; EG-NEXT:    ALU clause starting at 11:
241; EG-NEXT:     FFBL_INT * T0.W, T0.Y,
242; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
243; EG-NEXT:     FFBL_INT * T0.W, T0.X,
244; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
245; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
246; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
247; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
248;
249; GFX10-LABEL: v_cttz_v2i32:
250; GFX10:       ; %bb.0:
251; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
252; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
253; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
254; GFX10-NEXT:    v_mov_b32_e32 v2, 0
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
257; GFX10-NEXT:    s_waitcnt vmcnt(0)
258; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
259; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
260; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
261; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
262; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
263; GFX10-NEXT:    s_endpgm
264;
265; GFX10-GISEL-LABEL: v_cttz_v2i32:
266; GFX10-GISEL:       ; %bb.0:
267; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
268; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
269; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
270; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
271; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
273; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
274; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
275; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
276; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
277; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
278; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
279; GFX10-GISEL-NEXT:    s_endpgm
280  %tid = call i32 @llvm.amdgcn.workitem.id.x()
281  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
282  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
283  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
284  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
285  ret void
286}
287
; Kernel test: <4 x i32> variant of the per-work-item cttz test. Each
; work-item loads a four-element vector from %valptr (indexed by workitem id
; x), applies llvm.cttz.v4i32 with `i1 false`, and stores the vector through
; %out (not indexed). The expected code processes the four elements
; individually: one find-first-bit plus one bound against 32 per element.
288define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
289; SI-LABEL: v_cttz_v4i32:
290; SI:       ; %bb.0:
291; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
292; SI-NEXT:    s_mov_b32 s3, 0xf000
293; SI-NEXT:    s_mov_b32 s6, 0
294; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
295; SI-NEXT:    v_mov_b32_e32 v1, 0
296; SI-NEXT:    s_mov_b32 s7, s3
297; SI-NEXT:    s_waitcnt lgkmcnt(0)
298; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
299; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
300; SI-NEXT:    s_mov_b32 s2, -1
301; SI-NEXT:    s_waitcnt vmcnt(0)
302; SI-NEXT:    v_ffbl_b32_e32 v3, v3
303; SI-NEXT:    v_ffbl_b32_e32 v2, v2
304; SI-NEXT:    v_ffbl_b32_e32 v1, v1
305; SI-NEXT:    v_ffbl_b32_e32 v0, v0
306; SI-NEXT:    v_min_u32_e32 v3, 32, v3
307; SI-NEXT:    v_min_u32_e32 v2, 32, v2
308; SI-NEXT:    v_min_u32_e32 v1, 32, v1
309; SI-NEXT:    v_min_u32_e32 v0, 32, v0
310; SI-NEXT:    s_waitcnt lgkmcnt(0)
311; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
312; SI-NEXT:    s_endpgm
313;
314; VI-LABEL: v_cttz_v4i32:
315; VI:       ; %bb.0:
316; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
317; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
318; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
319; VI-NEXT:    s_waitcnt lgkmcnt(0)
320; VI-NEXT:    v_mov_b32_e32 v1, s3
321; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
322; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
323; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
324; VI-NEXT:    s_mov_b32 s3, 0xf000
325; VI-NEXT:    s_mov_b32 s2, -1
326; VI-NEXT:    s_waitcnt vmcnt(0)
327; VI-NEXT:    v_ffbl_b32_e32 v3, v3
328; VI-NEXT:    v_ffbl_b32_e32 v2, v2
329; VI-NEXT:    v_ffbl_b32_e32 v1, v1
330; VI-NEXT:    v_ffbl_b32_e32 v0, v0
331; VI-NEXT:    v_min_u32_e32 v3, 32, v3
332; VI-NEXT:    v_min_u32_e32 v2, 32, v2
333; VI-NEXT:    v_min_u32_e32 v1, 32, v1
334; VI-NEXT:    v_min_u32_e32 v0, 32, v0
335; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
336; VI-NEXT:    s_endpgm
337;
338; EG-LABEL: v_cttz_v4i32:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
350; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
351; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
352; EG-NEXT:    ALU clause starting at 11:
353; EG-NEXT:     FFBL_INT * T1.W, T0.W,
354; EG-NEXT:     FFBL_INT T2.W, T0.Z,
355; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
356; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
357; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
358; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
359; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
360; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
361; EG-NEXT:     FFBL_INT * T1.W, T0.X,
362; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
363; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
365; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
366;
367; GFX10-LABEL: v_cttz_v4i32:
368; GFX10:       ; %bb.0:
369; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
370; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
371; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
372; GFX10-NEXT:    v_mov_b32_e32 v4, 0
373; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
375; GFX10-NEXT:    s_waitcnt vmcnt(0)
376; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
377; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
378; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
379; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
380; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
381; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
382; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
383; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
384; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
385; GFX10-NEXT:    s_endpgm
386;
387; GFX10-GISEL-LABEL: v_cttz_v4i32:
388; GFX10-GISEL:       ; %bb.0:
389; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
390; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
391; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
392; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
393; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
395; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
396; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
397; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
398; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
399; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
400; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
401; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
402; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
403; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
404; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
405; GFX10-GISEL-NEXT:    s_endpgm
406  %tid = call i32 @llvm.amdgcn.workitem.id.x()
407  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
408  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
409  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
410  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
411  ret void
412}
413
; Kernel test: cttz on an i8. The byte is loaded directly from %valptr (this
; test does not use the work-item id), passed to llvm.cttz.i8 with `i1 false`,
; and the i8 result is stored through %out.
; In the expected code the loaded byte is ORed with 0x100 before the 32-bit
; find-first-bit instruction, and no separate min/select follows; presumably
; the extra bit 8 is what makes a zero byte produce 8 -- confirm against the
; legalizer if this matters.
414define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
415; SI-LABEL: v_cttz_i8:
416; SI:       ; %bb.0:
417; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
418; SI-NEXT:    s_mov_b32 s3, 0xf000
419; SI-NEXT:    s_mov_b32 s2, -1
420; SI-NEXT:    s_mov_b32 s6, s2
421; SI-NEXT:    s_mov_b32 s7, s3
422; SI-NEXT:    s_waitcnt lgkmcnt(0)
423; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
424; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
425; SI-NEXT:    s_waitcnt vmcnt(0)
426; SI-NEXT:    v_or_b32_e32 v0, 0x100, v0
427; SI-NEXT:    v_ffbl_b32_e32 v0, v0
428; SI-NEXT:    s_waitcnt lgkmcnt(0)
429; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
430; SI-NEXT:    s_endpgm
431;
432; VI-LABEL: v_cttz_i8:
433; VI:       ; %bb.0:
434; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
435; VI-NEXT:    s_mov_b32 s3, 0xf000
436; VI-NEXT:    s_mov_b32 s2, -1
437; VI-NEXT:    s_mov_b32 s6, s2
438; VI-NEXT:    s_mov_b32 s7, s3
439; VI-NEXT:    s_waitcnt lgkmcnt(0)
440; VI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
441; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
442; VI-NEXT:    s_waitcnt vmcnt(0)
443; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
444; VI-NEXT:    v_ffbl_b32_e32 v0, v0
445; VI-NEXT:    s_waitcnt lgkmcnt(0)
446; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
447; VI-NEXT:    s_endpgm
448;
449; EG-LABEL: v_cttz_i8:
450; EG:       ; %bb.0:
451; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
452; EG-NEXT:    TEX 0 @6
453; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
454; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
455; EG-NEXT:    CF_END
456; EG-NEXT:    PAD
457; EG-NEXT:    Fetch clause starting at 6:
458; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
459; EG-NEXT:    ALU clause starting at 8:
460; EG-NEXT:     MOV * T0.X, KC0[2].Z,
461; EG-NEXT:    ALU clause starting at 9:
462; EG-NEXT:     OR_INT * T0.W, T0.X, literal.x,
463; EG-NEXT:    256(3.587324e-43), 0(0.000000e+00)
464; EG-NEXT:     FFBL_INT T0.W, PV.W,
465; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
466; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
467; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
468; EG-NEXT:     LSHL * T1.W, PS, literal.y,
469; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
470; EG-NEXT:     LSHL T0.X, PV.W, PS,
471; EG-NEXT:     LSHL * T0.W, literal.x, PS,
472; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
473; EG-NEXT:     MOV T0.Y, 0.0,
474; EG-NEXT:     MOV * T0.Z, 0.0,
475; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
476; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
477;
478; GFX10-LABEL: v_cttz_i8:
479; GFX10:       ; %bb.0:
480; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
481; GFX10-NEXT:    v_mov_b32_e32 v0, 0
482; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
483; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
485; GFX10-NEXT:    s_waitcnt vmcnt(0)
486; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
487; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
488; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
489; GFX10-NEXT:    s_endpgm
490;
491; GFX10-GISEL-LABEL: v_cttz_i8:
492; GFX10-GISEL:       ; %bb.0:
493; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
494; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
495; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
496; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
498; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
499; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
500; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
501; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
502; GFX10-GISEL-NEXT:    s_endpgm
503  %val = load i8, i8 addrspace(1)* %valptr
504  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
505  store i8 %cttz, i8 addrspace(1)* %out
506  ret void
507}
508
; Kernel test: cttz of a 64-bit kernel argument. %val follows an unnamed
; [8 x i32] argument in the signature; the full i64 result is stored through
; %out. The intrinsic's second operand is `i1 false`.
; In the expected code the SelectionDAG-based runs work on the two 32-bit
; halves: find-first-set on the high half, add 32 (bounded or clamped so it
; cannot wrap), then a three-way unsigned min with the low-half result and 64.
; The GlobalISel run instead expects a single 64-bit find-first-set followed
; by an unsigned min with 64.
509define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
510; SI-LABEL: s_cttz_i64:
511; SI:       ; %bb.0:
512; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
513; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
514; SI-NEXT:    s_mov_b32 s3, 0xf000
515; SI-NEXT:    s_mov_b32 s2, -1
516; SI-NEXT:    s_waitcnt lgkmcnt(0)
517; SI-NEXT:    s_ff1_i32_b32 s5, s5
518; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
519; SI-NEXT:    s_add_i32 s5, s5, 32
520; SI-NEXT:    s_ff1_i32_b32 s4, s4
521; SI-NEXT:    v_mov_b32_e32 v0, s5
522; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
523; SI-NEXT:    v_mov_b32_e32 v1, 0
524; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
525; SI-NEXT:    s_endpgm
526;
527; VI-LABEL: s_cttz_i64:
528; VI:       ; %bb.0:
529; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
530; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
531; VI-NEXT:    s_mov_b32 s3, 0xf000
532; VI-NEXT:    s_mov_b32 s2, -1
533; VI-NEXT:    v_mov_b32_e32 v1, 0
534; VI-NEXT:    s_waitcnt lgkmcnt(0)
535; VI-NEXT:    s_ff1_i32_b32 s5, s5
536; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s5, 32 clamp
537; VI-NEXT:    s_ff1_i32_b32 s4, s4
538; VI-NEXT:    v_min3_u32 v0, s4, v0, 64
539; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
540; VI-NEXT:    s_endpgm
541;
542; EG-LABEL: s_cttz_i64:
543; EG:       ; %bb.0:
544; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
545; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
546; EG-NEXT:    CF_END
547; EG-NEXT:    PAD
548; EG-NEXT:    ALU clause starting at 4:
549; EG-NEXT:     FFBL_INT * T0.W, KC0[5].X,
550; EG-NEXT:     CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
551; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
552; EG-NEXT:     FFBL_INT T1.W, KC0[4].W,
553; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
554; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
555; EG-NEXT:     CNDE_INT T0.X, KC0[4].W, PS, PV.W,
556; EG-NEXT:     MOV T0.Y, 0.0,
557; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
558; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
559;
560; GFX10-LABEL: s_cttz_i64:
561; GFX10:       ; %bb.0:
562; GFX10-NEXT:    s_clause 0x1
563; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
564; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
565; GFX10-NEXT:    v_mov_b32_e32 v1, 0
566; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
568; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
569; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
570; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
571; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
572; GFX10-NEXT:    s_endpgm
573;
574; GFX10-GISEL-LABEL: s_cttz_i64:
575; GFX10-GISEL:       ; %bb.0:
576; GFX10-GISEL-NEXT:    s_clause 0x1
577; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
578; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
579; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
580; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
581; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
582; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
583; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
584; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
585; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
586; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
587; GFX10-GISEL-NEXT:    s_endpgm
588  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
589  store i64 %cttz, i64 addrspace(1)* %out
590  ret void
591}
592
; Kernel test: cttz of a 64-bit kernel argument whose result is truncated to
; i32 before being stored through %out. Unlike s_cttz_i64 there is no padding
; argument between %out and %val, and %out is an i32 pointer.
; The expected code computes the count the same way as the untruncated 64-bit
; scalar test but stores only a single dword.
593define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
594; SI-LABEL: s_cttz_i64_trunc:
595; SI:       ; %bb.0:
596; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
597; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
598; SI-NEXT:    s_mov_b32 s3, 0xf000
599; SI-NEXT:    s_mov_b32 s2, -1
600; SI-NEXT:    s_waitcnt lgkmcnt(0)
601; SI-NEXT:    s_ff1_i32_b32 s5, s5
602; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
603; SI-NEXT:    s_add_i32 s5, s5, 32
604; SI-NEXT:    s_ff1_i32_b32 s4, s4
605; SI-NEXT:    v_mov_b32_e32 v0, s5
606; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
607; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
608; SI-NEXT:    s_endpgm
609;
610; VI-LABEL: s_cttz_i64_trunc:
611; VI:       ; %bb.0:
612; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
613; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
614; VI-NEXT:    s_mov_b32 s3, 0xf000
615; VI-NEXT:    s_mov_b32 s2, -1
616; VI-NEXT:    s_waitcnt lgkmcnt(0)
617; VI-NEXT:    s_ff1_i32_b32 s5, s5
618; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s5, 32 clamp
619; VI-NEXT:    s_ff1_i32_b32 s4, s4
620; VI-NEXT:    v_min3_u32 v0, s4, v0, 64
621; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
622; VI-NEXT:    s_endpgm
623;
624; EG-LABEL: s_cttz_i64_trunc:
625; EG:       ; %bb.0:
626; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
627; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
628; EG-NEXT:    CF_END
629; EG-NEXT:    PAD
630; EG-NEXT:    ALU clause starting at 4:
631; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
632; EG-NEXT:     CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
633; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
634; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
635; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
636; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
637; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
638; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
639; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
640;
641; GFX10-LABEL: s_cttz_i64_trunc:
642; GFX10:       ; %bb.0:
643; GFX10-NEXT:    s_clause 0x1
644; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
645; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
646; GFX10-NEXT:    v_mov_b32_e32 v1, 0
647; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
649; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
650; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
651; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
652; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
653; GFX10-NEXT:    s_endpgm
654;
655; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
656; GFX10-GISEL:       ; %bb.0:
657; GFX10-GISEL-NEXT:    s_clause 0x1
658; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
659; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
660; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
661; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
663; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
664; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
665; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
666; GFX10-GISEL-NEXT:    s_endpgm
667  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
668  %trunc = trunc i64 %cttz to i32
669  store i32 %trunc, i32 addrspace(1)* %out
670  ret void
671}
672
; Kernel test: per-work-item 64-bit cttz. Both the load from %in and the
; store to %out are indexed by workitem id x (via %in.gep / %out.gep), unlike
; the 32-bit vector tests above which store to an un-indexed %out.
; The expected code runs find-first-bit on each 32-bit half, adds 32 to the
; high-half result (bounded or clamped), and takes the unsigned minimum of
; the two results and 64. The GlobalISel run expects two separate v_min_u32
; instructions where the other GCN runs expect one v_min3_u32.
673define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
674; SI-LABEL: v_cttz_i64:
675; SI:       ; %bb.0:
676; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
677; SI-NEXT:    s_mov_b32 s7, 0xf000
678; SI-NEXT:    s_mov_b32 s6, 0
679; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
680; SI-NEXT:    v_mov_b32_e32 v1, 0
681; SI-NEXT:    s_waitcnt lgkmcnt(0)
682; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
683; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
684; SI-NEXT:    s_waitcnt vmcnt(0)
685; SI-NEXT:    v_ffbl_b32_e32 v3, v3
686; SI-NEXT:    v_min_u32_e32 v3, 0xffffffdf, v3
687; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
688; SI-NEXT:    v_ffbl_b32_e32 v2, v2
689; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
690; SI-NEXT:    v_mov_b32_e32 v3, v1
691; SI-NEXT:    s_waitcnt lgkmcnt(0)
692; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
693; SI-NEXT:    s_endpgm
694;
695; VI-LABEL: v_cttz_i64:
696; VI:       ; %bb.0:
697; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
698; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
699; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
700; VI-NEXT:    v_mov_b32_e32 v2, 0
701; VI-NEXT:    s_waitcnt lgkmcnt(0)
702; VI-NEXT:    v_mov_b32_e32 v1, s3
703; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
704; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
705; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
706; VI-NEXT:    v_mov_b32_e32 v4, s1
707; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
708; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
709; VI-NEXT:    s_waitcnt vmcnt(0)
710; VI-NEXT:    v_ffbl_b32_e32 v1, v1
711; VI-NEXT:    v_add_u32_e64 v1, s[0:1], v1, 32 clamp
712; VI-NEXT:    v_ffbl_b32_e32 v0, v0
713; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
714; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
715; VI-NEXT:    s_endpgm
716;
717; EG-LABEL: v_cttz_i64:
718; EG:       ; %bb.0:
719; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
720; EG-NEXT:    TEX 0 @6
721; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
722; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
723; EG-NEXT:    CF_END
724; EG-NEXT:    PAD
725; EG-NEXT:    Fetch clause starting at 6:
726; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
727; EG-NEXT:    ALU clause starting at 8:
728; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
729; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
730; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
731; EG-NEXT:    ALU clause starting at 11:
732; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
733; EG-NEXT:     CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
734; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
735; EG-NEXT:     FFBL_INT T2.W, T0.X,
736; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
737; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
738; EG-NEXT:     CNDE_INT T0.X, T0.X, PS, PV.W,
739; EG-NEXT:     MOV T0.Y, 0.0,
740; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
741; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
742; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
743;
744; GFX10-LABEL: v_cttz_i64:
745; GFX10:       ; %bb.0:
746; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
747; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
748; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
749; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
751; GFX10-NEXT:    s_waitcnt vmcnt(0)
752; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
753; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
754; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
755; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
756; GFX10-NEXT:    v_mov_b32_e32 v1, 0
757; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
758; GFX10-NEXT:    s_endpgm
759;
760; GFX10-GISEL-LABEL: v_cttz_i64:
761; GFX10-GISEL:       ; %bb.0:
762; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
763; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
764; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
765; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
767; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
768; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
769; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
770; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
771; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
772; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
773; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
774; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
775; GFX10-GISEL-NEXT:    s_endpgm
776  %tid = call i32 @llvm.amdgcn.workitem.id.x()
777  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
778  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
779  %val = load i64, i64 addrspace(1)* %in.gep
780  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
781  store i64 %cttz, i64 addrspace(1)* %out.gep
782  ret void
783}
784
; 64-bit cttz whose result is truncated to 32 bits.  Each work-item loads an
; i64 from %in (indexed by the work-item id), calls llvm.cttz.i64 with the
; zero-is-poison flag cleared, truncates the count to i32 and stores it to
; %out (also indexed by the work-item id).
; The amdgcn checks expect a v_ffbl_b32 on each 32-bit half of the loaded
; value, a saturating add of 32 applied to one of the two counts, and a final
; unsigned min against 64.  The R600 checks expect FFBL_INT combined with
; CNDE_INT selects instead.
define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_cttz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v4
; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_min3_u32 v0, v3, v0, 64
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v2
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_min3_u32 v0, v1, v0, 64
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T1.Y,
; EG-NEXT:     CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:     FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, T1.X, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Per-work-item addressing: the input is indexed as i64, the output as i32.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  ; Only the low 32 bits of the 64-bit count are stored.
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}
897
; cttz.i32 guarded by "select (val == 0), -1, cttz".  Each work-item loads an
; i32 from %valptr (indexed by the work-item id) and stores the selected value
; to %out (not indexed).
; The SI, VI and SelectionDAG GFX10 checks contain only v_ffbl_b32 between the
; load and the store, i.e. no compare or select is expected there
; (presumably because v_ffbl_b32 already yields -1 for a zero input; confirm
; against the ISA manual).  The GFX10-GISEL checks still expect v_min_u32,
; v_cmp_eq_u32 and v_cndmask_b32, and the R600 checks expect two CNDE_INT ops.
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Only the load address depends on the work-item id; the store goes to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  ; The select keys on the *input* being zero and substitutes -1.
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
993
; Inverted-predicate form of the "-1 on zero input" pattern:
; "select (val != 0), cttz, -1".  Each work-item loads an i32 from %valptr
; (indexed by the work-item id) and stores the selected value to %out (not
; indexed).
; The SI, VI and SelectionDAG GFX10 checks contain only v_ffbl_b32 between the
; load and the store, so no compare or select is expected there.  The
; GFX10-GISEL checks still expect v_min_u32, v_cmp_ne_u32 and v_cndmask_b32,
; and the R600 checks expect two CNDE_INT ops.
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Only the load address depends on the work-item id; the store goes to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  ; "ne" predicate with the select arms swapped: -1 is still the zero-input arm.
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1089
1090; TODO: Should be able to eliminate select here as well.
; Variant in which the select keys on the cttz *result* rather than on the
; input: "select (cttz == 32), -1, cttz".  Each work-item loads an i32 from
; %valptr (indexed by the work-item id) and stores the selected value to %out
; (not indexed).
; The checks for every amdgcn run line still expect v_min_u32 with 32, a
; v_cmp against 32 and a v_cndmask_b32 after v_ffbl_b32; the R600 checks
; expect SETE_INT followed by CNDE_INT.  In other words, the compare and
; select are currently not folded away for this form.
define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Only the load address depends on the work-item id; the store goes to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is against the bit width (32) of the cttz result, not %val.
  %cmp = icmp eq i32 %cttz, 32
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1197
; Inverted-predicate form of the "compare result against bit width" pattern:
; "select (cttz != 32), cttz, -1".  Each work-item loads an i32 from %valptr
; (indexed by the work-item id) and stores the selected value to %out (not
; indexed).
; The checks for every amdgcn run line still expect v_min_u32 with 32,
; v_cmp_ne_u32 against 32 and v_cndmask_b32 after v_ffbl_b32; the R600 checks
; expect SETNE_INT followed by CNDE_INT.  The compare and select are therefore
; currently not folded away for this form.
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Only the load address depends on the work-item id; the store goes to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  ; The compare is against the bit width (32) of the cttz result, not %val.
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1304
 ; i8 form of the "-1 on zero input" pattern: "select (val == 0), -1, cttz".
 ; Each work-item loads one byte from %valptr (indexed by the work-item id),
 ; calls llvm.cttz.i8 and stores the selected i8 to %out (not indexed).
 ; The SI, VI and SelectionDAG GFX10 checks contain only a ubyte load,
 ; v_ffbl_b32 and a byte store, so no compare or select is expected there.
 ; The GFX10-GISEL checks still expect an OR with 0x100 ahead of v_ffbl_b32
 ; plus an SDWA compare and v_cndmask_b32.  The R600 checks expect FFBL_INT
 ; followed by the mask/shift sequence feeding a MEM_RAT MSKOR store.
 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i8_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i8_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i8_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBL_INT T0.W, T0.X,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; Only the load address depends on the work-item id; the store goes to %out.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  ; The select keys on the *input* being zero and substitutes -1.
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}
1406
 ; i16 form of the "-1 on zero input" pattern: "select (val == 0), -1, cttz".
 ; Unlike the neighbouring i8/i32 tests, this kernel does not call
 ; llvm.amdgcn.workitem.id.x: it loads directly from %valptr and stores
 ; directly to %out, so every work-item uses the same addresses.
 ; The SI checks contain only a ushort load, v_ffbl_b32 and a short store.
 ; The VI and SelectionDAG GFX10 checks still expect an OR with 0x10000,
 ; v_ffbl_b32, v_min_u32, v_cmp_ne_u16 and v_cndmask_b32; the GFX10-GISEL
 ; checks expect a similar OR/compare/select sequence with an AND 0xffff.
 ; NOTE(review): the FIXME that follows this function ("load without gep")
 ; appears to describe this kernel rather than the one after it - confirm.
 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i16_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT:    v_ffbl_b32_e32 v2, v2
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i16_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBL_INT T0.W, T0.X,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  ; No getelementptr here: the load address is the kernel argument itself.
  %val = load i16, i16 addrspace(1)* %valptr
  ; The i1 false flag asks for a defined result when %val is zero.
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  ; The select keys on the *input* being zero and substitutes -1.
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}
1510
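1511; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the function below does index its load through a workitem-id
; getelementptr, so "load without gep" does not describe it as written. Confirm
; whether this FIXME still applies here or belongs to a different function.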
; Kernel under test: loads an i7 from %valptr indexed by the workitem id,
; computes llvm.cttz.i7 with the second operand set to false, and stores -1
; instead of the cttz result when the loaded value is 0.
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
1512define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1513; SI-LABEL: v_cttz_i7_sel_eq_neg1:
1514; SI:       ; %bb.0:
1515; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1516; SI-NEXT:    s_mov_b32 s3, 0xf000
1517; SI-NEXT:    v_mov_b32_e32 v1, 0
1518; SI-NEXT:    s_mov_b32 s6, 0
1519; SI-NEXT:    s_mov_b32 s7, s3
1520; SI-NEXT:    s_waitcnt lgkmcnt(0)
1521; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1522; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1523; SI-NEXT:    s_mov_b32 s2, -1
1524; SI-NEXT:    s_waitcnt vmcnt(0)
1525; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1526; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1527; SI-NEXT:    s_waitcnt lgkmcnt(0)
1528; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1529; SI-NEXT:    s_endpgm
1530;
1531; VI-LABEL: v_cttz_i7_sel_eq_neg1:
1532; VI:       ; %bb.0:
1533; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1534; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1535; VI-NEXT:    s_waitcnt lgkmcnt(0)
1536; VI-NEXT:    v_mov_b32_e32 v1, s3
1537; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1538; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1539; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1540; VI-NEXT:    s_mov_b32 s3, 0xf000
1541; VI-NEXT:    s_mov_b32 s2, -1
1542; VI-NEXT:    s_waitcnt vmcnt(0)
1543; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1544; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1545; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1546; VI-NEXT:    s_endpgm
1547;
1548; EG-LABEL: v_cttz_i7_sel_eq_neg1:
1549; EG:       ; %bb.0:
1550; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1551; EG-NEXT:    TEX 0 @6
1552; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1553; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1554; EG-NEXT:    CF_END
1555; EG-NEXT:    PAD
1556; EG-NEXT:    Fetch clause starting at 6:
1557; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1558; EG-NEXT:    ALU clause starting at 8:
1559; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1560; EG-NEXT:    ALU clause starting at 9:
1561; EG-NEXT:     FFBL_INT T0.W, T0.X,
1562; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1563; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1564; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1565; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1566; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1567; EG-NEXT:     LSHL T0.X, PV.W, PS,
1568; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1569; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1570; EG-NEXT:     MOV T0.Y, 0.0,
1571; EG-NEXT:     MOV * T0.Z, 0.0,
1572; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1573; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1574;
1575; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
1576; GFX10:       ; %bb.0:
1577; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1578; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1579; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1580; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1581; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1582; GFX10-NEXT:    s_waitcnt vmcnt(0)
1583; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1584; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1585; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1586; GFX10-NEXT:    s_endpgm
1587;
1588; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
1589; GFX10-GISEL:       ; %bb.0:
1590; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1591; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1592; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1593; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1595; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1596; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1597; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1598; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1599; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1600; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
1601; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1602; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1603; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1604; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1605; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1606; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1607; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1608; GFX10-GISEL-NEXT:    s_endpgm
  ; Observation on the checks above: the SI, VI and GFX10 sequences contain no
  ; compare/select, only v_ffbl_b32 followed by a 0x7f mask, while the
  ; GFX10-GISEL sequence still has v_cmp_eq_u32 + v_cndmask_b32.
  ; NOTE(review): presumably the folded form relies on v_ffbl_b32 producing -1
  ; for a zero input; confirm against the ISA documentation.

  ; Index the input pointer by the workitem id so each lane loads its own i7.
1609  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1610  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1611  %val = load i7, i7 addrspace(1)* %valptr.gep
  ; Second operand is false, so a zero input has a defined cttz result.
1612  %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
  ; Replace the result with -1 (all seven bits set, 0x7f) when the input is 0.
1613  %cmp = icmp eq i7 %val, 0
1614  %sel = select i1 %cmp, i7 -1, i7 %cttz
  ; The result is stored to %out itself; the store is not indexed by %tid.
1615  store i7 %sel, i7 addrspace(1)* %out
1616  ret void
1617}
1618