1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
6
7declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
8
9declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
10declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
11declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
12
13declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
14declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
15declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
16
17declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
18
; Scalar i32 case: %val arrives as a kernel argument. The second ctlz operand
; (i1 true) marks a zero input as producing an undefined result, so the checks
; below expect a bare s_flbit_i32_b32 on the amdgcn runs and FFBH_UINT on the
; r600 run, with no compare/select guarding a zero input.
19define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
20; SI-LABEL: s_ctlz_zero_undef_i32:
21; SI:       ; %bb.0:
22; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
23; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; SI-NEXT:    s_mov_b32 s3, 0xf000
25; SI-NEXT:    s_waitcnt lgkmcnt(0)
26; SI-NEXT:    s_flbit_i32_b32 s4, s2
27; SI-NEXT:    s_mov_b32 s2, -1
28; SI-NEXT:    v_mov_b32_e32 v0, s4
29; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
30; SI-NEXT:    s_endpgm
31;
32; VI-LABEL: s_ctlz_zero_undef_i32:
33; VI:       ; %bb.0:
34; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
35; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
36; VI-NEXT:    s_waitcnt lgkmcnt(0)
37; VI-NEXT:    s_flbit_i32_b32 s2, s2
38; VI-NEXT:    v_mov_b32_e32 v0, s0
39; VI-NEXT:    v_mov_b32_e32 v1, s1
40; VI-NEXT:    v_mov_b32_e32 v2, s2
41; VI-NEXT:    flat_store_dword v[0:1], v2
42; VI-NEXT:    s_endpgm
43;
44; EG-LABEL: s_ctlz_zero_undef_i32:
45; EG:       ; %bb.0:
46; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
47; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
48; EG-NEXT:    CF_END
49; EG-NEXT:    PAD
50; EG-NEXT:    ALU clause starting at 4:
51; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
52; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
53; EG-NEXT:     FFBH_UINT * T1.X, KC0[2].Z,
54;
55; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
56; GFX9-GISEL:       ; %bb.0:
57; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
58; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
59; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
60; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s4
62; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
63; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
64; GFX9-GISEL-NEXT:    s_endpgm
65  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
66  store i32 %ctlz, i32 addrspace(1)* %out, align 4
67  ret void
68}
69
; Per-workitem i32 case: the input is loaded from %valptr indexed by the
; workitem id, run through ctlz with the zero-is-undefined flag (i1 true), and
; stored to %out (not indexed). The checks expect a single v_ffbh_u32 on the
; amdgcn runs and FFBH_UINT on the r600 run, with no zero-input handling.
70define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
71; SI-LABEL: v_ctlz_zero_undef_i32:
72; SI:       ; %bb.0:
73; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
74; SI-NEXT:    s_mov_b32 s3, 0xf000
75; SI-NEXT:    s_mov_b32 s6, 0
76; SI-NEXT:    s_mov_b32 s7, s3
77; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
78; SI-NEXT:    v_mov_b32_e32 v1, 0
79; SI-NEXT:    s_waitcnt lgkmcnt(0)
80; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
81; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
82; SI-NEXT:    s_mov_b32 s2, -1
83; SI-NEXT:    s_waitcnt vmcnt(0)
84; SI-NEXT:    v_ffbh_u32_e32 v0, v0
85; SI-NEXT:    s_waitcnt lgkmcnt(0)
86; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
87; SI-NEXT:    s_endpgm
88;
89; VI-LABEL: v_ctlz_zero_undef_i32:
90; VI:       ; %bb.0:
91; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
92; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
93; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
94; VI-NEXT:    s_waitcnt lgkmcnt(0)
95; VI-NEXT:    v_mov_b32_e32 v1, s3
96; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
97; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
98; VI-NEXT:    flat_load_dword v0, v[0:1]
99; VI-NEXT:    s_waitcnt vmcnt(0)
100; VI-NEXT:    v_ffbh_u32_e32 v2, v0
101; VI-NEXT:    v_mov_b32_e32 v0, s0
102; VI-NEXT:    v_mov_b32_e32 v1, s1
103; VI-NEXT:    flat_store_dword v[0:1], v2
104; VI-NEXT:    s_endpgm
105;
106; EG-LABEL: v_ctlz_zero_undef_i32:
107; EG:       ; %bb.0:
108; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
109; EG-NEXT:    TEX 0 @6
110; EG-NEXT:    ALU 2, @11, KC0[CB0:0-32], KC1[]
111; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
112; EG-NEXT:    CF_END
113; EG-NEXT:    PAD
114; EG-NEXT:    Fetch clause starting at 6:
115; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
116; EG-NEXT:    ALU clause starting at 8:
117; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
118; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
119; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
120; EG-NEXT:    ALU clause starting at 11:
121; EG-NEXT:     FFBH_UINT T0.X, T0.X,
122; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
123; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
124;
125; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
126; GFX9-GISEL:       ; %bb.0:
127; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
128; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
129; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
130; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
131; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
133; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
134; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
135; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
136; GFX9-GISEL-NEXT:    s_endpgm
137  %tid = call i32 @llvm.amdgcn.workitem.id.x()
138  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
139  %val = load i32, i32 addrspace(1)* %in.gep, align 4
140  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
141  store i32 %ctlz, i32 addrspace(1)* %out, align 4
142  ret void
143}
144
; <2 x i32> case: the vector is loaded per workitem and passed to the v2i32
; ctlz intrinsic with the zero-is-undefined flag (i1 true). The checks expect
; the operation to be done per element: two v_ffbh_u32 on the amdgcn runs and
; two FFBH_UINT on the r600 run, followed by one 64-bit store.
145define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
146; SI-LABEL: v_ctlz_zero_undef_v2i32:
147; SI:       ; %bb.0:
148; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
149; SI-NEXT:    s_mov_b32 s3, 0xf000
150; SI-NEXT:    s_mov_b32 s6, 0
151; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
152; SI-NEXT:    v_mov_b32_e32 v1, 0
153; SI-NEXT:    s_mov_b32 s7, s3
154; SI-NEXT:    s_waitcnt lgkmcnt(0)
155; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
156; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
157; SI-NEXT:    s_mov_b32 s2, -1
158; SI-NEXT:    s_waitcnt vmcnt(0)
159; SI-NEXT:    v_ffbh_u32_e32 v1, v1
160; SI-NEXT:    v_ffbh_u32_e32 v0, v0
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
163; SI-NEXT:    s_endpgm
164;
165; VI-LABEL: v_ctlz_zero_undef_v2i32:
166; VI:       ; %bb.0:
167; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
168; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
169; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
170; VI-NEXT:    s_waitcnt lgkmcnt(0)
171; VI-NEXT:    v_mov_b32_e32 v1, s3
172; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
173; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
174; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
175; VI-NEXT:    v_mov_b32_e32 v3, s1
176; VI-NEXT:    v_mov_b32_e32 v2, s0
177; VI-NEXT:    s_waitcnt vmcnt(0)
178; VI-NEXT:    v_ffbh_u32_e32 v1, v1
179; VI-NEXT:    v_ffbh_u32_e32 v0, v0
180; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
181; VI-NEXT:    s_endpgm
182;
183; EG-LABEL: v_ctlz_zero_undef_v2i32:
184; EG:       ; %bb.0:
185; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
186; EG-NEXT:    TEX 0 @6
187; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
188; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
189; EG-NEXT:    CF_END
190; EG-NEXT:    PAD
191; EG-NEXT:    Fetch clause starting at 6:
192; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
193; EG-NEXT:    ALU clause starting at 8:
194; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
195; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
196; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
197; EG-NEXT:    ALU clause starting at 11:
198; EG-NEXT:     FFBH_UINT * T0.Y, T0.Y,
199; EG-NEXT:     FFBH_UINT T0.X, T0.X,
200; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
201; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
202;
203; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
204; GFX9-GISEL:       ; %bb.0:
205; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
206; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
207; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
208; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
209; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
211; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
213; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
214; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
215; GFX9-GISEL-NEXT:    s_endpgm
216  %tid = call i32 @llvm.amdgcn.workitem.id.x()
217  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
218  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
219  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
220  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
221  ret void
222}
223
; <4 x i32> case: the vector is loaded per workitem and passed to the v4i32
; ctlz intrinsic with the zero-is-undefined flag (i1 true). The checks expect
; the operation to be done per element: four v_ffbh_u32 on the amdgcn runs and
; four FFBH_UINT on the r600 run, followed by one 128-bit store.
224define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
225; SI-LABEL: v_ctlz_zero_undef_v4i32:
226; SI:       ; %bb.0:
227; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
228; SI-NEXT:    s_mov_b32 s3, 0xf000
229; SI-NEXT:    s_mov_b32 s6, 0
230; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
231; SI-NEXT:    v_mov_b32_e32 v1, 0
232; SI-NEXT:    s_mov_b32 s7, s3
233; SI-NEXT:    s_waitcnt lgkmcnt(0)
234; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
235; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
236; SI-NEXT:    s_mov_b32 s2, -1
237; SI-NEXT:    s_waitcnt vmcnt(0)
238; SI-NEXT:    v_ffbh_u32_e32 v3, v3
239; SI-NEXT:    v_ffbh_u32_e32 v2, v2
240; SI-NEXT:    v_ffbh_u32_e32 v1, v1
241; SI-NEXT:    v_ffbh_u32_e32 v0, v0
242; SI-NEXT:    s_waitcnt lgkmcnt(0)
243; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
244; SI-NEXT:    s_endpgm
245;
246; VI-LABEL: v_ctlz_zero_undef_v4i32:
247; VI:       ; %bb.0:
248; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
249; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
250; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
251; VI-NEXT:    s_waitcnt lgkmcnt(0)
252; VI-NEXT:    v_mov_b32_e32 v1, s3
253; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
254; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
255; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
256; VI-NEXT:    v_mov_b32_e32 v5, s1
257; VI-NEXT:    v_mov_b32_e32 v4, s0
258; VI-NEXT:    s_waitcnt vmcnt(0)
259; VI-NEXT:    v_ffbh_u32_e32 v3, v3
260; VI-NEXT:    v_ffbh_u32_e32 v2, v2
261; VI-NEXT:    v_ffbh_u32_e32 v1, v1
262; VI-NEXT:    v_ffbh_u32_e32 v0, v0
263; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
264; VI-NEXT:    s_endpgm
265;
266; EG-LABEL: v_ctlz_zero_undef_v4i32:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
278; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
279; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
280; EG-NEXT:    ALU clause starting at 11:
281; EG-NEXT:     FFBH_UINT * T0.W, T0.W,
282; EG-NEXT:     FFBH_UINT * T0.Z, T0.Z,
283; EG-NEXT:     FFBH_UINT * T0.Y, T0.Y,
284; EG-NEXT:     FFBH_UINT T0.X, T0.X,
285; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
286; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
287;
288; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
289; GFX9-GISEL:       ; %bb.0:
290; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
291; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
292; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
293; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
294; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
296; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
297; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
298; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
299; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
300; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
301; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
302; GFX9-GISEL-NEXT:    s_endpgm
303  %tid = call i32 @llvm.amdgcn.workitem.id.x()
304  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
305  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
306  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
307  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
308  ret void
309}
310
; i8 case: ctlz on a byte loaded per workitem, with the zero-is-undefined flag
; (i1 true). The amdgcn checks load the byte with an unsigned-byte load and
; count with the 32-bit v_ffbh_u32; every run then expects the count to be
; reduced by 24 before the byte store (one subtract/add of 24, or -16 followed
; by a 16-bit -8 on the tonga run).
311define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
312; SI-LABEL: v_ctlz_zero_undef_i8:
313; SI:       ; %bb.0:
314; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
315; SI-NEXT:    s_mov_b32 s3, 0xf000
316; SI-NEXT:    v_mov_b32_e32 v1, 0
317; SI-NEXT:    s_mov_b32 s6, 0
318; SI-NEXT:    s_mov_b32 s7, s3
319; SI-NEXT:    s_waitcnt lgkmcnt(0)
320; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
321; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
322; SI-NEXT:    s_mov_b32 s2, -1
323; SI-NEXT:    s_waitcnt vmcnt(0)
324; SI-NEXT:    v_ffbh_u32_e32 v0, v0
325; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
326; SI-NEXT:    s_waitcnt lgkmcnt(0)
327; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
328; SI-NEXT:    s_endpgm
329;
330; VI-LABEL: v_ctlz_zero_undef_i8:
331; VI:       ; %bb.0:
332; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
333; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
334; VI-NEXT:    s_waitcnt lgkmcnt(0)
335; VI-NEXT:    v_mov_b32_e32 v1, s3
336; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
337; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
338; VI-NEXT:    flat_load_ubyte v0, v[0:1]
339; VI-NEXT:    s_waitcnt vmcnt(0)
340; VI-NEXT:    v_ffbh_u32_e32 v0, v0
341; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
342; VI-NEXT:    v_add_u16_e32 v2, -8, v0
343; VI-NEXT:    v_mov_b32_e32 v0, s0
344; VI-NEXT:    v_mov_b32_e32 v1, s1
345; VI-NEXT:    flat_store_byte v[0:1], v2
346; VI-NEXT:    s_endpgm
347;
348; EG-LABEL: v_ctlz_zero_undef_i8:
349; EG:       ; %bb.0:
350; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
351; EG-NEXT:    TEX 0 @6
352; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
353; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
354; EG-NEXT:    CF_END
355; EG-NEXT:    PAD
356; EG-NEXT:    Fetch clause starting at 6:
357; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
358; EG-NEXT:    ALU clause starting at 8:
359; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
360; EG-NEXT:    ALU clause starting at 9:
361; EG-NEXT:     FFBH_UINT T0.W, T0.X,
362; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
363; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
364; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
365; EG-NEXT:    -24(nan), 0(0.000000e+00)
366; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
367; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
368; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
369; EG-NEXT:     LSHL T0.X, PV.W, PS,
370; EG-NEXT:     LSHL * T0.W, literal.x, PS,
371; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
372; EG-NEXT:     MOV T0.Y, 0.0,
373; EG-NEXT:     MOV * T0.Z, 0.0,
374; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
375; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
376;
377; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
378; GFX9-GISEL:       ; %bb.0:
379; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
380; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
381; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
382; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
384; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
385; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
386; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
387; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
388; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
389; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
390; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
391; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
392; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
393; GFX9-GISEL-NEXT:    s_endpgm
394  %tid = call i32 @llvm.amdgcn.workitem.id.x()
395  %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
396  %val = load i8, i8 addrspace(1)* %in.gep
397  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
398  store i8 %ctlz, i8 addrspace(1)* %out
399  ret void
400}
401
; Scalar i64 case: %val is a kernel argument placed after an unnamed [8 x i32]
; argument. With the zero-is-undefined flag (i1 true), the SI and tonga checks
; expect two 32-bit s_flbit_i32_b32 counts combined as
; min(count(lo) + 32, count(hi)) with a zero high result dword; r600 expects
; the equivalent FFBH_UINT pair selected by CNDE_INT on the high word; the
; GlobalISel run expects a single s_flbit_i32_b64 followed by s_bfe_u64.
402define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
403; SI-LABEL: s_ctlz_zero_undef_i64:
404; SI:       ; %bb.0:
405; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
406; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
407; SI-NEXT:    s_mov_b32 s3, 0xf000
408; SI-NEXT:    s_mov_b32 s2, -1
409; SI-NEXT:    s_waitcnt lgkmcnt(0)
410; SI-NEXT:    s_flbit_i32_b32 s4, s4
411; SI-NEXT:    s_flbit_i32_b32 s5, s5
412; SI-NEXT:    s_add_i32 s4, s4, 32
413; SI-NEXT:    s_min_u32 s4, s4, s5
414; SI-NEXT:    v_mov_b32_e32 v1, 0
415; SI-NEXT:    v_mov_b32_e32 v0, s4
416; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
417; SI-NEXT:    s_endpgm
418;
419; VI-LABEL: s_ctlz_zero_undef_i64:
420; VI:       ; %bb.0:
421; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
422; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
423; VI-NEXT:    v_mov_b32_e32 v1, 0
424; VI-NEXT:    s_waitcnt lgkmcnt(0)
425; VI-NEXT:    s_flbit_i32_b32 s2, s2
426; VI-NEXT:    s_flbit_i32_b32 s3, s3
427; VI-NEXT:    s_add_i32 s2, s2, 32
428; VI-NEXT:    s_min_u32 s2, s2, s3
429; VI-NEXT:    v_mov_b32_e32 v3, s1
430; VI-NEXT:    v_mov_b32_e32 v0, s2
431; VI-NEXT:    v_mov_b32_e32 v2, s0
432; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
433; VI-NEXT:    s_endpgm
434;
435; EG-LABEL: s_ctlz_zero_undef_i64:
436; EG:       ; %bb.0:
437; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
438; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
439; EG-NEXT:    CF_END
440; EG-NEXT:    PAD
441; EG-NEXT:    ALU clause starting at 4:
442; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
443; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
444; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
445; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
446; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
447; EG-NEXT:     MOV T0.Y, 0.0,
448; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
449; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
450;
451; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
452; GFX9-GISEL:       ; %bb.0:
453; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
454; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
455; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
456; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
458; GFX9-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
459; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
460; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
461; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
462; GFX9-GISEL-NEXT:    s_endpgm
463  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
464  store i64 %ctlz, i64 addrspace(1)* %out
465  ret void
466}
467
; Scalar i64 case whose result is truncated to i32 before the store. The
; counting sequence expected by the checks matches the untruncated scalar i64
; pattern (two 32-bit counts combined with +32 and min/select, or a single
; s_flbit_i32_b64 on the GlobalISel run), but only one dword is stored and no
; high-half zero is materialized.
468define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
469; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
470; SI:       ; %bb.0:
471; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
472; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
473; SI-NEXT:    s_mov_b32 s3, 0xf000
474; SI-NEXT:    s_waitcnt lgkmcnt(0)
475; SI-NEXT:    s_flbit_i32_b32 s2, s4
476; SI-NEXT:    s_flbit_i32_b32 s4, s5
477; SI-NEXT:    s_add_i32 s2, s2, 32
478; SI-NEXT:    s_min_u32 s4, s2, s4
479; SI-NEXT:    s_mov_b32 s2, -1
480; SI-NEXT:    v_mov_b32_e32 v0, s4
481; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
482; SI-NEXT:    s_endpgm
483;
484; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
485; VI:       ; %bb.0:
486; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
487; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
488; VI-NEXT:    s_waitcnt lgkmcnt(0)
489; VI-NEXT:    s_flbit_i32_b32 s2, s2
490; VI-NEXT:    s_flbit_i32_b32 s3, s3
491; VI-NEXT:    s_add_i32 s2, s2, 32
492; VI-NEXT:    s_min_u32 s2, s2, s3
493; VI-NEXT:    v_mov_b32_e32 v0, s0
494; VI-NEXT:    v_mov_b32_e32 v1, s1
495; VI-NEXT:    v_mov_b32_e32 v2, s2
496; VI-NEXT:    flat_store_dword v[0:1], v2
497; VI-NEXT:    s_endpgm
498;
499; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
500; EG:       ; %bb.0:
501; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
502; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
503; EG-NEXT:    CF_END
504; EG-NEXT:    PAD
505; EG-NEXT:    ALU clause starting at 4:
506; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
507; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
508; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
509; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
510; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
511; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
512; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
513;
514; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
515; GFX9-GISEL:       ; %bb.0:
516; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
517; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
518; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
519; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
520; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[2:3]
521; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
522; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
523; GFX9-GISEL-NEXT:    s_endpgm
524  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
525  %trunc = trunc i64 %ctlz to i32
526  store i32 %trunc, i32 addrspace(1)* %out
527  ret void
528}
529
; Per-workitem i64 case: both the load from %in and the store to %out are
; indexed by the workitem id. With the zero-is-undefined flag (i1 true), the
; amdgcn checks expect v_ffbh_u32 on each 32-bit half combined as
; min(count(lo) + 32, count(hi)) with a zero high result dword; r600 expects
; the FFBH_UINT pair selected by CNDE_INT on the high word.
530define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
531; SI-LABEL: v_ctlz_zero_undef_i64:
532; SI:       ; %bb.0:
533; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
534; SI-NEXT:    s_mov_b32 s7, 0xf000
535; SI-NEXT:    s_mov_b32 s6, 0
536; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
537; SI-NEXT:    v_mov_b32_e32 v1, 0
538; SI-NEXT:    s_waitcnt lgkmcnt(0)
539; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
540; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
541; SI-NEXT:    s_waitcnt vmcnt(0)
542; SI-NEXT:    v_ffbh_u32_e32 v2, v2
543; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
544; SI-NEXT:    v_ffbh_u32_e32 v3, v3
545; SI-NEXT:    v_min_u32_e32 v2, v2, v3
546; SI-NEXT:    v_mov_b32_e32 v3, v1
547; SI-NEXT:    s_waitcnt lgkmcnt(0)
548; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
549; SI-NEXT:    s_endpgm
550;
551; VI-LABEL: v_ctlz_zero_undef_i64:
552; VI:       ; %bb.0:
553; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
554; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
555; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
556; VI-NEXT:    v_mov_b32_e32 v2, 0
557; VI-NEXT:    s_waitcnt lgkmcnt(0)
558; VI-NEXT:    v_mov_b32_e32 v1, s3
559; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
560; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
561; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
562; VI-NEXT:    v_mov_b32_e32 v4, s1
563; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
564; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
565; VI-NEXT:    s_waitcnt vmcnt(0)
566; VI-NEXT:    v_ffbh_u32_e32 v0, v0
567; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
568; VI-NEXT:    v_ffbh_u32_e32 v1, v1
569; VI-NEXT:    v_min_u32_e32 v1, v0, v1
570; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
571; VI-NEXT:    s_endpgm
572;
573; EG-LABEL: v_ctlz_zero_undef_i64:
574; EG:       ; %bb.0:
575; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
576; EG-NEXT:    TEX 0 @6
577; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
578; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
579; EG-NEXT:    CF_END
580; EG-NEXT:    PAD
581; EG-NEXT:    Fetch clause starting at 6:
582; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
583; EG-NEXT:    ALU clause starting at 8:
584; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
585; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
586; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
587; EG-NEXT:    ALU clause starting at 11:
588; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
589; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
590; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
591; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
592; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
593; EG-NEXT:     MOV T0.Y, 0.0,
594; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
595; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
596; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
597;
598; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
599; GFX9-GISEL:       ; %bb.0:
600; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
601; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
602; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
603; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
604; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
605; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
606; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
607; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
608; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
609; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
610; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
611; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
612; GFX9-GISEL-NEXT:    s_endpgm
613  %tid = call i32 @llvm.amdgcn.workitem.id.x()
614  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
615  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
616  %val = load i64, i64 addrspace(1)* %in.gep
617  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
618  store i64 %ctlz, i64 addrspace(1)* %out.gep
619  ret void
620}
621
; Per-workitem i64 case whose result is truncated to i32 before the store.
; The input is indexed as i64 (workitem id shifted by 3 in the checks) and the
; output as i32 (shifted by 2). The counting sequence expected by the checks
; is the same two-half pattern as the untruncated per-workitem i64 test, but
; only one dword is stored and no high-half zero is materialized.
622define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
623; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
624; SI:       ; %bb.0:
625; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
626; SI-NEXT:    s_mov_b32 s7, 0xf000
627; SI-NEXT:    s_mov_b32 s6, 0
628; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
629; SI-NEXT:    v_mov_b32_e32 v2, 0
630; SI-NEXT:    s_waitcnt lgkmcnt(0)
631; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
632; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
633; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
634; SI-NEXT:    s_waitcnt vmcnt(0)
635; SI-NEXT:    v_ffbh_u32_e32 v0, v3
636; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
637; SI-NEXT:    v_ffbh_u32_e32 v3, v4
638; SI-NEXT:    v_min_u32_e32 v0, v0, v3
639; SI-NEXT:    s_waitcnt lgkmcnt(0)
640; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
641; SI-NEXT:    s_endpgm
642;
643; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
644; VI:       ; %bb.0:
645; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
646; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
647; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
648; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
649; VI-NEXT:    s_waitcnt lgkmcnt(0)
650; VI-NEXT:    v_mov_b32_e32 v2, s3
651; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
652; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
653; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
654; VI-NEXT:    v_mov_b32_e32 v4, s1
655; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
656; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
657; VI-NEXT:    s_waitcnt vmcnt(0)
658; VI-NEXT:    v_ffbh_u32_e32 v0, v1
659; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
660; VI-NEXT:    v_ffbh_u32_e32 v1, v2
661; VI-NEXT:    v_min_u32_e32 v0, v0, v1
662; VI-NEXT:    flat_store_dword v[3:4], v0
663; VI-NEXT:    s_endpgm
664;
665; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
666; EG:       ; %bb.0:
667; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
668; EG-NEXT:    TEX 0 @6
669; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
670; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
671; EG-NEXT:    CF_END
672; EG-NEXT:    PAD
673; EG-NEXT:    Fetch clause starting at 6:
674; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
675; EG-NEXT:    ALU clause starting at 8:
676; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
677; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
678; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
679; EG-NEXT:    ALU clause starting at 11:
680; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
681; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
682; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
683; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
684; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
685; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
686; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
687; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
688; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
689;
690; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
691; GFX9-GISEL:       ; %bb.0:
692; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
693; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
694; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
695; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
696; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
698; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
699; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
700; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
701; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
702; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
703; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
704; GFX9-GISEL-NEXT:    s_endpgm
705  %tid = call i32 @llvm.amdgcn.workitem.id.x()
706  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
707  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
708  %val = load i64, i64 addrspace(1)* %in.gep
709  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
710  %trunc = trunc i64 %ctlz to i32
711  store i32 %trunc, i32 addrspace(1)* %out.gep
712  ret void
713}
714
; Pattern under test: select(%val == 0, -1, ctlz(%val, zero-is-undefined)).
; The SI and tonga checks expect only v_ffbh_u32 with no compare or select,
; i.e. the select is folded into the instruction. The r600 run and the
; GlobalISel run still expect an explicit select (CNDE_INT with literal -1,
; and v_cmp_eq_u32 + v_cndmask_b32 respectively).
; NOTE(review): the fold presumably relies on v_ffbh_u32 itself producing -1
; for a zero input - confirm against the ISA manual.
715define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
716; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
717; SI:       ; %bb.0:
718; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
719; SI-NEXT:    s_mov_b32 s3, 0xf000
720; SI-NEXT:    s_mov_b32 s6, 0
721; SI-NEXT:    s_mov_b32 s7, s3
722; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
723; SI-NEXT:    v_mov_b32_e32 v1, 0
724; SI-NEXT:    s_waitcnt lgkmcnt(0)
725; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
726; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
727; SI-NEXT:    s_mov_b32 s2, -1
728; SI-NEXT:    s_waitcnt vmcnt(0)
729; SI-NEXT:    v_ffbh_u32_e32 v0, v0
730; SI-NEXT:    s_waitcnt lgkmcnt(0)
731; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
732; SI-NEXT:    s_endpgm
733;
734; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
735; VI:       ; %bb.0:
736; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
737; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
738; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
739; VI-NEXT:    s_waitcnt lgkmcnt(0)
740; VI-NEXT:    v_mov_b32_e32 v1, s3
741; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
742; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
743; VI-NEXT:    flat_load_dword v0, v[0:1]
744; VI-NEXT:    s_waitcnt vmcnt(0)
745; VI-NEXT:    v_ffbh_u32_e32 v2, v0
746; VI-NEXT:    v_mov_b32_e32 v0, s0
747; VI-NEXT:    v_mov_b32_e32 v1, s1
748; VI-NEXT:    flat_store_dword v[0:1], v2
749; VI-NEXT:    s_endpgm
750;
751; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
752; EG:       ; %bb.0:
753; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
754; EG-NEXT:    TEX 0 @6
755; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
756; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
757; EG-NEXT:    CF_END
758; EG-NEXT:    PAD
759; EG-NEXT:    Fetch clause starting at 6:
760; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
761; EG-NEXT:    ALU clause starting at 8:
762; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
763; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
764; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
765; EG-NEXT:    ALU clause starting at 11:
766; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
767; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
768; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
769; EG-NEXT:    -1(nan), 2(2.802597e-45)
770;
771; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
772; GFX9-GISEL:       ; %bb.0:
773; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
774; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
775; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
776; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
778; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
779; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
780; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
781; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
782; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
783; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
784; GFX9-GISEL-NEXT:    s_endpgm
785  %tid = call i32 @llvm.amdgcn.workitem.id.x()
786  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
787  %val = load i32, i32 addrspace(1)* %in.gep
788  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
789  %cmp = icmp eq i32 %val, 0
790  %sel = select i1 %cmp, i32 -1, i32 %ctlz
791  store i32 %sel, i32 addrspace(1)* %out
792  ret void
793}
794
; Inverted-predicate form of the -1 pattern, written as
; "select (%val != 0), %ctlz, -1". As with the "eq" form, the SI and VI
; checks below contain only v_ffbh_u32 with no compare or select, while the
; EG and GFX9-GISEL checks keep a conditional select (CNDE_INT / v_cndmask).
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid]; the result is written
  ; to %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  ; Pattern under test: a nonzero input uses the count, a zero input gives -1.
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
874
; i8 variant of the "select (%val == 0), -1, %ctlz" pattern, using a byte
; load and a byte store.
; NOTE(review): the SI and VI checks below pass the byte loaded by
; buffer_load_ubyte / flat_load_ubyte straight to v_ffbh_u32, with no 24-bit
; adjustment and no select, whereas the GFX9-GISEL checks subtract 24
; (v_subrev_u32) and keep the compare/select. Confirm that the SI and VI
; expectations are the intended result for an 8-bit leading-zero count.
define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBH_UINT T0.W, T0.X,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads one i8 from %valptr[tid]; the result is written to
  ; %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
  ; Pattern under test: a zero input is mapped to -1, otherwise use the count.
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %ctlz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}
964
; Same "select (%val == 0), -1, %ctlz" pattern, but %cmp has a second user (a
; volatile i1 store), so the compare itself cannot disappear. In the SI and VI
; checks below the v_ffbh_u32 result is stored directly, and v_cmp_eq_u32 plus
; v_cndmask (0 / 1) only produce the separately stored i1. The EG and
; GFX9-GISEL checks additionally keep the -1 select.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v3, v2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 11, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
; EG-NEXT:    MEM_RAT MSKOR T1.XW, T2.X
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     SETE_INT * T0.W, T0.X, 0.0,
; EG-NEXT:     AND_INT T1.X, PV.W, 1,
; EG-NEXT:     MOV * T1.W, literal.x,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T1.Y, 0.0,
; EG-NEXT:     MOV * T1.Z, 0.0,
; EG-NEXT:     MOV T2.X, literal.x,
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid].
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  ; Both stores are volatile, so neither one can be dropped or merged.
  store volatile i32 %sel, i32 addrspace(1)* %out
  ; Second use of %cmp. The destination pointer is undef; the store presumably
  ; exists only to give the compare an extra user.
  store volatile i1 %cmp, i1 addrspace(1)* undef
  ret void
}
1067
; Selected on wrong constant: a zero input selects 0 instead of -1. The checks
; for every target below keep a conditional select (v_cndmask / CNDE_INT)
; after the ffbh op, i.e. the select is not folded away.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, 0.0, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid]; the result is written
  ; to %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  ; The zero case yields 0 here, not -1.
  %sel = select i1 %cmp, i32 0, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1152
; Selected on wrong constant: inverted-predicate form in which a zero input
; selects 0 instead of -1. The checks for every target below keep a
; conditional select (v_cndmask / CNDE_INT) after the ffbh op.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, 0.0, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid]; the result is written
  ; to %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  ; The zero case yields 0 here, not -1.
  %sel = select i1 %cmp, i32 %ctlz, i32 0
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1237
; Compare on wrong constant: %val is compared against 1 rather than 0 (and
; the selected constant is 0). The checks for every target below keep both
; the compare against 1 and the conditional select after the ffbh op.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT T0.W, T0.X,
; EG-NEXT:     SETE_INT * T1.W, T0.X, 1,
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid]; the result is written
  ; to %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  ; The compare is against 1, not 0.
  %cmp = icmp eq i32 %val, 1
  %sel = select i1 %cmp, i32 0, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1323
; Compare on wrong constant: inverted-predicate form in which %val is compared
; against 1 rather than 0 (and the selected constant is 0). The checks for
; every target below keep both the compare against 1 and the conditional
; select after the ffbh op.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT T0.W, T0.X,
; EG-NEXT:     SETNE_INT * T1.W, T0.X, 1,
; EG-NEXT:     CNDE_INT T0.X, PS, 0.0, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  ; Each work-item loads its own i32 from %valptr[tid]; the result is written
  ; to %out itself (not indexed by tid).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  ; The compare is against 1, not 0.
  %cmp = icmp ne i32 %val, 1
  %sel = select i1 %cmp, i32 %ctlz, i32 0
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}
1409