1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
6
; Declarations of the intrinsics referenced by the kernels below: llvm.cttz at
; several scalar and vector widths, and the AMDGPU work-item id query that the
; v_* kernels use to index their loads.
; NOTE(review): no caller of llvm.cttz.i7 appears in the part of the file
; reviewed here; presumably it is used further down -- confirm.
7declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
8declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
9declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
10declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
11declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
12declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
13declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
14declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
15
; Scalar i32 case. %val arrives as a kernel argument, llvm.cttz.i32 is called
; with its i1 flag set to true (per the LangRef, the result is then not defined
; for a zero input), and the result is stored to %out.
; The assertions expect a bare s_ff1_i32_b32 on the amdgcn runs and FFBL_INT on
; the r600 run, with no compare or select guarding the zero case.
16define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
17; SI-LABEL: s_cttz_zero_undef_i32:
18; SI:       ; %bb.0:
19; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
20; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
21; SI-NEXT:    s_mov_b32 s3, 0xf000
22; SI-NEXT:    s_waitcnt lgkmcnt(0)
23; SI-NEXT:    s_ff1_i32_b32 s4, s2
24; SI-NEXT:    s_mov_b32 s2, -1
25; SI-NEXT:    v_mov_b32_e32 v0, s4
26; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
27; SI-NEXT:    s_endpgm
28;
29; VI-LABEL: s_cttz_zero_undef_i32:
30; VI:       ; %bb.0:
31; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
32; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_ff1_i32_b32 s2, s2
35; VI-NEXT:    v_mov_b32_e32 v0, s0
36; VI-NEXT:    v_mov_b32_e32 v1, s1
37; VI-NEXT:    v_mov_b32_e32 v2, s2
38; VI-NEXT:    flat_store_dword v[0:1], v2
39; VI-NEXT:    s_endpgm
40;
41; EG-LABEL: s_cttz_zero_undef_i32:
42; EG:       ; %bb.0:
43; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
44; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
45; EG-NEXT:    CF_END
46; EG-NEXT:    PAD
47; EG-NEXT:    ALU clause starting at 4:
48; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
49; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
50; EG-NEXT:     FFBL_INT * T1.X, KC0[2].Z,
51;
52; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32:
53; GFX9-GISEL:       ; %bb.0:
54; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
55; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
56; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
57; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
59; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
60; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
61; GFX9-GISEL-NEXT:    s_endpgm
62  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
63  store i32 %cttz, i32 addrspace(1)* %out, align 4
64  ret void
65}
66
; Per-work-item i32 case. Each work item loads the i32 at
; %valptr[workitem.id.x], applies llvm.cttz.i32 with the i1 flag set to true,
; and stores the result to %out (the store address is not indexed by the
; work-item id).
; The assertions expect a single v_ffbl_b32 on the amdgcn runs and FFBL_INT on
; the r600 run, with no zero-input handling.
67define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
68; SI-LABEL: v_cttz_zero_undef_i32:
69; SI:       ; %bb.0:
70; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
71; SI-NEXT:    s_mov_b32 s3, 0xf000
72; SI-NEXT:    s_mov_b32 s6, 0
73; SI-NEXT:    s_mov_b32 s7, s3
74; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
75; SI-NEXT:    v_mov_b32_e32 v1, 0
76; SI-NEXT:    s_waitcnt lgkmcnt(0)
77; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
78; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
79; SI-NEXT:    s_mov_b32 s2, -1
80; SI-NEXT:    s_waitcnt vmcnt(0)
81; SI-NEXT:    v_ffbl_b32_e32 v0, v0
82; SI-NEXT:    s_waitcnt lgkmcnt(0)
83; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
84; SI-NEXT:    s_endpgm
85;
86; VI-LABEL: v_cttz_zero_undef_i32:
87; VI:       ; %bb.0:
88; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
89; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
90; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
91; VI-NEXT:    s_waitcnt lgkmcnt(0)
92; VI-NEXT:    v_mov_b32_e32 v1, s3
93; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
94; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
95; VI-NEXT:    flat_load_dword v0, v[0:1]
96; VI-NEXT:    s_waitcnt vmcnt(0)
97; VI-NEXT:    v_ffbl_b32_e32 v2, v0
98; VI-NEXT:    v_mov_b32_e32 v0, s0
99; VI-NEXT:    v_mov_b32_e32 v1, s1
100; VI-NEXT:    flat_store_dword v[0:1], v2
101; VI-NEXT:    s_endpgm
102;
103; EG-LABEL: v_cttz_zero_undef_i32:
104; EG:       ; %bb.0:
105; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
106; EG-NEXT:    TEX 0 @6
107; EG-NEXT:    ALU 2, @11, KC0[CB0:0-32], KC1[]
108; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
109; EG-NEXT:    CF_END
110; EG-NEXT:    PAD
111; EG-NEXT:    Fetch clause starting at 6:
112; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
113; EG-NEXT:    ALU clause starting at 8:
114; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
115; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
116; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
117; EG-NEXT:    ALU clause starting at 11:
118; EG-NEXT:     FFBL_INT T0.X, T0.X,
119; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
120; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
121;
122; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
123; GFX9-GISEL:       ; %bb.0:
124; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
125; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
126; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
127; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
128; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
130; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
131; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
132; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
133; GFX9-GISEL-NEXT:    s_endpgm
134  %tid = call i32 @llvm.amdgcn.workitem.id.x()
135  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
136  %val = load i32, i32 addrspace(1)* %in.gep, align 4
137  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
138  store i32 %cttz, i32 addrspace(1)* %out, align 4
139  ret void
140}
141
; Per-work-item <2 x i32> case. Each work item loads the vector at
; %valptr[workitem.id.x], applies llvm.cttz.v2i32 with the i1 flag set to true,
; and stores the whole vector to %out.
; The assertions expect one find-first-bit instruction per element (two
; v_ffbl_b32 on the amdgcn runs, two FFBL_INT on the r600 run) and a single
; 64-bit load and store.
142define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
143; SI-LABEL: v_cttz_zero_undef_v2i32:
144; SI:       ; %bb.0:
145; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
146; SI-NEXT:    s_mov_b32 s3, 0xf000
147; SI-NEXT:    s_mov_b32 s6, 0
148; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
149; SI-NEXT:    v_mov_b32_e32 v1, 0
150; SI-NEXT:    s_mov_b32 s7, s3
151; SI-NEXT:    s_waitcnt lgkmcnt(0)
152; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
153; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
154; SI-NEXT:    s_mov_b32 s2, -1
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    v_ffbl_b32_e32 v1, v1
157; SI-NEXT:    v_ffbl_b32_e32 v0, v0
158; SI-NEXT:    s_waitcnt lgkmcnt(0)
159; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
160; SI-NEXT:    s_endpgm
161;
162; VI-LABEL: v_cttz_zero_undef_v2i32:
163; VI:       ; %bb.0:
164; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
165; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
166; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    v_mov_b32_e32 v1, s3
169; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
170; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
171; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
172; VI-NEXT:    v_mov_b32_e32 v3, s1
173; VI-NEXT:    v_mov_b32_e32 v2, s0
174; VI-NEXT:    s_waitcnt vmcnt(0)
175; VI-NEXT:    v_ffbl_b32_e32 v1, v1
176; VI-NEXT:    v_ffbl_b32_e32 v0, v0
177; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
178; VI-NEXT:    s_endpgm
179;
180; EG-LABEL: v_cttz_zero_undef_v2i32:
181; EG:       ; %bb.0:
182; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
183; EG-NEXT:    TEX 0 @6
184; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
185; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
186; EG-NEXT:    CF_END
187; EG-NEXT:    PAD
188; EG-NEXT:    Fetch clause starting at 6:
189; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
190; EG-NEXT:    ALU clause starting at 8:
191; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
192; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
193; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
194; EG-NEXT:    ALU clause starting at 11:
195; EG-NEXT:     FFBL_INT * T0.Y, T0.Y,
196; EG-NEXT:     FFBL_INT T0.X, T0.X,
197; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
198; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
199;
200; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
201; GFX9-GISEL:       ; %bb.0:
202; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
203; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
204; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
205; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
206; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
208; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
209; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
210; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
211; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
212; GFX9-GISEL-NEXT:    s_endpgm
213  %tid = call i32 @llvm.amdgcn.workitem.id.x()
214  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
215  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
216  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
217  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
218  ret void
219}
220
; Per-work-item <4 x i32> case. Each work item loads the vector at
; %valptr[workitem.id.x], applies llvm.cttz.v4i32 with the i1 flag set to true,
; and stores the whole vector to %out.
; The assertions expect one find-first-bit instruction per element (four
; v_ffbl_b32 on the amdgcn runs, four FFBL_INT on the r600 run) and a single
; 128-bit load and store.
221define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
222; SI-LABEL: v_cttz_zero_undef_v4i32:
223; SI:       ; %bb.0:
224; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
225; SI-NEXT:    s_mov_b32 s3, 0xf000
226; SI-NEXT:    s_mov_b32 s6, 0
227; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
228; SI-NEXT:    v_mov_b32_e32 v1, 0
229; SI-NEXT:    s_mov_b32 s7, s3
230; SI-NEXT:    s_waitcnt lgkmcnt(0)
231; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
232; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
233; SI-NEXT:    s_mov_b32 s2, -1
234; SI-NEXT:    s_waitcnt vmcnt(0)
235; SI-NEXT:    v_ffbl_b32_e32 v3, v3
236; SI-NEXT:    v_ffbl_b32_e32 v2, v2
237; SI-NEXT:    v_ffbl_b32_e32 v1, v1
238; SI-NEXT:    v_ffbl_b32_e32 v0, v0
239; SI-NEXT:    s_waitcnt lgkmcnt(0)
240; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
241; SI-NEXT:    s_endpgm
242;
243; VI-LABEL: v_cttz_zero_undef_v4i32:
244; VI:       ; %bb.0:
245; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
246; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
247; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
248; VI-NEXT:    s_waitcnt lgkmcnt(0)
249; VI-NEXT:    v_mov_b32_e32 v1, s3
250; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
251; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
252; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
253; VI-NEXT:    v_mov_b32_e32 v5, s1
254; VI-NEXT:    v_mov_b32_e32 v4, s0
255; VI-NEXT:    s_waitcnt vmcnt(0)
256; VI-NEXT:    v_ffbl_b32_e32 v3, v3
257; VI-NEXT:    v_ffbl_b32_e32 v2, v2
258; VI-NEXT:    v_ffbl_b32_e32 v1, v1
259; VI-NEXT:    v_ffbl_b32_e32 v0, v0
260; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
261; VI-NEXT:    s_endpgm
262;
263; EG-LABEL: v_cttz_zero_undef_v4i32:
264; EG:       ; %bb.0:
265; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
266; EG-NEXT:    TEX 0 @6
267; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
268; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
269; EG-NEXT:    CF_END
270; EG-NEXT:    PAD
271; EG-NEXT:    Fetch clause starting at 6:
272; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
273; EG-NEXT:    ALU clause starting at 8:
274; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
275; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
276; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
277; EG-NEXT:    ALU clause starting at 11:
278; EG-NEXT:     FFBL_INT * T0.W, T0.W,
279; EG-NEXT:     FFBL_INT * T0.Z, T0.Z,
280; EG-NEXT:     FFBL_INT * T0.Y, T0.Y,
281; EG-NEXT:     FFBL_INT T0.X, T0.X,
282; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
283; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
284;
285; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
286; GFX9-GISEL:       ; %bb.0:
287; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
288; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
289; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
290; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
291; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
293; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
294; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
295; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
296; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
297; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
298; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
299; GFX9-GISEL-NEXT:    s_endpgm
300  %tid = call i32 @llvm.amdgcn.workitem.id.x()
301  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
302  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
303  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
304  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
305  ret void
306}
307
; Scalar i8 case with a select on the zero input. %val is a kernel argument;
; the IR computes cttz (i1 flag true), compares %val against 0, and selects
; between the cttz result and the constant 32 into %ret.
; NOTE(review): the store writes %cttz, not %ret, so the icmp and select are
; dead. The assertions accordingly contain only the find-first-bit instruction
; and no compare or select. The v_cttz_zero_undef_i8_with_select kernel in this
; file stores %ret instead. If the select was meant to be tested here, change
; the stored value and regenerate the assertions with
; utils/update_llc_test_checks.py.
308define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
309; SI-LABEL: s_cttz_zero_undef_i8_with_select:
310; SI:       ; %bb.0:
311; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
312; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
313; SI-NEXT:    s_mov_b32 s3, 0xf000
314; SI-NEXT:    s_waitcnt lgkmcnt(0)
315; SI-NEXT:    s_ff1_i32_b32 s4, s2
316; SI-NEXT:    s_mov_b32 s2, -1
317; SI-NEXT:    v_mov_b32_e32 v0, s4
318; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
319; SI-NEXT:    s_endpgm
320;
321; VI-LABEL: s_cttz_zero_undef_i8_with_select:
322; VI:       ; %bb.0:
323; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
324; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
325; VI-NEXT:    s_waitcnt lgkmcnt(0)
326; VI-NEXT:    s_ff1_i32_b32 s2, s2
327; VI-NEXT:    v_mov_b32_e32 v0, s0
328; VI-NEXT:    v_mov_b32_e32 v1, s1
329; VI-NEXT:    v_mov_b32_e32 v2, s2
330; VI-NEXT:    flat_store_byte v[0:1], v2
331; VI-NEXT:    s_endpgm
332;
333; EG-LABEL: s_cttz_zero_undef_i8_with_select:
334; EG:       ; %bb.0:
335; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
336; EG-NEXT:    TEX 0 @6
337; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
338; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
339; EG-NEXT:    CF_END
340; EG-NEXT:    PAD
341; EG-NEXT:    Fetch clause starting at 6:
342; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
343; EG-NEXT:    ALU clause starting at 8:
344; EG-NEXT:     MOV * T0.X, 0.0,
345; EG-NEXT:    ALU clause starting at 9:
346; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
347; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
348; EG-NEXT:     FFBL_INT T0.W, PV.W,
349; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
350; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
351; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
352; EG-NEXT:     LSHL * T1.W, PS, literal.y,
353; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
354; EG-NEXT:     LSHL T0.X, PV.W, PS,
355; EG-NEXT:     LSHL * T0.W, literal.x, PS,
356; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
357; EG-NEXT:     MOV T0.Y, 0.0,
358; EG-NEXT:     MOV * T0.Z, 0.0,
359; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
360; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
361;
362; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select:
363; GFX9-GISEL:       ; %bb.0:
364; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
365; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
366; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
367; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
369; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
370; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
371; GFX9-GISEL-NEXT:    s_endpgm
372  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
373  %cttz_ret = icmp ne i8 %val, 0
374  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
375  store i8 %cttz, i8 addrspace(1)* %out, align 4
376  ret void
377}
378
; Scalar i16 case with a select on the zero input. %val is a kernel argument;
; the IR computes cttz (i1 flag true), compares %val against 0, and selects
; between the cttz result and the constant 32 into %ret.
; NOTE(review): the store writes %cttz, not %ret, so the icmp and select are
; dead. The assertions accordingly contain only the find-first-bit instruction
; and no compare or select. The v_cttz_zero_undef_i16_with_select kernel in
; this file stores %ret instead. If the select was meant to be tested here,
; change the stored value and regenerate the assertions with
; utils/update_llc_test_checks.py.
379define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
380; SI-LABEL: s_cttz_zero_undef_i16_with_select:
381; SI:       ; %bb.0:
382; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
383; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
384; SI-NEXT:    s_mov_b32 s3, 0xf000
385; SI-NEXT:    s_waitcnt lgkmcnt(0)
386; SI-NEXT:    s_ff1_i32_b32 s4, s2
387; SI-NEXT:    s_mov_b32 s2, -1
388; SI-NEXT:    v_mov_b32_e32 v0, s4
389; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
390; SI-NEXT:    s_endpgm
391;
392; VI-LABEL: s_cttz_zero_undef_i16_with_select:
393; VI:       ; %bb.0:
394; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
395; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
396; VI-NEXT:    s_waitcnt lgkmcnt(0)
397; VI-NEXT:    s_ff1_i32_b32 s2, s2
398; VI-NEXT:    v_mov_b32_e32 v0, s0
399; VI-NEXT:    v_mov_b32_e32 v1, s1
400; VI-NEXT:    v_mov_b32_e32 v2, s2
401; VI-NEXT:    flat_store_short v[0:1], v2
402; VI-NEXT:    s_endpgm
403;
404; EG-LABEL: s_cttz_zero_undef_i16_with_select:
405; EG:       ; %bb.0:
406; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
407; EG-NEXT:    TEX 0 @6
408; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
409; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
410; EG-NEXT:    CF_END
411; EG-NEXT:    PAD
412; EG-NEXT:    Fetch clause starting at 6:
413; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
414; EG-NEXT:    ALU clause starting at 8:
415; EG-NEXT:     MOV * T0.X, 0.0,
416; EG-NEXT:    ALU clause starting at 9:
417; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
418; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
419; EG-NEXT:     FFBL_INT T0.W, PV.W,
420; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
421; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
422; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
423; EG-NEXT:     LSHL * T1.W, PS, literal.y,
424; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
425; EG-NEXT:     LSHL T0.X, PV.W, PS,
426; EG-NEXT:     LSHL * T0.W, literal.x, PS,
427; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
428; EG-NEXT:     MOV T0.Y, 0.0,
429; EG-NEXT:     MOV * T0.Z, 0.0,
430; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
431; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
432;
433; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select:
434; GFX9-GISEL:       ; %bb.0:
435; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
436; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
437; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
438; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
440; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
441; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
442; GFX9-GISEL-NEXT:    s_endpgm
443  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
444  %cttz_ret = icmp ne i16 %val, 0
445  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
446  store i16 %cttz, i16 addrspace(1)* %out, align 4
447  ret void
448}
449
; Scalar i32 case with a select on the zero input. %val is a kernel argument;
; the IR computes cttz (i1 flag true), compares %val against 0, and selects
; between the cttz result and the constant 32 into %ret.
; NOTE(review): the store writes %cttz, not %ret, so the icmp and select are
; dead. The assertions are the same instruction sequences as for
; s_cttz_zero_undef_i32, with no compare, select or min. If the select was
; meant to be tested here, change the stored value and regenerate the
; assertions with utils/update_llc_test_checks.py.
450define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
451; SI-LABEL: s_cttz_zero_undef_i32_with_select:
452; SI:       ; %bb.0:
453; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
454; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
455; SI-NEXT:    s_mov_b32 s3, 0xf000
456; SI-NEXT:    s_waitcnt lgkmcnt(0)
457; SI-NEXT:    s_ff1_i32_b32 s4, s2
458; SI-NEXT:    s_mov_b32 s2, -1
459; SI-NEXT:    v_mov_b32_e32 v0, s4
460; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
461; SI-NEXT:    s_endpgm
462;
463; VI-LABEL: s_cttz_zero_undef_i32_with_select:
464; VI:       ; %bb.0:
465; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
466; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
467; VI-NEXT:    s_waitcnt lgkmcnt(0)
468; VI-NEXT:    s_ff1_i32_b32 s2, s2
469; VI-NEXT:    v_mov_b32_e32 v0, s0
470; VI-NEXT:    v_mov_b32_e32 v1, s1
471; VI-NEXT:    v_mov_b32_e32 v2, s2
472; VI-NEXT:    flat_store_dword v[0:1], v2
473; VI-NEXT:    s_endpgm
474;
475; EG-LABEL: s_cttz_zero_undef_i32_with_select:
476; EG:       ; %bb.0:
477; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
478; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
479; EG-NEXT:    CF_END
480; EG-NEXT:    PAD
481; EG-NEXT:    ALU clause starting at 4:
482; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
483; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
484; EG-NEXT:     FFBL_INT * T1.X, KC0[2].Z,
485;
486; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select:
487; GFX9-GISEL:       ; %bb.0:
488; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
489; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
490; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
491; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
493; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
494; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
495; GFX9-GISEL-NEXT:    s_endpgm
496  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
497  %cttz_ret = icmp ne i32 %val, 0
498  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
499  store i32 %cttz, i32 addrspace(1)* %out, align 4
500  ret void
501}
502
; Scalar i64 case with a select on the zero input. %val is a kernel argument;
; the IR computes cttz (i1 flag true), compares %val against 0, and selects
; between the cttz result and the constant 32 into %ret.
; The SelectionDAG amdgcn assertions expect the 64-bit count to be built from
; two 32-bit s_ff1_i32_b32 results (high half + 32, then an unsigned min); the
; GlobalISel run expects a single s_ff1_i32_b64.
; NOTE(review): the store writes %cttz, not %ret, so the icmp and select are
; dead and no compare or select appears in the amdgcn assertions. Also note the
; select's fallback is 32 even though the operand is 64 bits wide -- confirm
; that is intended. Any change to the IR requires regenerating the assertions
; with utils/update_llc_test_checks.py.
503define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
504; SI-LABEL: s_cttz_zero_undef_i64_with_select:
505; SI:       ; %bb.0:
506; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
507; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
508; SI-NEXT:    s_mov_b32 s3, 0xf000
509; SI-NEXT:    s_mov_b32 s2, -1
510; SI-NEXT:    s_waitcnt lgkmcnt(0)
511; SI-NEXT:    s_ff1_i32_b32 s5, s5
512; SI-NEXT:    s_ff1_i32_b32 s4, s4
513; SI-NEXT:    s_add_i32 s5, s5, 32
514; SI-NEXT:    s_min_u32 s4, s4, s5
515; SI-NEXT:    v_mov_b32_e32 v1, 0
516; SI-NEXT:    v_mov_b32_e32 v0, s4
517; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
518; SI-NEXT:    s_endpgm
519;
520; VI-LABEL: s_cttz_zero_undef_i64_with_select:
521; VI:       ; %bb.0:
522; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
523; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
524; VI-NEXT:    v_mov_b32_e32 v1, 0
525; VI-NEXT:    s_waitcnt lgkmcnt(0)
526; VI-NEXT:    s_ff1_i32_b32 s3, s3
527; VI-NEXT:    s_ff1_i32_b32 s2, s2
528; VI-NEXT:    s_add_i32 s3, s3, 32
529; VI-NEXT:    s_min_u32 s2, s2, s3
530; VI-NEXT:    v_mov_b32_e32 v3, s1
531; VI-NEXT:    v_mov_b32_e32 v0, s2
532; VI-NEXT:    v_mov_b32_e32 v2, s0
533; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
534; VI-NEXT:    s_endpgm
535;
536; EG-LABEL: s_cttz_zero_undef_i64_with_select:
537; EG:       ; %bb.0:
538; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
539; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
540; EG-NEXT:    CF_END
541; EG-NEXT:    PAD
542; EG-NEXT:    ALU clause starting at 4:
543; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
544; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
545; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
546; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
547; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
548; EG-NEXT:     MOV T0.Y, 0.0,
549; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
550; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
551;
552; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
553; GFX9-GISEL:       ; %bb.0:
554; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
555; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
556; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
557; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
559; GFX9-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
560; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
561; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
562; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
563; GFX9-GISEL-NEXT:    s_endpgm
564  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
565  %cttz_ret = icmp ne i64 %val, 0
566  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
567  store i64 %cttz, i64 addrspace(1)* %out, align 4
568  ret void
569}
570
; Loaded i8 case with a select on the zero input. The kernel loads an i8 from
; %arrayidx, computes cttz (i1 flag true), and stores %ret, which is the cttz
; result when the loaded value is non-zero and the constant 32 otherwise.
; Because %ret is what gets stored, the select is live: the amdgcn assertions
; expect v_ffbl_b32 followed by a compare against 0 and a v_cndmask_b32 that
; picks 32, and the r600 assertions expect FFBL_INT followed by CNDE_INT.
571define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
572; SI-LABEL: v_cttz_zero_undef_i8_with_select:
573; SI:       ; %bb.0:
574; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
575; SI-NEXT:    s_mov_b32 s3, 0xf000
576; SI-NEXT:    s_mov_b32 s2, -1
577; SI-NEXT:    s_mov_b32 s6, s2
578; SI-NEXT:    s_mov_b32 s7, s3
579; SI-NEXT:    s_waitcnt lgkmcnt(0)
580; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
581; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
582; SI-NEXT:    s_waitcnt vmcnt(0)
583; SI-NEXT:    v_ffbl_b32_e32 v1, v0
584; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
585; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
586; SI-NEXT:    s_waitcnt lgkmcnt(0)
587; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
588; SI-NEXT:    s_endpgm
589;
590; VI-LABEL: v_cttz_zero_undef_i8_with_select:
591; VI:       ; %bb.0:
592; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
593; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
594; VI-NEXT:    s_waitcnt lgkmcnt(0)
595; VI-NEXT:    v_mov_b32_e32 v0, s2
596; VI-NEXT:    v_mov_b32_e32 v1, s3
597; VI-NEXT:    flat_load_ubyte v0, v[0:1]
598; VI-NEXT:    s_waitcnt vmcnt(0)
599; VI-NEXT:    v_ffbl_b32_e32 v1, v0
600; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
601; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
602; VI-NEXT:    v_mov_b32_e32 v0, s0
603; VI-NEXT:    v_mov_b32_e32 v1, s1
604; VI-NEXT:    flat_store_byte v[0:1], v2
605; VI-NEXT:    s_endpgm
606;
607; EG-LABEL: v_cttz_zero_undef_i8_with_select:
608; EG:       ; %bb.0:
609; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
610; EG-NEXT:    TEX 0 @6
611; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
612; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
613; EG-NEXT:    CF_END
614; EG-NEXT:    PAD
615; EG-NEXT:    Fetch clause starting at 6:
616; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
617; EG-NEXT:    ALU clause starting at 8:
618; EG-NEXT:     MOV * T0.X, KC0[2].Z,
619; EG-NEXT:    ALU clause starting at 9:
620; EG-NEXT:     FFBL_INT T0.W, T0.X,
621; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
622; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
623; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
624; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
625; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
626; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
627; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
628; EG-NEXT:     LSHL T0.X, PV.W, PS,
629; EG-NEXT:     LSHL * T0.W, literal.x, PS,
630; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
631; EG-NEXT:     MOV T0.Y, 0.0,
632; EG-NEXT:     MOV * T0.Z, 0.0,
633; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
634; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
635;
636; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
637; GFX9-GISEL:       ; %bb.0:
638; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
639; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
640; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
641; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
642; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
643; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
644; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
645; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
646; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
647; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
648; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
649; GFX9-GISEL-NEXT:    s_endpgm
650  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
651  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
652  %cttz_ret = icmp ne i8 %val, 0
653  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
654  store i8 %ret, i8 addrspace(1)* %out, align 4
655  ret void
656}
657
; Loaded i16 case with a select on the zero input. The kernel loads an i16 from
; %arrayidx with align 1, computes cttz (i1 flag true), and stores %ret, which
; is the cttz result when the loaded value is non-zero and the constant 32
; otherwise.
; On the amdgcn runs the align-1 load is expected as two byte loads recombined
; with a shift and an or before the find-first-bit; the r600 run expects a
; single VTX_READ_16. Because %ret is what gets stored, the select is live and
; shows up as a compare plus v_cndmask_b32 (CNDE_INT on r600) picking 32.
658define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
659; SI-LABEL: v_cttz_zero_undef_i16_with_select:
660; SI:       ; %bb.0:
661; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
662; SI-NEXT:    s_mov_b32 s3, 0xf000
663; SI-NEXT:    s_mov_b32 s2, -1
664; SI-NEXT:    s_mov_b32 s6, s2
665; SI-NEXT:    s_mov_b32 s7, s3
666; SI-NEXT:    s_waitcnt lgkmcnt(0)
667; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
668; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0
669; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
670; SI-NEXT:    s_waitcnt vmcnt(1)
671; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
672; SI-NEXT:    s_waitcnt vmcnt(0)
673; SI-NEXT:    v_or_b32_e32 v0, v0, v1
674; SI-NEXT:    v_ffbl_b32_e32 v1, v0
675; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
676; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
677; SI-NEXT:    s_waitcnt lgkmcnt(0)
678; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
679; SI-NEXT:    s_endpgm
680;
681; VI-LABEL: v_cttz_zero_undef_i16_with_select:
682; VI:       ; %bb.0:
683; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
684; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
685; VI-NEXT:    s_waitcnt lgkmcnt(0)
686; VI-NEXT:    s_add_u32 s4, s2, 1
687; VI-NEXT:    s_addc_u32 s5, s3, 0
688; VI-NEXT:    v_mov_b32_e32 v2, s4
689; VI-NEXT:    v_mov_b32_e32 v0, s2
690; VI-NEXT:    v_mov_b32_e32 v3, s5
691; VI-NEXT:    v_mov_b32_e32 v1, s3
692; VI-NEXT:    flat_load_ubyte v2, v[2:3]
693; VI-NEXT:    flat_load_ubyte v0, v[0:1]
694; VI-NEXT:    s_waitcnt vmcnt(1)
695; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v2
696; VI-NEXT:    s_waitcnt vmcnt(0)
697; VI-NEXT:    v_or_b32_e32 v0, v1, v0
698; VI-NEXT:    v_ffbl_b32_e32 v1, v0
699; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
700; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
701; VI-NEXT:    v_mov_b32_e32 v0, s0
702; VI-NEXT:    v_mov_b32_e32 v1, s1
703; VI-NEXT:    flat_store_short v[0:1], v2
704; VI-NEXT:    s_endpgm
705;
706; EG-LABEL: v_cttz_zero_undef_i16_with_select:
707; EG:       ; %bb.0:
708; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
709; EG-NEXT:    TEX 0 @6
710; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
711; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
712; EG-NEXT:    CF_END
713; EG-NEXT:    PAD
714; EG-NEXT:    Fetch clause starting at 6:
715; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
716; EG-NEXT:    ALU clause starting at 8:
717; EG-NEXT:     MOV * T0.X, KC0[2].Z,
718; EG-NEXT:    ALU clause starting at 9:
719; EG-NEXT:     FFBL_INT T0.W, T0.X,
720; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
721; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
722; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
723; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
724; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
725; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
726; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
727; EG-NEXT:     LSHL T0.X, PV.W, PS,
728; EG-NEXT:     LSHL * T0.W, literal.x, PS,
729; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
730; EG-NEXT:     MOV T0.Y, 0.0,
731; EG-NEXT:     MOV * T0.Z, 0.0,
732; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
733; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
734;
735; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
736; GFX9-GISEL:       ; %bb.0:
737; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
738; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
739; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
740; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
742; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
743; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
744; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
745; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
746; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
747; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
748; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
749; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
750; GFX9-GISEL-NEXT:    s_endpgm
751  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
752  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
753  %cttz_ret = icmp ne i16 %val, 0
754  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
755  store i16 %ret, i16 addrspace(1)* %out, align 4
756  ret void
757}
758
; Loads an i32 from %arrayidx with align 1, takes cttz with the
; zero-is-undefined flag set (second operand i1 true), and selects the
; constant 32 in place of the cttz result when the loaded value is zero.
; The selected value is stored to %out.
; The autogenerated expectations below show the select becoming a
; v_min_u32 against 32 right after v_ffbl_b32 in the SI and VI runs, a
; CNDE_INT in the r600 run, and a compare plus v_cndmask_b32 in the
; GlobalISel run.
759define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
760; SI-LABEL: v_cttz_zero_undef_i32_with_select:
761; SI:       ; %bb.0:
762; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
763; SI-NEXT:    s_mov_b32 s3, 0xf000
764; SI-NEXT:    s_mov_b32 s2, -1
765; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
766; SI-NEXT:    s_mov_b32 s6, s2
767; SI-NEXT:    s_mov_b32 s7, s3
768; SI-NEXT:    s_waitcnt lgkmcnt(0)
769; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
770; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:3
771; SI-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
772; SI-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:2
773; SI-NEXT:    s_waitcnt vmcnt(3)
774; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
775; SI-NEXT:    s_waitcnt vmcnt(2)
776; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
777; SI-NEXT:    s_waitcnt vmcnt(1)
778; SI-NEXT:    v_or_b32_e32 v0, v0, v2
779; SI-NEXT:    s_waitcnt vmcnt(0)
780; SI-NEXT:    v_or_b32_e32 v1, v1, v3
781; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
782; SI-NEXT:    v_or_b32_e32 v0, v1, v0
783; SI-NEXT:    v_ffbl_b32_e32 v0, v0
784; SI-NEXT:    v_min_u32_e32 v0, 32, v0
785; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
786; SI-NEXT:    s_endpgm
787;
788; VI-LABEL: v_cttz_zero_undef_i32_with_select:
789; VI:       ; %bb.0:
790; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
791; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
792; VI-NEXT:    s_waitcnt lgkmcnt(0)
793; VI-NEXT:    s_add_u32 s4, s2, 3
794; VI-NEXT:    s_addc_u32 s5, s3, 0
795; VI-NEXT:    v_mov_b32_e32 v2, s4
796; VI-NEXT:    v_mov_b32_e32 v3, s5
797; VI-NEXT:    s_add_u32 s4, s2, 2
798; VI-NEXT:    v_mov_b32_e32 v0, s2
799; VI-NEXT:    s_addc_u32 s5, s3, 0
800; VI-NEXT:    v_mov_b32_e32 v1, s3
801; VI-NEXT:    s_add_u32 s2, s2, 1
802; VI-NEXT:    s_addc_u32 s3, s3, 0
803; VI-NEXT:    v_mov_b32_e32 v4, s4
804; VI-NEXT:    v_mov_b32_e32 v7, s3
805; VI-NEXT:    v_mov_b32_e32 v5, s5
806; VI-NEXT:    v_mov_b32_e32 v6, s2
807; VI-NEXT:    flat_load_ubyte v2, v[2:3]
808; VI-NEXT:    flat_load_ubyte v3, v[4:5]
809; VI-NEXT:    flat_load_ubyte v4, v[6:7]
810; VI-NEXT:    flat_load_ubyte v0, v[0:1]
811; VI-NEXT:    s_waitcnt vmcnt(3)
812; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
813; VI-NEXT:    s_waitcnt vmcnt(2)
814; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
815; VI-NEXT:    s_waitcnt vmcnt(1)
816; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
817; VI-NEXT:    s_waitcnt vmcnt(0)
818; VI-NEXT:    v_or_b32_e32 v0, v2, v0
819; VI-NEXT:    v_or_b32_e32 v0, v1, v0
820; VI-NEXT:    v_ffbl_b32_e32 v0, v0
821; VI-NEXT:    v_min_u32_e32 v2, 32, v0
822; VI-NEXT:    v_mov_b32_e32 v0, s0
823; VI-NEXT:    v_mov_b32_e32 v1, s1
824; VI-NEXT:    flat_store_dword v[0:1], v2
825; VI-NEXT:    s_endpgm
826;
827; EG-LABEL: v_cttz_zero_undef_i32_with_select:
828; EG:       ; %bb.0:
829; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
830; EG-NEXT:    TEX 1 @6
831; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
832; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
833; EG-NEXT:    CF_END
834; EG-NEXT:    PAD
835; EG-NEXT:    Fetch clause starting at 6:
836; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
837; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
838; EG-NEXT:    ALU clause starting at 10:
839; EG-NEXT:     MOV * T0.X, KC0[2].Z,
840; EG-NEXT:    ALU clause starting at 11:
841; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
842; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
843; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
844; EG-NEXT:     FFBL_INT * T1.W, PV.W,
845; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
846; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
847; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
848;
849; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
850; GFX9-GISEL:       ; %bb.0:
851; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
852; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
853; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
854; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
856; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
857; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
858; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
859; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
860; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
861; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
862; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
863; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
864; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
865; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
866; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
867; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
868; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
869; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
870; GFX9-GISEL-NEXT:    s_endpgm
871  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
872  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
873  %cttz_ret = icmp ne i32 %val, 0
874  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
875  store i32 %ret, i32 addrspace(1)* %out, align 4
876  ret void
877}
878
; 64-bit variant of the zero-undef cttz plus select pattern. Loads an i64
; from %arrayidx with align 1, takes cttz with the zero-is-undefined flag
; set (second operand i1 true), and selects the constant 64 in place of
; the cttz result when the loaded value is zero. The selected value is
; stored to %out.
; In the autogenerated expectations below the 64-bit count is built from
; two 32-bit find-first-bit results, with 32 added to the one computed
; from bytes 4..7, and the SI and VI runs finish with v_min3_u32 against
; 64 while the GlobalISel run uses a 64-bit compare plus v_cndmask_b32.
879define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
880; SI-LABEL: v_cttz_zero_undef_i64_with_select:
881; SI:       ; %bb.0:
882; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
883; SI-NEXT:    s_mov_b32 s3, 0xf000
884; SI-NEXT:    s_mov_b32 s2, -1
885; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
886; SI-NEXT:    s_mov_b32 s6, s2
887; SI-NEXT:    s_mov_b32 s7, s3
888; SI-NEXT:    s_waitcnt lgkmcnt(0)
889; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
890; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:1
891; SI-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
892; SI-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:3
893; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:4
894; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:5
895; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:6
896; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:7
897; SI-NEXT:    s_waitcnt vmcnt(6)
898; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
899; SI-NEXT:    s_waitcnt vmcnt(4)
900; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
901; SI-NEXT:    s_waitcnt vmcnt(2)
902; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
903; SI-NEXT:    s_waitcnt vmcnt(0)
904; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
905; SI-NEXT:    v_or_b32_e32 v0, v1, v0
906; SI-NEXT:    v_or_b32_e32 v1, v3, v2
907; SI-NEXT:    v_or_b32_e32 v2, v5, v4
908; SI-NEXT:    v_or_b32_e32 v3, v7, v6
909; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
910; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
911; SI-NEXT:    v_or_b32_e32 v0, v1, v0
912; SI-NEXT:    v_or_b32_e32 v1, v3, v2
913; SI-NEXT:    v_ffbl_b32_e32 v1, v1
914; SI-NEXT:    v_ffbl_b32_e32 v0, v0
915; SI-NEXT:    v_min_u32_e32 v1, 0xffffffdf, v1
916; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v1
917; SI-NEXT:    v_min3_u32 v0, v0, v1, 64
918; SI-NEXT:    v_mov_b32_e32 v1, 0
919; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
920; SI-NEXT:    s_endpgm
921;
922; VI-LABEL: v_cttz_zero_undef_i64_with_select:
923; VI:       ; %bb.0:
924; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
925; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
926; VI-NEXT:    s_waitcnt lgkmcnt(0)
927; VI-NEXT:    s_add_u32 s4, s2, 5
928; VI-NEXT:    s_addc_u32 s5, s3, 0
929; VI-NEXT:    v_mov_b32_e32 v0, s4
930; VI-NEXT:    v_mov_b32_e32 v1, s5
931; VI-NEXT:    s_add_u32 s4, s2, 4
932; VI-NEXT:    s_addc_u32 s5, s3, 0
933; VI-NEXT:    v_mov_b32_e32 v2, s4
934; VI-NEXT:    v_mov_b32_e32 v3, s5
935; VI-NEXT:    s_add_u32 s4, s2, 7
936; VI-NEXT:    s_addc_u32 s5, s3, 0
937; VI-NEXT:    v_mov_b32_e32 v4, s4
938; VI-NEXT:    v_mov_b32_e32 v5, s5
939; VI-NEXT:    s_add_u32 s4, s2, 6
940; VI-NEXT:    s_addc_u32 s5, s3, 0
941; VI-NEXT:    v_mov_b32_e32 v7, s5
942; VI-NEXT:    v_mov_b32_e32 v6, s4
943; VI-NEXT:    s_add_u32 s4, s2, 3
944; VI-NEXT:    s_addc_u32 s5, s3, 0
945; VI-NEXT:    v_mov_b32_e32 v9, s5
946; VI-NEXT:    v_mov_b32_e32 v8, s4
947; VI-NEXT:    s_add_u32 s4, s2, 2
948; VI-NEXT:    s_addc_u32 s5, s3, 0
949; VI-NEXT:    v_mov_b32_e32 v11, s5
950; VI-NEXT:    v_mov_b32_e32 v10, s4
951; VI-NEXT:    flat_load_ubyte v12, v[0:1]
952; VI-NEXT:    flat_load_ubyte v13, v[2:3]
953; VI-NEXT:    flat_load_ubyte v4, v[4:5]
954; VI-NEXT:    flat_load_ubyte v5, v[6:7]
955; VI-NEXT:    s_add_u32 s4, s2, 1
956; VI-NEXT:    flat_load_ubyte v6, v[8:9]
957; VI-NEXT:    s_addc_u32 s5, s3, 0
958; VI-NEXT:    v_mov_b32_e32 v0, s4
959; VI-NEXT:    v_mov_b32_e32 v2, s2
960; VI-NEXT:    v_mov_b32_e32 v1, s5
961; VI-NEXT:    v_mov_b32_e32 v3, s3
962; VI-NEXT:    flat_load_ubyte v7, v[10:11]
963; VI-NEXT:    flat_load_ubyte v0, v[0:1]
964; VI-NEXT:    flat_load_ubyte v2, v[2:3]
965; VI-NEXT:    v_mov_b32_e32 v1, 0
966; VI-NEXT:    s_waitcnt vmcnt(7)
967; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v12
968; VI-NEXT:    s_waitcnt vmcnt(6)
969; VI-NEXT:    v_or_b32_e32 v3, v3, v13
970; VI-NEXT:    s_waitcnt vmcnt(5)
971; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
972; VI-NEXT:    s_waitcnt vmcnt(4)
973; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
974; VI-NEXT:    v_or_b32_e32 v3, v4, v3
975; VI-NEXT:    s_waitcnt vmcnt(3)
976; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
977; VI-NEXT:    v_ffbl_b32_e32 v3, v3
978; VI-NEXT:    v_add_u32_e64 v3, s[2:3], v3, 32 clamp
979; VI-NEXT:    s_waitcnt vmcnt(2)
980; VI-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
981; VI-NEXT:    s_waitcnt vmcnt(1)
982; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
983; VI-NEXT:    s_waitcnt vmcnt(0)
984; VI-NEXT:    v_or_b32_e32 v0, v0, v2
985; VI-NEXT:    v_or_b32_e32 v0, v4, v0
986; VI-NEXT:    v_ffbl_b32_e32 v0, v0
987; VI-NEXT:    v_min3_u32 v0, v0, v3, 64
988; VI-NEXT:    v_mov_b32_e32 v3, s1
989; VI-NEXT:    v_mov_b32_e32 v2, s0
990; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
991; VI-NEXT:    s_endpgm
992;
993; EG-LABEL: v_cttz_zero_undef_i64_with_select:
994; EG:       ; %bb.0:
995; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
996; EG-NEXT:    TEX 3 @6
997; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
998; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
999; EG-NEXT:    CF_END
1000; EG-NEXT:    PAD
1001; EG-NEXT:    Fetch clause starting at 6:
1002; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 6, #1
1003; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1004; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 2, #1
1005; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1006; EG-NEXT:    ALU clause starting at 14:
1007; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1008; EG-NEXT:    ALU clause starting at 15:
1009; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1010; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1011; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1012; EG-NEXT:     FFBL_INT T1.W, PV.W,
1013; EG-NEXT:     LSHL * T2.W, T3.X, literal.x,
1014; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1015; EG-NEXT:     CNDE_INT T0.W, T0.W, literal.x, PV.W,
1016; EG-NEXT:     OR_INT * T1.W, PS, T2.X,
1017; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1018; EG-NEXT:     FFBL_INT T2.W, PS,
1019; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
1020; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1021; EG-NEXT:     CNDE_INT T0.X, T1.W, PS, PV.W,
1022; EG-NEXT:     MOV T0.Y, 0.0,
1023; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1024; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1025;
1026; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
1027; GFX9-GISEL:       ; %bb.0:
1028; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1029; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1030; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1031; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1032; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
1033; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
1034; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
1035; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
1036; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
1037; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
1038; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:7
1039; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:6
1040; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
1041; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
1042; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
1043; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1044; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
1045; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
1046; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v3, v0
1047; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1048; GFX9-GISEL-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
1049; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1050; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
1051; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1052; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
1053; GFX9-GISEL-NEXT:    v_or3_b32 v3, v5, v6, v4
1054; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v4, v3
1055; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v2
1056; GFX9-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
1057; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1058; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
1059; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
1060; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
1061; GFX9-GISEL-NEXT:    s_endpgm
1062  %val = load i64, i64 addrspace(1)* %arrayidx, align 1
1063  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
1064  %cttz_ret = icmp ne i64 %val, 0
1065  %ret = select i1 %cttz_ret, i64 %cttz, i64 64
1066  store i64 %ret, i64 addrspace(1)* %out, align 4
1067  ret void
1068}
1069
; Loads an i32 from %arrayidx with align 1, takes cttz with the
; zero-is-undefined flag clear (second operand i1 false), and selects -1
; in place of the cttz result when the loaded value equals zero. The
; selected value is stored to %out.
; In the autogenerated expectations below the SI and VI runs contain only
; v_ffbl_b32 with no compare or select (presumably because the
; instruction already yields -1 for a zero input; confirm against the
; ISA manual), while the r600 and GlobalISel runs keep an explicit
; select.
1070define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1071; SI-LABEL: v_cttz_i32_sel_eq_neg1:
1072; SI:       ; %bb.0:
1073; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1074; SI-NEXT:    s_mov_b32 s3, 0xf000
1075; SI-NEXT:    s_mov_b32 s2, -1
1076; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1077; SI-NEXT:    s_mov_b32 s6, s2
1078; SI-NEXT:    s_mov_b32 s7, s3
1079; SI-NEXT:    s_waitcnt lgkmcnt(0)
1080; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1081; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1082; SI-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
1083; SI-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1084; SI-NEXT:    s_waitcnt vmcnt(3)
1085; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1086; SI-NEXT:    s_waitcnt vmcnt(2)
1087; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1088; SI-NEXT:    s_waitcnt vmcnt(1)
1089; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1090; SI-NEXT:    s_waitcnt vmcnt(0)
1091; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1092; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1093; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1094; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1095; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1096; SI-NEXT:    s_endpgm
1097;
1098; VI-LABEL: v_cttz_i32_sel_eq_neg1:
1099; VI:       ; %bb.0:
1100; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1101; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1102; VI-NEXT:    s_waitcnt lgkmcnt(0)
1103; VI-NEXT:    s_add_u32 s4, s2, 3
1104; VI-NEXT:    s_addc_u32 s5, s3, 0
1105; VI-NEXT:    v_mov_b32_e32 v2, s4
1106; VI-NEXT:    v_mov_b32_e32 v3, s5
1107; VI-NEXT:    s_add_u32 s4, s2, 2
1108; VI-NEXT:    v_mov_b32_e32 v0, s2
1109; VI-NEXT:    s_addc_u32 s5, s3, 0
1110; VI-NEXT:    v_mov_b32_e32 v1, s3
1111; VI-NEXT:    s_add_u32 s2, s2, 1
1112; VI-NEXT:    s_addc_u32 s3, s3, 0
1113; VI-NEXT:    v_mov_b32_e32 v4, s4
1114; VI-NEXT:    v_mov_b32_e32 v7, s3
1115; VI-NEXT:    v_mov_b32_e32 v5, s5
1116; VI-NEXT:    v_mov_b32_e32 v6, s2
1117; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1118; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1119; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1120; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1121; VI-NEXT:    s_waitcnt vmcnt(3)
1122; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1123; VI-NEXT:    s_waitcnt vmcnt(2)
1124; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1125; VI-NEXT:    s_waitcnt vmcnt(1)
1126; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1127; VI-NEXT:    s_waitcnt vmcnt(0)
1128; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1129; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1130; VI-NEXT:    v_ffbl_b32_e32 v2, v0
1131; VI-NEXT:    v_mov_b32_e32 v0, s0
1132; VI-NEXT:    v_mov_b32_e32 v1, s1
1133; VI-NEXT:    flat_store_dword v[0:1], v2
1134; VI-NEXT:    s_endpgm
1135;
1136; EG-LABEL: v_cttz_i32_sel_eq_neg1:
1137; EG:       ; %bb.0:
1138; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1139; EG-NEXT:    TEX 1 @6
1140; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1141; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1142; EG-NEXT:    CF_END
1143; EG-NEXT:    PAD
1144; EG-NEXT:    Fetch clause starting at 6:
1145; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1146; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1147; EG-NEXT:    ALU clause starting at 10:
1148; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1149; EG-NEXT:    ALU clause starting at 11:
1150; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1151; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1152; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1153; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1154; EG-NEXT:     CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1155; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1156; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
1157; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1158; EG-NEXT:    -1(nan), 2(2.802597e-45)
1159;
1160; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
1161; GFX9-GISEL:       ; %bb.0:
1162; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1163; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1164; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1165; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1166; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1167; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1168; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1169; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1170; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1171; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1172; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1173; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1174; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1175; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1176; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1177; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
1178; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1179; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1180; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
1181; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1182; GFX9-GISEL-NEXT:    s_endpgm
1183  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
1184  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1185  %cmp = icmp eq i32 %val, 0
1186  %sel = select i1 %cmp, i32 -1, i32 %cttz
1187  store i32 %sel, i32 addrspace(1)* %out
1188  ret void
1189}
1190
; Same pattern as the "eq" variant of this test but with the compare
; inverted and the select operands swapped. Loads an i32 from %arrayidx
; with align 1, takes cttz with the zero-is-undefined flag clear (second
; operand i1 false), keeps the cttz result when the loaded value is
; non-zero and selects -1 otherwise. The selected value is stored to %out.
; In the autogenerated expectations below the SI and VI runs contain only
; v_ffbl_b32 with no compare or select, while the r600 and GlobalISel
; runs keep an explicit select.
1191define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1192; SI-LABEL: v_cttz_i32_sel_ne_neg1:
1193; SI:       ; %bb.0:
1194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1195; SI-NEXT:    s_mov_b32 s3, 0xf000
1196; SI-NEXT:    s_mov_b32 s2, -1
1197; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1198; SI-NEXT:    s_mov_b32 s6, s2
1199; SI-NEXT:    s_mov_b32 s7, s3
1200; SI-NEXT:    s_waitcnt lgkmcnt(0)
1201; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1202; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1203; SI-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
1204; SI-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1205; SI-NEXT:    s_waitcnt vmcnt(3)
1206; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1207; SI-NEXT:    s_waitcnt vmcnt(2)
1208; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1209; SI-NEXT:    s_waitcnt vmcnt(1)
1210; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1211; SI-NEXT:    s_waitcnt vmcnt(0)
1212; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1213; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1214; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1215; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1216; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1217; SI-NEXT:    s_endpgm
1218;
1219; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1220; VI:       ; %bb.0:
1221; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1222; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1223; VI-NEXT:    s_waitcnt lgkmcnt(0)
1224; VI-NEXT:    s_add_u32 s4, s2, 3
1225; VI-NEXT:    s_addc_u32 s5, s3, 0
1226; VI-NEXT:    v_mov_b32_e32 v2, s4
1227; VI-NEXT:    v_mov_b32_e32 v3, s5
1228; VI-NEXT:    s_add_u32 s4, s2, 2
1229; VI-NEXT:    v_mov_b32_e32 v0, s2
1230; VI-NEXT:    s_addc_u32 s5, s3, 0
1231; VI-NEXT:    v_mov_b32_e32 v1, s3
1232; VI-NEXT:    s_add_u32 s2, s2, 1
1233; VI-NEXT:    s_addc_u32 s3, s3, 0
1234; VI-NEXT:    v_mov_b32_e32 v4, s4
1235; VI-NEXT:    v_mov_b32_e32 v7, s3
1236; VI-NEXT:    v_mov_b32_e32 v5, s5
1237; VI-NEXT:    v_mov_b32_e32 v6, s2
1238; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1239; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1240; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1241; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1242; VI-NEXT:    s_waitcnt vmcnt(3)
1243; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1244; VI-NEXT:    s_waitcnt vmcnt(2)
1245; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1246; VI-NEXT:    s_waitcnt vmcnt(1)
1247; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1248; VI-NEXT:    s_waitcnt vmcnt(0)
1249; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1250; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1251; VI-NEXT:    v_ffbl_b32_e32 v2, v0
1252; VI-NEXT:    v_mov_b32_e32 v0, s0
1253; VI-NEXT:    v_mov_b32_e32 v1, s1
1254; VI-NEXT:    flat_store_dword v[0:1], v2
1255; VI-NEXT:    s_endpgm
1256;
1257; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1258; EG:       ; %bb.0:
1259; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1260; EG-NEXT:    TEX 1 @6
1261; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1262; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1263; EG-NEXT:    CF_END
1264; EG-NEXT:    PAD
1265; EG-NEXT:    Fetch clause starting at 6:
1266; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1267; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1268; EG-NEXT:    ALU clause starting at 10:
1269; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1270; EG-NEXT:    ALU clause starting at 11:
1271; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1272; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1273; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1274; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1275; EG-NEXT:     CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1276; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1277; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
1278; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1279; EG-NEXT:    -1(nan), 2(2.802597e-45)
1280;
1281; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1282; GFX9-GISEL:       ; %bb.0:
1283; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1284; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1285; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1286; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1288; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1289; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1290; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1291; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1292; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1293; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1294; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1295; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1296; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1297; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1298; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
1299; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1300; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1301; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
1302; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1303; GFX9-GISEL-NEXT:    s_endpgm
1304  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
1305  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1306  %cmp = icmp ne i32 %val, 0
1307  %sel = select i1 %cmp, i32 %cttz, i32 -1
1308  store i32 %sel, i32 addrspace(1)* %out
1309  ret void
1310}
1311
; Variant where the compare is made on the cttz result rather than on the
; input. Loads an i32 from %arrayidx with align 1, takes cttz with the
; zero-is-undefined flag clear (second operand i1 false), keeps the cttz
; result when it differs from 32 and selects -1 otherwise. The selected
; value is stored to %out.
; In the autogenerated expectations below every run keeps the explicit
; clamp to 32, the compare against 32, and the conditional select after
; the find-first-bit instruction, i.e. this form is not reduced to a bare
; v_ffbl_b32.
1312define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1313; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1314; SI:       ; %bb.0:
1315; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1316; SI-NEXT:    s_mov_b32 s3, 0xf000
1317; SI-NEXT:    s_mov_b32 s2, -1
1318; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1319; SI-NEXT:    s_mov_b32 s6, s2
1320; SI-NEXT:    s_mov_b32 s7, s3
1321; SI-NEXT:    s_waitcnt lgkmcnt(0)
1322; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1323; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1324; SI-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
1325; SI-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1326; SI-NEXT:    s_waitcnt vmcnt(3)
1327; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1328; SI-NEXT:    s_waitcnt vmcnt(2)
1329; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1330; SI-NEXT:    s_waitcnt vmcnt(1)
1331; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1332; SI-NEXT:    s_waitcnt vmcnt(0)
1333; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1334; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1335; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1336; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1337; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1338; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1339; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1340; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1341; SI-NEXT:    s_endpgm
1342;
1343; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1344; VI:       ; %bb.0:
1345; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1346; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1347; VI-NEXT:    s_waitcnt lgkmcnt(0)
1348; VI-NEXT:    s_add_u32 s4, s2, 3
1349; VI-NEXT:    s_addc_u32 s5, s3, 0
1350; VI-NEXT:    v_mov_b32_e32 v2, s4
1351; VI-NEXT:    v_mov_b32_e32 v3, s5
1352; VI-NEXT:    s_add_u32 s4, s2, 2
1353; VI-NEXT:    v_mov_b32_e32 v0, s2
1354; VI-NEXT:    s_addc_u32 s5, s3, 0
1355; VI-NEXT:    v_mov_b32_e32 v1, s3
1356; VI-NEXT:    s_add_u32 s2, s2, 1
1357; VI-NEXT:    s_addc_u32 s3, s3, 0
1358; VI-NEXT:    v_mov_b32_e32 v4, s4
1359; VI-NEXT:    v_mov_b32_e32 v7, s3
1360; VI-NEXT:    v_mov_b32_e32 v5, s5
1361; VI-NEXT:    v_mov_b32_e32 v6, s2
1362; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1363; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1364; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1365; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1366; VI-NEXT:    s_waitcnt vmcnt(3)
1367; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1368; VI-NEXT:    s_waitcnt vmcnt(2)
1369; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1370; VI-NEXT:    s_waitcnt vmcnt(1)
1371; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1372; VI-NEXT:    s_waitcnt vmcnt(0)
1373; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1374; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1375; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1376; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1377; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1378; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
1379; VI-NEXT:    v_mov_b32_e32 v0, s0
1380; VI-NEXT:    v_mov_b32_e32 v1, s1
1381; VI-NEXT:    flat_store_dword v[0:1], v2
1382; VI-NEXT:    s_endpgm
1383;
1384; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1385; EG:       ; %bb.0:
1386; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1387; EG-NEXT:    TEX 1 @6
1388; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
1389; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1390; EG-NEXT:    CF_END
1391; EG-NEXT:    PAD
1392; EG-NEXT:    Fetch clause starting at 6:
1393; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1394; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1395; EG-NEXT:    ALU clause starting at 10:
1396; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1397; EG-NEXT:    ALU clause starting at 11:
1398; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1399; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1400; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1401; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1402; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W,
1403; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1404; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1405; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1406; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1407; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1408; EG-NEXT:    -1(nan), 2(2.802597e-45)
1409;
1410; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1411; GFX9-GISEL:       ; %bb.0:
1412; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1413; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1414; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1415; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1417; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1418; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1419; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1420; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1421; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1422; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1423; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1424; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1425; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1426; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1427; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1428; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1429; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
1430; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
1431; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1432; GFX9-GISEL-NEXT:    s_endpgm
1433  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
1434  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1435  %cmp = icmp ne i32 %cttz, 32
1436  %sel = select i1 %cmp, i32 %cttz, i32 -1
1437  store i32 %sel, i32 addrspace(1)* %out
1438  ret void
1439}
1440
; Test case: i8 cttz whose result is replaced by -1 when the input is zero.
; The kernel loads one byte from %arrayidx, calls llvm.cttz.i8 with the second
; operand set to false (per the LLVM LangRef this makes the result for a zero
; input well defined), selects -1 if the loaded value equals 0, and stores the
; selected byte to %out.
; What the expected output below shows per run line:
;   * SI prefix         - only v_ffbl_b32 sits between the byte load and the
;                         byte store; no compare or select is emitted.
;   * VI prefix         - v_or with 0x100, v_ffbl_b32, then v_cmp_ne_u16 and
;                         v_cndmask against the constant 0xff.
;   * EG prefix (r600)  - FFBL_INT followed by masking with 255 and a
;                         MEM_RAT MSKOR store.
;   * GFX9-GISEL prefix - v_or with 0x100, v_ffbl_b32, v_and with 0xff, then
;                         v_cmp_eq_u32 and v_cndmask against 0xff.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script instead of editing them by hand.
1441 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1442; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1443; SI:       ; %bb.0:
1444; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1445; SI-NEXT:    s_mov_b32 s3, 0xf000
1446; SI-NEXT:    s_mov_b32 s2, -1
1447; SI-NEXT:    s_mov_b32 s6, s2
1448; SI-NEXT:    s_mov_b32 s7, s3
1449; SI-NEXT:    s_waitcnt lgkmcnt(0)
1450; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
1451; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1452; SI-NEXT:    s_waitcnt vmcnt(0)
1453; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1454; SI-NEXT:    s_waitcnt lgkmcnt(0)
1455; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1456; SI-NEXT:    s_endpgm
1457;
1458; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1459; VI:       ; %bb.0:
1460; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1461; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1462; VI-NEXT:    s_waitcnt lgkmcnt(0)
1463; VI-NEXT:    v_mov_b32_e32 v0, s2
1464; VI-NEXT:    v_mov_b32_e32 v1, s3
1465; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1466; VI-NEXT:    v_mov_b32_e32 v1, 0xff
1467; VI-NEXT:    s_waitcnt vmcnt(0)
1468; VI-NEXT:    v_or_b32_e32 v2, 0x100, v0
1469; VI-NEXT:    v_ffbl_b32_e32 v2, v2
1470; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
1471; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
1472; VI-NEXT:    v_mov_b32_e32 v0, s0
1473; VI-NEXT:    v_mov_b32_e32 v1, s1
1474; VI-NEXT:    flat_store_byte v[0:1], v2
1475; VI-NEXT:    s_endpgm
1476;
1477; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1478; EG:       ; %bb.0:
1479; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1480; EG-NEXT:    TEX 0 @6
1481; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1482; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1483; EG-NEXT:    CF_END
1484; EG-NEXT:    PAD
1485; EG-NEXT:    Fetch clause starting at 6:
1486; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1487; EG-NEXT:    ALU clause starting at 8:
1488; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1489; EG-NEXT:    ALU clause starting at 9:
1490; EG-NEXT:     FFBL_INT T0.W, T0.X,
1491; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1492; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1493; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1494; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1495; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1496; EG-NEXT:     LSHL T0.X, PV.W, PS,
1497; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1498; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1499; EG-NEXT:     MOV T0.Y, 0.0,
1500; EG-NEXT:     MOV * T0.Z, 0.0,
1501; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1502; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1503;
1504; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1505; GFX9-GISEL:       ; %bb.0:
1506; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1507; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1508; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1509; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
1510; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1511; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1512; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1513; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
1514; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
1515; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
1516; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1517; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
1518; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
1519; GFX9-GISEL-NEXT:    s_endpgm
  ; IR under test. Despite its name, %ctlz holds the result of llvm.cttz.i8
  ; (count of trailing zeros), not a leading-zero count.
1520  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
1521  %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  ; The select condition tests the loaded input (%val == 0), not the cttz result.
1522  %cmp = icmp eq i8 %val, 0
1523  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1524  store i8 %sel, i8 addrspace(1)* %out
1525  ret void
1526}
1527
; Test case: i16 cttz whose result is replaced by -1 when the input is zero.
; The kernel loads an i16 from %arrayidx with `align 1`, calls llvm.cttz.i16
; with the second operand set to false (per the LLVM LangRef this makes the
; result for a zero input well defined), selects -1 if the loaded value equals
; 0, and stores the selected halfword to %out.
; What the expected output below shows per run line:
;   * SI prefix         - two ubyte loads recombined with shift-left-by-8 and
;                         or, then only v_ffbl_b32 before the short store; no
;                         compare or select is emitted.
;   * VI prefix         - two ubyte loads recombined the same way, v_or with
;                         0x10000, v_ffbl_b32, v_min_u32 with 32, then
;                         v_cmp_ne_u16 and v_cndmask against 0xffff.
;   * EG prefix (r600)  - a single VTX_READ_16, FFBL_INT, masking with 65535
;                         and a MEM_RAT MSKOR store.
;   * GFX9-GISEL prefix - two ubyte loads merged by v_lshl_or_b32, v_or with
;                         0x10000, v_ffbl_b32, v_and with 0xffff, then
;                         v_cmp_eq_u32 and v_cndmask against 0xffff.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script instead of editing them by hand.
1528 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1529; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1530; SI:       ; %bb.0:
1531; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1532; SI-NEXT:    s_mov_b32 s3, 0xf000
1533; SI-NEXT:    s_mov_b32 s2, -1
1534; SI-NEXT:    s_mov_b32 s6, s2
1535; SI-NEXT:    s_mov_b32 s7, s3
1536; SI-NEXT:    s_waitcnt lgkmcnt(0)
1537; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1538; SI-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0
1539; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1540; SI-NEXT:    s_waitcnt vmcnt(1)
1541; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1542; SI-NEXT:    s_waitcnt vmcnt(0)
1543; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1544; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1545; SI-NEXT:    s_waitcnt lgkmcnt(0)
1546; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1547; SI-NEXT:    s_endpgm
1548;
1549; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1550; VI:       ; %bb.0:
1551; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1552; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1553; VI-NEXT:    s_waitcnt lgkmcnt(0)
1554; VI-NEXT:    s_add_u32 s4, s2, 1
1555; VI-NEXT:    s_addc_u32 s5, s3, 0
1556; VI-NEXT:    v_mov_b32_e32 v2, s4
1557; VI-NEXT:    v_mov_b32_e32 v0, s2
1558; VI-NEXT:    v_mov_b32_e32 v3, s5
1559; VI-NEXT:    v_mov_b32_e32 v1, s3
1560; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1561; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1562; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1563; VI-NEXT:    s_waitcnt vmcnt(1)
1564; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
1565; VI-NEXT:    s_waitcnt vmcnt(0)
1566; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1567; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
1568; VI-NEXT:    v_ffbl_b32_e32 v2, v2
1569; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1570; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
1571; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
1572; VI-NEXT:    v_mov_b32_e32 v0, s0
1573; VI-NEXT:    v_mov_b32_e32 v1, s1
1574; VI-NEXT:    flat_store_short v[0:1], v2
1575; VI-NEXT:    s_endpgm
1576;
1577; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1578; EG:       ; %bb.0:
1579; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1580; EG-NEXT:    TEX 0 @6
1581; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1582; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1583; EG-NEXT:    CF_END
1584; EG-NEXT:    PAD
1585; EG-NEXT:    Fetch clause starting at 6:
1586; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1587; EG-NEXT:    ALU clause starting at 8:
1588; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1589; EG-NEXT:    ALU clause starting at 9:
1590; EG-NEXT:     FFBL_INT T0.W, T0.X,
1591; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1592; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1593; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1594; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1595; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1596; EG-NEXT:     LSHL T0.X, PV.W, PS,
1597; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1598; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1599; EG-NEXT:     MOV T0.Y, 0.0,
1600; EG-NEXT:     MOV * T0.Z, 0.0,
1601; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1603;
1604; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1605; GFX9-GISEL:       ; %bb.0:
1606; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1607; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1608; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1609; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
1610; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1611; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1612; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1613; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1614; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1615; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1616; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
1617; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1618; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1619; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
1620; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1621; GFX9-GISEL-NEXT:    s_endpgm
  ; IR under test. The i16 load is only byte-aligned (align 1). Despite its
  ; name, %ctlz holds the result of llvm.cttz.i16 (count of trailing zeros),
  ; not a leading-zero count.
1622  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
1623  %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  ; The select condition tests the loaded input (%val == 0), not the cttz result.
1624  %cmp = icmp eq i16 %val, 0
1625  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1626  store i16 %sel, i16 addrspace(1)* %out
1627  ret void
1628}
1629
1630
1631