1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
7
; udiv_i32: unsigned 32-bit divide with both operands read from memory.
; The dividend is loaded from %in, the divisor from the i32 element that
; follows it, and the quotient is stored to %out.
; The check lines below pin the expected output for each RUN-line prefix
; (SI, VI, GCN, GFX1030, EG). Per the NOTE at the top of this file they are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script rather than editing them by hand.
8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
9; SI-LABEL: udiv_i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_u32_e32 v2, v1
24; SI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
25; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
26; SI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
27; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
28; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
29; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
30; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
31; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
32; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
33; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
34; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v3, v0
35; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
36; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
37; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v1, v0
38; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
39; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
40; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
41; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
42; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: udiv_i32:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
48; VI-NEXT:    s_mov_b32 s7, 0xf000
49; VI-NEXT:    s_mov_b32 s6, -1
50; VI-NEXT:    s_mov_b32 s10, s6
51; VI-NEXT:    s_mov_b32 s11, s7
52; VI-NEXT:    s_waitcnt lgkmcnt(0)
53; VI-NEXT:    s_mov_b32 s8, s2
54; VI-NEXT:    s_mov_b32 s9, s3
55; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56; VI-NEXT:    s_mov_b32 s4, s0
57; VI-NEXT:    s_mov_b32 s5, s1
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    v_cvt_f32_u32_e32 v2, v1
60; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
61; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
62; VI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
63; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
64; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
65; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
66; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
67; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
68; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
69; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
70; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v3, v0
71; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
72; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
73; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v1, v0
74; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
75; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
76; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
77; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
78; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; VI-NEXT:    s_endpgm
80;
81; GCN-LABEL: udiv_i32:
82; GCN:       ; %bb.0:
83; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
84; GCN-NEXT:    s_waitcnt lgkmcnt(0)
85; GCN-NEXT:    v_mov_b32_e32 v0, s2
86; GCN-NEXT:    v_mov_b32_e32 v1, s3
87; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
88; GCN-NEXT:    s_waitcnt vmcnt(0)
89; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
90; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
91; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
92; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
93; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
94; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
95; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
96; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
97; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
98; GCN-NEXT:    v_mov_b32_e32 v2, s0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
101; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
102; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v0
103; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
104; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
105; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v1, v0
106; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
107; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
108; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
109; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
110; GCN-NEXT:    flat_store_dword v[2:3], v0
111; GCN-NEXT:    s_endpgm
112;
113; GFX1030-LABEL: udiv_i32:
114; GFX1030:       ; %bb.0:
115; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
117; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
119; GFX1030-NEXT:    s_waitcnt vmcnt(0)
120; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, v1
121; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, 0, v1
122; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
123; GFX1030-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
124; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
125; GFX1030-NEXT:    v_mul_lo_u32 v4, v4, v3
126; GFX1030-NEXT:    v_mul_hi_u32 v4, v3, v4
127; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v4
128; GFX1030-NEXT:    v_mul_hi_u32 v3, v0, v3
129; GFX1030-NEXT:    v_mul_lo_u32 v4, v3, v1
130; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v4
131; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
132; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v0, v1
133; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
134; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
135; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
136; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
137; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
138; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
139; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
140; GFX1030-NEXT:    s_endpgm
141;
142; EG-LABEL: udiv_i32:
143; EG:       ; %bb.0:
144; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
145; EG-NEXT:    TEX 0 @6
146; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
148; EG-NEXT:    CF_END
149; EG-NEXT:    PAD
150; EG-NEXT:    Fetch clause starting at 6:
151; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
152; EG-NEXT:    ALU clause starting at 8:
153; EG-NEXT:     MOV * T0.X, KC0[2].Z,
154; EG-NEXT:    ALU clause starting at 9:
155; EG-NEXT:     SUB_INT T0.W, 0.0, T0.Y,
156; EG-NEXT:     RECIP_UINT * T0.Z, T0.Y,
157; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
158; EG-NEXT:     MULHI * T0.W, T0.Z, PS,
159; EG-NEXT:     ADD_INT * T0.W, T0.Z, PS,
160; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
161; EG-NEXT:     MULLO_INT * T0.W, PS, T0.Y,
162; EG-NEXT:     SUB_INT * T0.W, T0.X, PS,
163; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
164; EG-NEXT:     SETGE_UINT T1.W, PV.W, T0.Y,
165; EG-NEXT:     SUB_INT * T2.W, PV.W, T0.Y,
166; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.W, PS,
167; EG-NEXT:     CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
168; EG-NEXT:     ADD_INT T2.W, PS, 1,
169; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.Y,
170; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
171; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
172; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The divisor lives in the i32 slot right after the dividend (index 1).
173  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
174  %a = load i32, i32 addrspace(1)* %in
175  %b = load i32, i32 addrspace(1)* %b_ptr
  ; The operation under test: %a / %b, unsigned.
176  %result = udiv i32 %a, %b
177  store i32 %result, i32 addrspace(1)* %out
178  ret void
179}
180
; s_udiv_i32: unsigned 32-bit divide where both operands are passed directly
; as i32 kernel arguments (%a is the dividend, %b the divisor) instead of
; being loaded through a pointer. The quotient is stored to %out.
; The check lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
181define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
182; SI-LABEL: s_udiv_i32:
183; SI:       ; %bb.0:
184; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
185; SI-NEXT:    s_mov_b32 s7, 0xf000
186; SI-NEXT:    s_mov_b32 s6, -1
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    v_cvt_f32_u32_e32 v0, s3
189; SI-NEXT:    s_sub_i32 s4, 0, s3
190; SI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
191; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
192; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
193; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
195; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
196; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
197; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
198; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
199; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
200; SI-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
201; SI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
202; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
203; SI-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
204; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
205; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
206; SI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
207; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
208; SI-NEXT:    s_waitcnt lgkmcnt(0)
209; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
210; SI-NEXT:    s_endpgm
211;
212; VI-LABEL: s_udiv_i32:
213; VI:       ; %bb.0:
214; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
215; VI-NEXT:    s_mov_b32 s7, 0xf000
216; VI-NEXT:    s_mov_b32 s6, -1
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
219; VI-NEXT:    s_sub_i32 s4, 0, s3
220; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
221; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
222; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
223; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
224; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
225; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
226; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
227; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
228; VI-NEXT:    v_mul_lo_u32 v1, v0, s3
229; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
230; VI-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
231; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
232; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
233; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
234; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
235; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
236; VI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
237; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
238; VI-NEXT:    s_waitcnt lgkmcnt(0)
239; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
240; VI-NEXT:    s_endpgm
241;
242; GCN-LABEL: s_udiv_i32:
243; GCN:       ; %bb.0:
244; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
245; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
246; GCN-NEXT:    s_waitcnt lgkmcnt(0)
247; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
248; GCN-NEXT:    s_sub_i32 s0, 0, s3
249; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
250; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
251; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
252; GCN-NEXT:    v_mul_lo_u32 v1, s0, v0
253; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
254; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
255; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
256; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
257; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
258; GCN-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
259; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
260; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
261; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
262; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
263; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
264; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
265; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
266; GCN-NEXT:    v_mov_b32_e32 v0, s4
267; GCN-NEXT:    v_mov_b32_e32 v1, s5
268; GCN-NEXT:    flat_store_dword v[0:1], v2
269; GCN-NEXT:    s_endpgm
270;
271; GFX1030-LABEL: s_udiv_i32:
272; GFX1030:       ; %bb.0:
273; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
274; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s1
276; GFX1030-NEXT:    s_sub_i32 s2, 0, s1
277; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
278; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
279; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
280; GFX1030-NEXT:    v_mul_lo_u32 v1, s2, v0
281; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
282; GFX1030-NEXT:    v_mul_hi_u32 v1, v0, v1
283; GFX1030-NEXT:    v_add_nc_u32_e32 v0, v0, v1
284; GFX1030-NEXT:    v_mul_hi_u32 v0, s0, v0
285; GFX1030-NEXT:    v_mul_lo_u32 v1, v0, s1
286; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
287; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
288; GFX1030-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
289; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
290; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
291; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
292; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
293; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
294; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
295; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
296; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1030-NEXT:    global_store_dword v3, v0, s[2:3]
298; GFX1030-NEXT:    s_endpgm
299;
300; EG-LABEL: s_udiv_i32:
301; EG:       ; %bb.0:
302; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
303; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
304; EG-NEXT:    CF_END
305; EG-NEXT:    PAD
306; EG-NEXT:    ALU clause starting at 4:
307; EG-NEXT:     SUB_INT T0.W, 0.0, KC0[2].W,
308; EG-NEXT:     RECIP_UINT * T0.X, KC0[2].W,
309; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
310; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
311; EG-NEXT:     ADD_INT * T0.W, T0.X, PS,
312; EG-NEXT:     MULHI * T0.X, KC0[2].Z, PV.W,
313; EG-NEXT:     MULLO_INT * T0.Y, PS, KC0[2].W,
314; EG-NEXT:     SUB_INT * T0.W, KC0[2].Z, PS,
315; EG-NEXT:     SUB_INT T0.Z, PV.W, KC0[2].W,
316; EG-NEXT:     SETGE_UINT T1.W, PV.W, KC0[2].W,
317; EG-NEXT:     ADD_INT * T2.W, T0.X, 1,
318; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.X, PS,
319; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
320; EG-NEXT:     SETGE_UINT T0.W, PS, KC0[2].W,
321; EG-NEXT:     ADD_INT * T1.W, PV.W, 1,
322; EG-NEXT:     CNDE_INT T0.X, PV.W, T2.W, PS,
323; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The operation under test: both udiv operands are kernel arguments.
325  %result = udiv i32 %a, %b
326  store i32 %result, i32 addrspace(1)* %out
327  ret void
328}
329
330
331; The code generated by udiv is long and complex and may change
332; frequently. The goal of these tests is to make sure that ISel doesn't
333; fail when it gets a v2i32 or v4i32 udiv.
; udiv_v2i32: element-wise unsigned divide of two <2 x i32> vectors.
; The dividend vector is loaded from %in, the divisor vector from the
; <2 x i32> slot immediately after it, and the result vector is stored to
; %out.
; The check lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
334define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
335; SI-LABEL: udiv_v2i32:
336; SI:       ; %bb.0:
337; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
338; SI-NEXT:    s_mov_b32 s7, 0xf000
339; SI-NEXT:    s_mov_b32 s6, -1
340; SI-NEXT:    s_mov_b32 s10, s6
341; SI-NEXT:    s_mov_b32 s11, s7
342; SI-NEXT:    s_waitcnt lgkmcnt(0)
343; SI-NEXT:    s_mov_b32 s8, s2
344; SI-NEXT:    s_mov_b32 s9, s3
345; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
346; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
347; SI-NEXT:    s_mov_b32 s4, s0
348; SI-NEXT:    s_mov_b32 s5, s1
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    v_cvt_f32_u32_e32 v4, v2
351; SI-NEXT:    v_cvt_f32_u32_e32 v5, v3
352; SI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
353; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
354; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
355; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
356; SI-NEXT:    v_mul_f32_e32 v4, s2, v4
357; SI-NEXT:    v_mul_f32_e32 v5, s2, v5
358; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
359; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
360; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
361; SI-NEXT:    v_mul_lo_u32 v7, v7, v5
362; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
363; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
364; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
365; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
366; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
367; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
368; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
369; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
370; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
371; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
372; SI-NEXT:    v_subrev_i32_e32 v1, vcc, v8, v1
373; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
374; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
375; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
376; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
377; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
378; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
379; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
380; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
381; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
382; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
383; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
384; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
385; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
386; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
387; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
388; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
389; SI-NEXT:    s_endpgm
390;
391; VI-LABEL: udiv_v2i32:
392; VI:       ; %bb.0:
393; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
394; VI-NEXT:    s_mov_b32 s7, 0xf000
395; VI-NEXT:    s_mov_b32 s6, -1
396; VI-NEXT:    s_mov_b32 s10, s6
397; VI-NEXT:    s_mov_b32 s11, s7
398; VI-NEXT:    s_waitcnt lgkmcnt(0)
399; VI-NEXT:    s_mov_b32 s8, s2
400; VI-NEXT:    s_mov_b32 s9, s3
401; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
402; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
403; VI-NEXT:    s_mov_b32 s4, s0
404; VI-NEXT:    s_mov_b32 s5, s1
405; VI-NEXT:    s_waitcnt vmcnt(0)
406; VI-NEXT:    v_cvt_f32_u32_e32 v4, v2
407; VI-NEXT:    v_cvt_f32_u32_e32 v5, v3
408; VI-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
409; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
410; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
411; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
412; VI-NEXT:    v_mul_f32_e32 v4, s2, v4
413; VI-NEXT:    v_mul_f32_e32 v5, s2, v5
414; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
415; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
416; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
417; VI-NEXT:    v_mul_lo_u32 v7, v7, v5
418; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
419; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
420; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
421; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
422; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
423; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
424; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
425; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
426; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
427; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
428; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
429; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
430; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
431; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
432; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
433; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
434; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
435; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
436; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
437; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
438; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
439; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
440; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
441; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
442; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
443; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
444; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
445; VI-NEXT:    s_endpgm
446;
447; GCN-LABEL: udiv_v2i32:
448; GCN:       ; %bb.0:
449; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
450; GCN-NEXT:    s_waitcnt lgkmcnt(0)
451; GCN-NEXT:    v_mov_b32_e32 v0, s2
452; GCN-NEXT:    v_mov_b32_e32 v1, s3
453; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
454; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
455; GCN-NEXT:    s_waitcnt vmcnt(0)
456; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
457; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
458; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
459; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
460; GCN-NEXT:    v_mul_f32_e32 v4, s2, v4
461; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
462; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
463; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
464; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
465; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
466; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
467; GCN-NEXT:    v_mul_lo_u32 v8, v4, v7
468; GCN-NEXT:    v_mul_hi_u32 v9, v6, v5
469; GCN-NEXT:    v_mov_b32_e32 v4, s0
470; GCN-NEXT:    v_mov_b32_e32 v5, s1
471; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
472; GCN-NEXT:    v_add_u32_e32 v6, vcc, v9, v6
473; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
474; GCN-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
475; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
476; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
477; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
478; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
479; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
480; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
481; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
482; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
483; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
484; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
485; GCN-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
486; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
487; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
488; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
489; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
490; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
491; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v7
492; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
493; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
494; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
495; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
496; GCN-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
497; GCN-NEXT:    s_endpgm
498;
499; GFX1030-LABEL: udiv_v2i32:
500; GFX1030:       ; %bb.0:
501; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
502; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
503; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
504; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
506; GFX1030-NEXT:    s_waitcnt vmcnt(0)
507; GFX1030-NEXT:    v_cvt_f32_u32_e32 v5, v2
508; GFX1030-NEXT:    v_cvt_f32_u32_e32 v6, v3
509; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, 0, v2
510; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
511; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
512; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
513; GFX1030-NEXT:    v_mul_f32_e32 v5, s0, v5
514; GFX1030-NEXT:    v_mul_f32_e32 v6, s0, v6
515; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
516; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
517; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
518; GFX1030-NEXT:    v_mul_lo_u32 v8, v8, v6
519; GFX1030-NEXT:    v_mul_hi_u32 v7, v5, v7
520; GFX1030-NEXT:    v_mul_hi_u32 v8, v6, v8
521; GFX1030-NEXT:    v_add_nc_u32_e32 v5, v5, v7
522; GFX1030-NEXT:    v_add_nc_u32_e32 v6, v6, v8
523; GFX1030-NEXT:    v_mul_hi_u32 v5, v0, v5
524; GFX1030-NEXT:    v_mul_hi_u32 v6, v1, v6
525; GFX1030-NEXT:    v_mul_lo_u32 v7, v5, v2
526; GFX1030-NEXT:    v_mul_lo_u32 v8, v6, v3
527; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
528; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
529; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, v1, v8
530; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
531; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
532; GFX1030-NEXT:    v_sub_nc_u32_e32 v9, v1, v3
533; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v1, v3
534; GFX1030-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
535; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v0, v2
536; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s0
537; GFX1030-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
538; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
539; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
540; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
541; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
542; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc_lo
543; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v1, v3
544; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc_lo
545; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
546; GFX1030-NEXT:    s_endpgm
547;
548; EG-LABEL: udiv_v2i32:
549; EG:       ; %bb.0:
550; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
551; EG-NEXT:    TEX 0 @6
552; EG-NEXT:    ALU 33, @9, KC0[CB0:0-32], KC1[]
553; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
554; EG-NEXT:    CF_END
555; EG-NEXT:    PAD
556; EG-NEXT:    Fetch clause starting at 6:
557; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
558; EG-NEXT:    ALU clause starting at 8:
559; EG-NEXT:     MOV * T0.X, KC0[2].Z,
560; EG-NEXT:    ALU clause starting at 9:
561; EG-NEXT:     SUB_INT T1.W, 0.0, T0.W,
562; EG-NEXT:     RECIP_UINT * T1.X, T0.W,
563; EG-NEXT:     MULLO_INT * T1.Y, PV.W, PS,
564; EG-NEXT:     SUB_INT T1.W, 0.0, T0.Z,
565; EG-NEXT:     RECIP_UINT * T1.Z, T0.Z,
566; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
567; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
568; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
569; EG-NEXT:     MULHI * T1.Y, T1.X, T1.Y,
570; EG-NEXT:     ADD_INT T2.W, T1.X, PS,
571; EG-NEXT:     MULHI * T1.X, T0.X, PV.W,
572; EG-NEXT:     MULHI * T1.Y, T0.Y, PV.W,
573; EG-NEXT:     MULLO_INT * T1.Z, PS, T0.W,
574; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
575; EG-NEXT:     MULLO_INT * T0.Y, T1.X, T0.Z,
576; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
577; EG-NEXT:     ADD_INT T1.Z, T1.Y, 1,
578; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
579; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
580; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
581; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
582; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
583; EG-NEXT:     SETGE_UINT T1.W, PV.Y, T0.Z,
584; EG-NEXT:     SUB_INT * T2.W, PV.Y, T0.Z,
585; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS,
586; EG-NEXT:     CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
587; EG-NEXT:     ADD_INT T1.W, PV.Y, 1,
588; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
589; EG-NEXT:     CNDE_INT T1.Y, PS, T1.Y, PV.W,
590; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
591; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T0.Z,
592; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
593; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
594; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The divisor vector is the <2 x i32> slot right after the dividend (index 1).
595  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
596  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
597  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
  ; The operation under test: element-wise %a / %b, unsigned.
598  %result = udiv <2 x i32> %a, %b
599  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
600  ret void
601}
602
603define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
604; SI-LABEL: udiv_v4i32:
605; SI:       ; %bb.0:
606; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
607; SI-NEXT:    s_mov_b32 s11, 0xf000
608; SI-NEXT:    s_mov_b32 s10, -1
609; SI-NEXT:    s_mov_b32 s6, s10
610; SI-NEXT:    s_mov_b32 s7, s11
611; SI-NEXT:    s_waitcnt lgkmcnt(0)
612; SI-NEXT:    s_mov_b32 s4, s2
613; SI-NEXT:    s_mov_b32 s5, s3
614; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
615; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
616; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
617; SI-NEXT:    s_mov_b32 s8, s0
618; SI-NEXT:    s_mov_b32 s9, s1
619; SI-NEXT:    s_waitcnt vmcnt(1)
620; SI-NEXT:    v_cvt_f32_u32_e32 v8, v0
621; SI-NEXT:    v_cvt_f32_u32_e32 v10, v1
622; SI-NEXT:    v_cvt_f32_u32_e32 v12, v2
623; SI-NEXT:    v_cvt_f32_u32_e32 v14, v3
624; SI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
625; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
626; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
627; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
628; SI-NEXT:    v_mul_f32_e32 v8, s2, v8
629; SI-NEXT:    v_mul_f32_e32 v10, s2, v10
630; SI-NEXT:    v_mul_f32_e32 v12, s2, v12
631; SI-NEXT:    v_mul_f32_e32 v14, s2, v14
632; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
633; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
634; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
635; SI-NEXT:    v_cvt_u32_f32_e32 v14, v14
636; SI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
637; SI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
638; SI-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
639; SI-NEXT:    v_sub_i32_e32 v15, vcc, 0, v3
640; SI-NEXT:    v_mul_lo_u32 v9, v9, v8
641; SI-NEXT:    v_mul_lo_u32 v11, v11, v10
642; SI-NEXT:    v_mul_lo_u32 v13, v13, v12
643; SI-NEXT:    v_mul_lo_u32 v15, v15, v14
644; SI-NEXT:    v_mul_hi_u32 v9, v8, v9
645; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
646; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
647; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
648; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
649; SI-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
650; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v12
651; SI-NEXT:    v_add_i32_e32 v11, vcc, v15, v14
652; SI-NEXT:    s_waitcnt vmcnt(0)
653; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
654; SI-NEXT:    v_mul_hi_u32 v9, v5, v9
655; SI-NEXT:    v_mul_hi_u32 v10, v6, v10
656; SI-NEXT:    v_mul_hi_u32 v11, v7, v11
657; SI-NEXT:    v_mul_lo_u32 v12, v8, v0
658; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
659; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
660; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
661; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
662; SI-NEXT:    v_subrev_i32_e32 v5, vcc, v14, v5
663; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v16, v6
664; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v18, v7
665; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
666; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
667; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
668; SI-NEXT:    v_add_i32_e32 v19, vcc, 1, v11
669; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
670; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
671; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
672; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
673; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
674; SI-NEXT:    v_subrev_i32_e32 v12, vcc, v0, v4
675; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
676; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
677; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
678; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
679; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
680; SI-NEXT:    v_subrev_i32_e32 v15, vcc, v3, v7
681; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
682; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
683; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
684; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
685; SI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
686; SI-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
687; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
688; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v11
689; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
690; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
691; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
692; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
693; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
694; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
695; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
696; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
697; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
698; SI-NEXT:    s_endpgm
699;
700; VI-LABEL: udiv_v4i32:
701; VI:       ; %bb.0:
702; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
703; VI-NEXT:    s_mov_b32 s11, 0xf000
704; VI-NEXT:    s_mov_b32 s10, -1
705; VI-NEXT:    s_mov_b32 s6, s10
706; VI-NEXT:    s_mov_b32 s7, s11
707; VI-NEXT:    s_waitcnt lgkmcnt(0)
708; VI-NEXT:    s_mov_b32 s4, s2
709; VI-NEXT:    s_mov_b32 s5, s3
710; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
711; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
712; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
713; VI-NEXT:    s_mov_b32 s8, s0
714; VI-NEXT:    s_mov_b32 s9, s1
715; VI-NEXT:    s_waitcnt vmcnt(1)
716; VI-NEXT:    v_cvt_f32_u32_e32 v8, v0
717; VI-NEXT:    v_cvt_f32_u32_e32 v10, v1
718; VI-NEXT:    v_cvt_f32_u32_e32 v12, v2
719; VI-NEXT:    v_cvt_f32_u32_e32 v14, v3
720; VI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
721; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
722; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
723; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
724; VI-NEXT:    v_mul_f32_e32 v8, s2, v8
725; VI-NEXT:    v_mul_f32_e32 v10, s2, v10
726; VI-NEXT:    v_mul_f32_e32 v12, s2, v12
727; VI-NEXT:    v_mul_f32_e32 v14, s2, v14
728; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
729; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
730; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
731; VI-NEXT:    v_cvt_u32_f32_e32 v14, v14
732; VI-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
733; VI-NEXT:    v_sub_u32_e32 v11, vcc, 0, v1
734; VI-NEXT:    v_sub_u32_e32 v13, vcc, 0, v2
735; VI-NEXT:    v_sub_u32_e32 v15, vcc, 0, v3
736; VI-NEXT:    v_mul_lo_u32 v9, v9, v8
737; VI-NEXT:    v_mul_lo_u32 v11, v11, v10
738; VI-NEXT:    v_mul_lo_u32 v13, v13, v12
739; VI-NEXT:    v_mul_lo_u32 v15, v15, v14
740; VI-NEXT:    v_mul_hi_u32 v9, v8, v9
741; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
742; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
743; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
744; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
745; VI-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
746; VI-NEXT:    v_add_u32_e32 v10, vcc, v13, v12
747; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
748; VI-NEXT:    s_waitcnt vmcnt(0)
749; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
750; VI-NEXT:    v_mul_hi_u32 v9, v5, v9
751; VI-NEXT:    v_mul_hi_u32 v10, v6, v10
752; VI-NEXT:    v_mul_hi_u32 v11, v7, v11
753; VI-NEXT:    v_mul_lo_u32 v12, v8, v0
754; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
755; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
756; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
757; VI-NEXT:    v_subrev_u32_e32 v4, vcc, v12, v4
758; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
759; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
760; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v18, v7
761; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
762; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
763; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
764; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v11
765; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
766; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
767; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
768; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
769; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
770; VI-NEXT:    v_subrev_u32_e32 v12, vcc, v0, v4
771; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
772; VI-NEXT:    v_subrev_u32_e32 v13, vcc, v1, v5
773; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
774; VI-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
775; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
776; VI-NEXT:    v_subrev_u32_e32 v15, vcc, v3, v7
777; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
778; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
779; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
780; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
781; VI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
782; VI-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
783; VI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
784; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
785; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
786; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
787; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
788; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
789; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
790; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
791; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
792; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
793; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
794; VI-NEXT:    s_endpgm
795;
796; GCN-LABEL: udiv_v4i32:
797; GCN:       ; %bb.0:
798; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
799; GCN-NEXT:    s_waitcnt lgkmcnt(0)
800; GCN-NEXT:    s_add_u32 s4, s2, 16
801; GCN-NEXT:    s_addc_u32 s5, s3, 0
802; GCN-NEXT:    v_mov_b32_e32 v0, s4
803; GCN-NEXT:    v_mov_b32_e32 v1, s5
804; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
805; GCN-NEXT:    v_mov_b32_e32 v5, s3
806; GCN-NEXT:    v_mov_b32_e32 v4, s2
807; GCN-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
808; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
809; GCN-NEXT:    v_mov_b32_e32 v8, s0
810; GCN-NEXT:    v_mov_b32_e32 v9, s1
811; GCN-NEXT:    s_waitcnt vmcnt(1)
812; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v0
813; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v1
814; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v2
815; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v3
816; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
817; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
818; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
819; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
820; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
821; GCN-NEXT:    v_mul_f32_e32 v12, s2, v12
822; GCN-NEXT:    v_mul_f32_e32 v14, s2, v14
823; GCN-NEXT:    v_mul_f32_e32 v16, s2, v16
824; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
825; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
826; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
827; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
828; GCN-NEXT:    v_sub_u32_e32 v11, vcc, 0, v0
829; GCN-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
830; GCN-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
831; GCN-NEXT:    v_sub_u32_e32 v17, vcc, 0, v3
832; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
833; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
834; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
835; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
836; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
837; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
838; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
839; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
840; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
841; GCN-NEXT:    v_add_u32_e32 v11, vcc, v13, v12
842; GCN-NEXT:    v_add_u32_e32 v12, vcc, v15, v14
843; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
844; GCN-NEXT:    s_waitcnt vmcnt(0)
845; GCN-NEXT:    v_mul_hi_u32 v10, v4, v10
846; GCN-NEXT:    v_mul_hi_u32 v11, v5, v11
847; GCN-NEXT:    v_mul_hi_u32 v12, v6, v12
848; GCN-NEXT:    v_mul_hi_u32 v13, v7, v13
849; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
850; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
851; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
852; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
853; GCN-NEXT:    v_subrev_u32_e32 v4, vcc, v14, v4
854; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
855; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
856; GCN-NEXT:    v_subrev_u32_e32 v7, vcc, v19, v7
857; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
858; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
859; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
860; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
861; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
862; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
863; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
864; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
865; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
866; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v0, v4
867; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
868; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v1, v5
869; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
870; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
871; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
872; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v3, v7
873; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v15, s[0:1]
874; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
875; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[2:3]
876; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
877; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
878; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
879; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[6:7]
880; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
881; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
882; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v15, vcc
883; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
884; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v17, vcc
885; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
886; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc
887; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
888; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc
889; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
890; GCN-NEXT:    s_endpgm
891;
892; GFX1030-LABEL: udiv_v4i32:
893; GFX1030:       ; %bb.0:
894; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
895; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
896; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
897; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX1030-NEXT:    s_clause 0x1
899; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
900; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
901; GFX1030-NEXT:    s_waitcnt vmcnt(1)
902; GFX1030-NEXT:    v_cvt_f32_u32_e32 v9, v0
903; GFX1030-NEXT:    v_cvt_f32_u32_e32 v10, v1
904; GFX1030-NEXT:    v_cvt_f32_u32_e32 v11, v2
905; GFX1030-NEXT:    v_cvt_f32_u32_e32 v12, v3
906; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, 0, v0
907; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v9, v9
908; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v10, v10
909; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v11, v11
910; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v12, v12
911; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
912; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
913; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
914; GFX1030-NEXT:    v_mul_f32_e32 v9, s0, v9
915; GFX1030-NEXT:    v_mul_f32_e32 v10, s0, v10
916; GFX1030-NEXT:    v_mul_f32_e32 v11, s0, v11
917; GFX1030-NEXT:    v_mul_f32_e32 v12, s0, v12
918; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
919; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
920; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
921; GFX1030-NEXT:    v_cvt_u32_f32_e32 v12, v12
922; GFX1030-NEXT:    v_mul_lo_u32 v13, v13, v9
923; GFX1030-NEXT:    v_mul_lo_u32 v14, v14, v10
924; GFX1030-NEXT:    v_mul_lo_u32 v15, v15, v11
925; GFX1030-NEXT:    v_mul_lo_u32 v16, v16, v12
926; GFX1030-NEXT:    v_mul_hi_u32 v13, v9, v13
927; GFX1030-NEXT:    v_mul_hi_u32 v14, v10, v14
928; GFX1030-NEXT:    v_mul_hi_u32 v15, v11, v15
929; GFX1030-NEXT:    v_mul_hi_u32 v16, v12, v16
930; GFX1030-NEXT:    v_add_nc_u32_e32 v9, v9, v13
931; GFX1030-NEXT:    v_add_nc_u32_e32 v10, v10, v14
932; GFX1030-NEXT:    v_add_nc_u32_e32 v11, v11, v15
933; GFX1030-NEXT:    v_add_nc_u32_e32 v12, v12, v16
934; GFX1030-NEXT:    s_waitcnt vmcnt(0)
935; GFX1030-NEXT:    v_mul_hi_u32 v9, v4, v9
936; GFX1030-NEXT:    v_mul_hi_u32 v10, v5, v10
937; GFX1030-NEXT:    v_mul_hi_u32 v11, v6, v11
938; GFX1030-NEXT:    v_mul_hi_u32 v12, v7, v12
939; GFX1030-NEXT:    v_mul_lo_u32 v13, v9, v0
940; GFX1030-NEXT:    v_mul_lo_u32 v14, v10, v1
941; GFX1030-NEXT:    v_mul_lo_u32 v15, v11, v2
942; GFX1030-NEXT:    v_mul_lo_u32 v16, v12, v3
943; GFX1030-NEXT:    v_add_nc_u32_e32 v17, 1, v9
944; GFX1030-NEXT:    v_add_nc_u32_e32 v18, 1, v10
945; GFX1030-NEXT:    v_add_nc_u32_e32 v19, 1, v11
946; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v13
947; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v5, v14
948; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v6, v15
949; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v7, v16
950; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
951; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
952; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, v4, v0
953; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v5, v1
954; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, v5, v1
955; GFX1030-NEXT:    v_cmp_ge_u32_e64 s1, v6, v2
956; GFX1030-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc_lo
957; GFX1030-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
958; GFX1030-NEXT:    v_cndmask_b32_e64 v10, v10, v18, s0
959; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, v6, v2
960; GFX1030-NEXT:    v_cmp_ge_u32_e64 s2, v7, v3
961; GFX1030-NEXT:    v_add_nc_u32_e32 v14, 1, v9
962; GFX1030-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s0
963; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
964; GFX1030-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s1
965; GFX1030-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s2
966; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, v7, v3
967; GFX1030-NEXT:    v_add_nc_u32_e32 v15, 1, v10
968; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s1
969; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v9, v14, vcc_lo
970; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v5, v1
971; GFX1030-NEXT:    v_add_nc_u32_e32 v16, 1, v11
972; GFX1030-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
973; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
974; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v10, v15, vcc_lo
975; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v6, v2
976; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v11, v16, vcc_lo
977; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v7, v3
978; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc_lo
979; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
980; GFX1030-NEXT:    s_endpgm
981;
982; EG-LABEL: udiv_v4i32:
983; EG:       ; %bb.0:
984; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
985; EG-NEXT:    TEX 1 @6
986; EG-NEXT:    ALU 65, @11, KC0[CB0:0-32], KC1[]
987; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
988; EG-NEXT:    CF_END
989; EG-NEXT:    PAD
990; EG-NEXT:    Fetch clause starting at 6:
991; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
992; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
993; EG-NEXT:    ALU clause starting at 10:
994; EG-NEXT:     MOV * T0.X, KC0[2].Z,
995; EG-NEXT:    ALU clause starting at 11:
996; EG-NEXT:     SUB_INT T2.W, 0.0, T1.W,
997; EG-NEXT:     RECIP_UINT * T2.X, T1.W,
998; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
999; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
1000; EG-NEXT:     ADD_INT * T2.W, T2.X, PS,
1001; EG-NEXT:     MULHI * T2.X, T0.W, PV.W,
1002; EG-NEXT:     MULLO_INT * T2.Y, PS, T1.W,
1003; EG-NEXT:     SUB_INT T2.W, 0.0, T1.X,
1004; EG-NEXT:     RECIP_UINT * T2.Z, T1.X,
1005; EG-NEXT:     MULLO_INT * T2.W, PV.W, PS,
1006; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Y,
1007; EG-NEXT:     RECIP_UINT * T3.X, T1.Y,
1008; EG-NEXT:     MULLO_INT * T3.Y, PV.W, PS,
1009; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Z,
1010; EG-NEXT:     RECIP_UINT * T3.Z, T1.Z,
1011; EG-NEXT:     MULLO_INT * T3.W, PV.W, PS,
1012; EG-NEXT:     MULHI * T3.W, T3.Z, PS,
1013; EG-NEXT:     ADD_INT T3.W, T3.Z, PS,
1014; EG-NEXT:     MULHI * T3.Y, T3.X, T3.Y,
1015; EG-NEXT:     ADD_INT T4.W, T3.X, PS,
1016; EG-NEXT:     MULHI * T3.X, T0.Z, PV.W,
1017; EG-NEXT:     MULHI * T3.Y, T0.Y, PV.W,
1018; EG-NEXT:     MULLO_INT * T3.Z, PS, T1.Y,
1019; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1020; EG-NEXT:     MULLO_INT * T0.Y, T3.X, T1.Z,
1021; EG-NEXT:     SUB_INT T4.X, T0.Z, PS,
1022; EG-NEXT:     ADD_INT T0.Y, T3.Y, 1,
1023; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.Y,
1024; EG-NEXT:     SUB_INT T4.W, PV.W, T1.Y,
1025; EG-NEXT:     MULHI * T2.W, T2.Z, T2.W,
1026; EG-NEXT:     CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1027; EG-NEXT:     CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1028; EG-NEXT:     SETGE_UINT T0.Z, PV.X, T1.Z,
1029; EG-NEXT:     ADD_INT T2.W, T2.Z, PS,
1030; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.Y,
1031; EG-NEXT:     ADD_INT T6.X, T3.X, 1,
1032; EG-NEXT:     ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1033; EG-NEXT:     SETGE_UINT T2.Z, PS, T1.W,
1034; EG-NEXT:     SUB_INT T3.W, PS, T1.W,
1035; EG-NEXT:     MULHI * T2.W, T0.X, PV.W,
1036; EG-NEXT:     SUB_INT T7.X, T4.X, T1.Z,
1037; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1038; EG-NEXT:     CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1039; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1040; EG-NEXT:     MULLO_INT * T2.X, T2.W, T1.X,
1041; EG-NEXT:     ADD_INT T3.X, T0.W, 1,
1042; EG-NEXT:     ADD_INT T2.Y, T2.Z, 1,
1043; EG-NEXT:     SETGE_UINT T3.Z, T3.Y, T1.W,
1044; EG-NEXT:     SUB_INT T1.W, T0.X, PS, BS:VEC_201
1045; EG-NEXT:     CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1046; EG-NEXT:     SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1047; EG-NEXT:     ADD_INT T3.Y, T2.W, 1,
1048; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.X,
1049; EG-NEXT:     SUB_INT T3.W, PV.W, T1.X,
1050; EG-NEXT:     CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1051; EG-NEXT:     CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1052; EG-NEXT:     CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1053; EG-NEXT:     CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1054; EG-NEXT:     ADD_INT T0.W, T0.Y, 1,
1055; EG-NEXT:     SETGE_UINT * T1.W, T5.X, T1.Y,
1056; EG-NEXT:     CNDE_INT T4.Y, PS, T0.Y, PV.W,
1057; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
1058; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.X,
1059; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
1060; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1061; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1062  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1063  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
1064  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
1065  %result = udiv <4 x i32> %a, %b
1066  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1067  ret void
1068}
1069
; Unsigned i32 division by the constant 16 (a power of two).
; Loads one i32 from %in, divides it by 16 and stores the quotient to %out.
; The assertions below expect every tested target to lower the divide to a
; single logical right shift by 4, with no multiply or reciprocal sequence.
define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_pow2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_pow2:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_pow2:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 16
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1154
; Unsigned i32 division by the even, non-power-of-two constant 34259182.
; Loads one i32 from %in, divides it by the constant and stores the quotient
; to %out.
; The assertions below expect the divide to become a high-half multiply by
; 0xfabbd9c1 followed by a logical right shift by 25 on every tested target,
; rather than a runtime division expansion.
define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_even:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_even:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 25, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_even:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0xfabbd9c1, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_even:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    -88352319(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259182
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1248
; Unsigned i32 division by the odd constant 34259183.
; Loads one i32 from %in, divides it by the constant and stores the quotient
; to %out.
; The assertions below expect the divide to become a high-half multiply by
; 0x7d5deca3 followed by a logical right shift by 24 on every tested target,
; rather than a runtime division expansion.
define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_odd:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0x7d5deca3
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_odd:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x7d5deca3, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_odd:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    2103307427(1.843675e+37), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259183
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1342
; Unsigned i8 division with both operands loaded from memory.
; The numerator is the byte at %in and the denominator is the byte at %in + 1;
; the i8 quotient is zero-extended to i32 and stored to %out.
; The assertions below expect a floating-point expansion instead of the
; integer reciprocal sequence: both bytes are converted to f32, the numerator
; is multiplied by the reciprocal of the denominator, the product is
; truncated, a multiply-add remainder is compared against the denominator to
; add a correction of 0 or 1, and the result is masked with 0xff.
define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
; GCN-NEXT:    v_trunc_f32_e32 v4, v4
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT:    v_mad_f32 v2, -v4, v3, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i8:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_cvt_f32_ubyte1_e32 v2, v1
; GFX1030-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = udiv i8 %num, %den
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
1482
; v_udiv_i16: loads two consecutive i16 values from %in (numerator at element
; 0, denominator at element 1), performs an unsigned i16 division and stores
; the quotient, zero-extended to i32, to %out.
;
; The expected amdgcn output below does the division through f32: both
; operands are converted with v_cvt_f32_u32, the quotient estimate is
; rcp * numerator followed by trunc, a mad/fma plus a |remainder| >= denominator
; compare feeds a +1 correction through the carry-in add, and the result is
; masked with 0xffff. The redwood output follows the same shape with
; UINT_TO_FLT / RECIP_IEEE / TRUNC / MULADD_IEEE and an AND_INT with 65535.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand.
1483define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
1484; SI-LABEL: v_udiv_i16:
1485; SI:       ; %bb.0:
1486; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1487; SI-NEXT:    s_mov_b32 s7, 0xf000
1488; SI-NEXT:    s_mov_b32 s6, -1
1489; SI-NEXT:    s_mov_b32 s10, s6
1490; SI-NEXT:    s_mov_b32 s11, s7
1491; SI-NEXT:    s_waitcnt lgkmcnt(0)
1492; SI-NEXT:    s_mov_b32 s8, s2
1493; SI-NEXT:    s_mov_b32 s9, s3
1494; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1495; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1496; SI-NEXT:    s_mov_b32 s4, s0
1497; SI-NEXT:    s_mov_b32 s5, s1
1498; SI-NEXT:    s_waitcnt vmcnt(1)
1499; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1500; SI-NEXT:    s_waitcnt vmcnt(0)
1501; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1502; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1503; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1504; SI-NEXT:    v_trunc_f32_e32 v2, v2
1505; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1506; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1507; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1508; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1509; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1510; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1511; SI-NEXT:    s_endpgm
1512;
1513; VI-LABEL: v_udiv_i16:
1514; VI:       ; %bb.0:
1515; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1516; VI-NEXT:    s_mov_b32 s7, 0xf000
1517; VI-NEXT:    s_mov_b32 s6, -1
1518; VI-NEXT:    s_mov_b32 s10, s6
1519; VI-NEXT:    s_mov_b32 s11, s7
1520; VI-NEXT:    s_waitcnt lgkmcnt(0)
1521; VI-NEXT:    s_mov_b32 s8, s2
1522; VI-NEXT:    s_mov_b32 s9, s3
1523; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1524; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1525; VI-NEXT:    s_mov_b32 s4, s0
1526; VI-NEXT:    s_mov_b32 s5, s1
1527; VI-NEXT:    s_waitcnt vmcnt(1)
1528; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1529; VI-NEXT:    s_waitcnt vmcnt(0)
1530; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1531; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1532; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1533; VI-NEXT:    v_trunc_f32_e32 v2, v2
1534; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1535; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1536; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1537; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1538; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1539; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1540; VI-NEXT:    s_endpgm
1541;
1542; GCN-LABEL: v_udiv_i16:
1543; GCN:       ; %bb.0:
1544; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1545; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1546; GCN-NEXT:    s_add_u32 s4, s2, 2
1547; GCN-NEXT:    s_addc_u32 s5, s3, 0
1548; GCN-NEXT:    v_mov_b32_e32 v0, s4
1549; GCN-NEXT:    v_mov_b32_e32 v1, s5
1550; GCN-NEXT:    flat_load_ushort v2, v[0:1]
1551; GCN-NEXT:    v_mov_b32_e32 v0, s2
1552; GCN-NEXT:    v_mov_b32_e32 v1, s3
1553; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1554; GCN-NEXT:    v_mov_b32_e32 v1, s1
1555; GCN-NEXT:    s_waitcnt vmcnt(1)
1556; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1557; GCN-NEXT:    s_waitcnt vmcnt(0)
1558; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1559; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1560; GCN-NEXT:    v_mov_b32_e32 v0, s0
1561; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1562; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1563; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1564; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1565; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1566; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1567; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1568; GCN-NEXT:    flat_store_dword v[0:1], v2
1569; GCN-NEXT:    s_endpgm
1570;
1571; GFX1030-LABEL: v_udiv_i16:
1572; GFX1030:       ; %bb.0:
1573; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1574; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1575; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX1030-NEXT:    s_clause 0x1
1577; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3] offset:2
1578; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3]
1579; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1580; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1581; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1582; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1583; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1584; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1585; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1586; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1587; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1588; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1589; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1590; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1591; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1592; GFX1030-NEXT:    s_endpgm
1593;
1594; EG-LABEL: v_udiv_i16:
1595; EG:       ; %bb.0:
1596; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1597; EG-NEXT:    TEX 1 @6
1598; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1599; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1600; EG-NEXT:    CF_END
1601; EG-NEXT:    PAD
1602; EG-NEXT:    Fetch clause starting at 6:
1603; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1604; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1605; EG-NEXT:    ALU clause starting at 10:
1606; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1607; EG-NEXT:    ALU clause starting at 11:
1608; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1609; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1610; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1611; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1612; EG-NEXT:     TRUNC * T0.W, PV.W,
1613; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1614; EG-NEXT:     TRUNC * T0.W, PV.W,
1615; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1616; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1617; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1618; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1619; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1620; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1621; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1622; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
; The denominator is the i16 element right after the numerator (byte offset 2
; in the loads expected above).
1623  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
1624  %num = load i16, i16 addrspace(1) * %in
1625  %den = load i16, i16 addrspace(1) * %den_ptr
1626  %result = udiv i16 %num, %den
; Zero-extend so a full dword is stored; this is the 0xffff mask in the output.
1627  %result.ext = zext i16 %result to i32
1628  store i32 %result.ext, i32 addrspace(1)* %out
1629  ret void
1630}
1631
; v_udiv_i23: same shape as the narrower udiv tests, but with the non-power-of-
; two type i23. Loads the numerator from %in and the denominator from the next
; i23 element, divides them unsigned, and stores the quotient zero-extended to
; i32 at %out.
;
; In the expected output each i23 value is assembled from a 16-bit load plus an
; 8-bit load shifted left by 16; the denominator pieces sit at byte offsets 4
; and 6, the numerator pieces at 0 and 2. The division itself still uses the
; f32 sequence (convert, rcp, multiply, trunc, mad/fma, compare, carry-in add),
; and the result is masked with 0x7fffff (8388607 in the redwood output).
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand.
1632define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1633; SI-LABEL: v_udiv_i23:
1634; SI:       ; %bb.0:
1635; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1636; SI-NEXT:    s_mov_b32 s7, 0xf000
1637; SI-NEXT:    s_mov_b32 s6, -1
1638; SI-NEXT:    s_mov_b32 s10, s6
1639; SI-NEXT:    s_mov_b32 s11, s7
1640; SI-NEXT:    s_waitcnt lgkmcnt(0)
1641; SI-NEXT:    s_mov_b32 s8, s2
1642; SI-NEXT:    s_mov_b32 s9, s3
1643; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1644; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1645; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1646; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1647; SI-NEXT:    s_mov_b32 s4, s0
1648; SI-NEXT:    s_mov_b32 s5, s1
1649; SI-NEXT:    s_waitcnt vmcnt(3)
1650; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1651; SI-NEXT:    s_waitcnt vmcnt(2)
1652; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1653; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1654; SI-NEXT:    s_waitcnt vmcnt(1)
1655; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1656; SI-NEXT:    s_waitcnt vmcnt(0)
1657; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1658; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1659; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1660; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1661; SI-NEXT:    v_trunc_f32_e32 v2, v2
1662; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1663; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1664; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1665; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1666; SI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1667; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1668; SI-NEXT:    s_endpgm
1669;
1670; VI-LABEL: v_udiv_i23:
1671; VI:       ; %bb.0:
1672; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1673; VI-NEXT:    s_mov_b32 s7, 0xf000
1674; VI-NEXT:    s_mov_b32 s6, -1
1675; VI-NEXT:    s_mov_b32 s10, s6
1676; VI-NEXT:    s_mov_b32 s11, s7
1677; VI-NEXT:    s_waitcnt lgkmcnt(0)
1678; VI-NEXT:    s_mov_b32 s8, s2
1679; VI-NEXT:    s_mov_b32 s9, s3
1680; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1681; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1682; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1683; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1684; VI-NEXT:    s_mov_b32 s4, s0
1685; VI-NEXT:    s_mov_b32 s5, s1
1686; VI-NEXT:    s_waitcnt vmcnt(3)
1687; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1688; VI-NEXT:    s_waitcnt vmcnt(2)
1689; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1690; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1691; VI-NEXT:    s_waitcnt vmcnt(1)
1692; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1693; VI-NEXT:    s_waitcnt vmcnt(0)
1694; VI-NEXT:    v_or_b32_e32 v1, v3, v1
1695; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1696; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1697; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1698; VI-NEXT:    v_trunc_f32_e32 v2, v2
1699; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1700; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1701; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1702; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1703; VI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1704; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1705; VI-NEXT:    s_endpgm
1706;
1707; GCN-LABEL: v_udiv_i23:
1708; GCN:       ; %bb.0:
1709; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1710; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1711; GCN-NEXT:    s_add_u32 s4, s2, 4
1712; GCN-NEXT:    s_addc_u32 s5, s3, 0
1713; GCN-NEXT:    s_add_u32 s6, s2, 2
1714; GCN-NEXT:    s_addc_u32 s7, s3, 0
1715; GCN-NEXT:    v_mov_b32_e32 v0, s6
1716; GCN-NEXT:    v_mov_b32_e32 v1, s7
1717; GCN-NEXT:    s_add_u32 s6, s2, 6
1718; GCN-NEXT:    s_addc_u32 s7, s3, 0
1719; GCN-NEXT:    v_mov_b32_e32 v2, s6
1720; GCN-NEXT:    v_mov_b32_e32 v3, s7
1721; GCN-NEXT:    v_mov_b32_e32 v4, s4
1722; GCN-NEXT:    v_mov_b32_e32 v5, s5
1723; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
1724; GCN-NEXT:    flat_load_ushort v4, v[4:5]
1725; GCN-NEXT:    v_mov_b32_e32 v2, s2
1726; GCN-NEXT:    v_mov_b32_e32 v3, s3
1727; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
1728; GCN-NEXT:    flat_load_ushort v1, v[2:3]
1729; GCN-NEXT:    s_waitcnt vmcnt(3)
1730; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
1731; GCN-NEXT:    s_waitcnt vmcnt(2)
1732; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
1733; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1734; GCN-NEXT:    s_waitcnt vmcnt(1)
1735; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1736; GCN-NEXT:    s_waitcnt vmcnt(0)
1737; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
1738; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1739; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1740; GCN-NEXT:    v_mov_b32_e32 v0, s0
1741; GCN-NEXT:    v_mov_b32_e32 v1, s1
1742; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1743; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1744; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1745; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1746; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1747; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1748; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v2
1749; GCN-NEXT:    flat_store_dword v[0:1], v2
1750; GCN-NEXT:    s_endpgm
1751;
1752; GFX1030-LABEL: v_udiv_i23:
1753; GFX1030:       ; %bb.0:
1754; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1755; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1756; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1757; GFX1030-NEXT:    s_clause 0x3
1758; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1759; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1760; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1761; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1762; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1763; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1764; GFX1030-NEXT:    s_waitcnt vmcnt(2)
1765; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1766; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1767; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1768; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1769; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1770; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
1771; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1772; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1773; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1774; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1775; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1776; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1777; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1778; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1779; GFX1030-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
1780; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1781; GFX1030-NEXT:    s_endpgm
1782;
1783; EG-LABEL: v_udiv_i23:
1784; EG:       ; %bb.0:
1785; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1786; EG-NEXT:    TEX 3 @6
1787; EG-NEXT:    ALU 20, @15, KC0[CB0:0-32], KC1[]
1788; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1789; EG-NEXT:    CF_END
1790; EG-NEXT:    PAD
1791; EG-NEXT:    Fetch clause starting at 6:
1792; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1793; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1794; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1795; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1796; EG-NEXT:    ALU clause starting at 14:
1797; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1798; EG-NEXT:    ALU clause starting at 15:
1799; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1800; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1801; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1802; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1803; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1804; EG-NEXT:     UINT_TO_FLT * T0.X, PV.W,
1805; EG-NEXT:     OR_INT T0.W, T2.X, T1.W,
1806; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1807; EG-NEXT:     UINT_TO_FLT * T0.Z, PV.W,
1808; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Y,
1809; EG-NEXT:     TRUNC * T0.W, PV.W,
1810; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
1811; EG-NEXT:     TRUNC * T0.W, PV.W,
1812; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
1813; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1814; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1815; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1816; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1817; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1818; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1819; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
; The denominator is the i23 element right after the numerator; the expected
; loads above place it 4 bytes further on.
1820  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1821  %num = load i23, i23 addrspace(1) * %in
1822  %den = load i23, i23 addrspace(1) * %den_ptr
1823  %result = udiv i23 %num, %den
; Zero-extend so a full dword is stored; this is the 0x7fffff mask in the output.
1824  %result.ext = zext i23 %result to i32
1825  store i32 %result.ext, i32 addrspace(1)* %out
1826  ret void
1827}
1828
; v_udiv_i24: unsigned division of two i24 values loaded from consecutive
; elements of %in (numerator first, denominator second); the quotient is
; zero-extended to i32 and stored to %out.
;
; Unlike the i8/i16/i23 tests, the expected output here does NOT use the short
; f32 convert/rcp/trunc sequence. It uses the integer reciprocal expansion:
; rcp of the converted denominator scaled by 0x4f7ffffe, converted back to an
; integer, one mul_lo/mul_hi refinement step, a mul_hi quotient estimate, and
; two compare-and-select correction rounds. On the amdgcn targets the result is
; then masked with 0xffffff; the redwood output (RECIP_UINT / MULLO_INT / MULHI)
; ends without an AND_INT mask.
; NOTE(review): presumably the 24-bit width leaves too little headroom for the
; f32 path that i23 still gets - confirm against the AMDGPU division lowering.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand.
1829define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1830; SI-LABEL: v_udiv_i24:
1831; SI:       ; %bb.0:
1832; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1833; SI-NEXT:    s_mov_b32 s7, 0xf000
1834; SI-NEXT:    s_mov_b32 s6, -1
1835; SI-NEXT:    s_mov_b32 s10, s6
1836; SI-NEXT:    s_mov_b32 s11, s7
1837; SI-NEXT:    s_waitcnt lgkmcnt(0)
1838; SI-NEXT:    s_mov_b32 s8, s2
1839; SI-NEXT:    s_mov_b32 s9, s3
1840; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1841; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1842; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1843; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1844; SI-NEXT:    s_mov_b32 s4, s0
1845; SI-NEXT:    s_mov_b32 s5, s1
1846; SI-NEXT:    s_waitcnt vmcnt(3)
1847; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1848; SI-NEXT:    s_waitcnt vmcnt(2)
1849; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1850; SI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1851; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1852; SI-NEXT:    s_waitcnt vmcnt(1)
1853; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1854; SI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1855; SI-NEXT:    s_waitcnt vmcnt(0)
1856; SI-NEXT:    v_or_b32_e32 v2, v3, v2
1857; SI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1858; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1859; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
1860; SI-NEXT:    v_mul_hi_u32 v4, v1, v4
1861; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1862; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
1863; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
1864; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1865; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
1866; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1867; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1868; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
1869; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1870; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
1871; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1872; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1873; SI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1874; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1875; SI-NEXT:    s_endpgm
1876;
1877; VI-LABEL: v_udiv_i24:
1878; VI:       ; %bb.0:
1879; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1880; VI-NEXT:    s_mov_b32 s7, 0xf000
1881; VI-NEXT:    s_mov_b32 s6, -1
1882; VI-NEXT:    s_mov_b32 s10, s6
1883; VI-NEXT:    s_mov_b32 s11, s7
1884; VI-NEXT:    s_waitcnt lgkmcnt(0)
1885; VI-NEXT:    s_mov_b32 s8, s2
1886; VI-NEXT:    s_mov_b32 s9, s3
1887; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1888; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1889; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1890; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1891; VI-NEXT:    s_mov_b32 s4, s0
1892; VI-NEXT:    s_mov_b32 s5, s1
1893; VI-NEXT:    s_waitcnt vmcnt(3)
1894; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1895; VI-NEXT:    s_waitcnt vmcnt(2)
1896; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1897; VI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1898; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
1899; VI-NEXT:    s_waitcnt vmcnt(1)
1900; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1901; VI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1902; VI-NEXT:    s_waitcnt vmcnt(0)
1903; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1904; VI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1905; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1906; VI-NEXT:    v_mul_lo_u32 v4, v4, v1
1907; VI-NEXT:    v_mul_hi_u32 v4, v1, v4
1908; VI-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
1909; VI-NEXT:    v_mul_hi_u32 v1, v2, v1
1910; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
1911; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
1912; VI-NEXT:    v_subrev_u32_e32 v2, vcc, v3, v2
1913; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1914; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1915; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v0, v2
1916; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1917; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
1918; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1919; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1920; VI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1921; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1922; VI-NEXT:    s_endpgm
1923;
1924; GCN-LABEL: v_udiv_i24:
1925; GCN:       ; %bb.0:
1926; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1927; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1928; GCN-NEXT:    s_add_u32 s4, s2, 4
1929; GCN-NEXT:    s_addc_u32 s5, s3, 0
1930; GCN-NEXT:    s_add_u32 s6, s2, 2
1931; GCN-NEXT:    v_mov_b32_e32 v0, s4
1932; GCN-NEXT:    s_addc_u32 s7, s3, 0
1933; GCN-NEXT:    v_mov_b32_e32 v1, s5
1934; GCN-NEXT:    s_add_u32 s4, s2, 6
1935; GCN-NEXT:    s_addc_u32 s5, s3, 0
1936; GCN-NEXT:    v_mov_b32_e32 v2, s4
1937; GCN-NEXT:    v_mov_b32_e32 v3, s5
1938; GCN-NEXT:    flat_load_ubyte v4, v[2:3]
1939; GCN-NEXT:    flat_load_ushort v5, v[0:1]
1940; GCN-NEXT:    v_mov_b32_e32 v2, s6
1941; GCN-NEXT:    v_mov_b32_e32 v0, s2
1942; GCN-NEXT:    v_mov_b32_e32 v3, s7
1943; GCN-NEXT:    v_mov_b32_e32 v1, s3
1944; GCN-NEXT:    flat_load_ubyte v2, v[2:3]
1945; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1946; GCN-NEXT:    s_waitcnt vmcnt(3)
1947; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
1948; GCN-NEXT:    s_waitcnt vmcnt(2)
1949; GCN-NEXT:    v_or_b32_e32 v3, v5, v1
1950; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v3
1951; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
1952; GCN-NEXT:    s_waitcnt vmcnt(1)
1953; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1954; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1955; GCN-NEXT:    s_waitcnt vmcnt(0)
1956; GCN-NEXT:    v_or_b32_e32 v2, v0, v2
1957; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1958; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1959; GCN-NEXT:    v_mul_lo_u32 v4, v4, v1
1960; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1961; GCN-NEXT:    v_add_u32_e32 v0, vcc, v4, v1
1962; GCN-NEXT:    v_mul_hi_u32 v4, v2, v0
1963; GCN-NEXT:    v_mov_b32_e32 v0, s0
1964; GCN-NEXT:    v_mov_b32_e32 v1, s1
1965; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
1966; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
1967; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, v5, v2
1968; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
1969; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
1970; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v3, v2
1971; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1972; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
1973; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
1974; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
1975; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
1976; GCN-NEXT:    flat_store_dword v[0:1], v2
1977; GCN-NEXT:    s_endpgm
1978;
1979; GFX1030-LABEL: v_udiv_i24:
1980; GFX1030:       ; %bb.0:
1981; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1982; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1983; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1984; GFX1030-NEXT:    s_clause 0x3
1985; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1986; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1987; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1988; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1989; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1990; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1991; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1992; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1993; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1994; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1995; GFX1030-NEXT:    v_or_b32_e32 v3, v4, v3
1996; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v1
1997; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
1998; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1999; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2000; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
2001; GFX1030-NEXT:    v_mul_lo_u32 v5, v5, v2
2002; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v5
2003; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v2, v5
2004; GFX1030-NEXT:    v_mul_hi_u32 v2, v3, v2
2005; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, v1
2006; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
2007; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2008; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v3, v1
2009; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2010; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
2011; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
2012; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2013; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2014; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
2015; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
2016; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
2017; GFX1030-NEXT:    s_endpgm
2018;
2019; EG-LABEL: v_udiv_i24:
2020; EG:       ; %bb.0:
2021; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
2022; EG-NEXT:    TEX 3 @6
2023; EG-NEXT:    ALU 23, @15, KC0[CB0:0-32], KC1[]
2024; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2025; EG-NEXT:    CF_END
2026; EG-NEXT:    PAD
2027; EG-NEXT:    Fetch clause starting at 6:
2028; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
2029; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
2030; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
2031; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
2032; EG-NEXT:    ALU clause starting at 14:
2033; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2034; EG-NEXT:    ALU clause starting at 15:
2035; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
2036; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2037; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
2038; EG-NEXT:     SUB_INT T1.W, 0.0, PV.W,
2039; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2040; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
2041; EG-NEXT:     LSHL T1.W, T3.X, literal.x,
2042; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2043; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2044; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2045; EG-NEXT:     OR_INT * T1.W, T2.X, PV.W,
2046; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2047; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2048; EG-NEXT:     SUB_INT * T1.W, T1.W, PS,
2049; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2050; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
2051; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
2052; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.W, PS,
2053; EG-NEXT:     CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
2054; EG-NEXT:     ADD_INT T3.W, PS, 1,
2055; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2056; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
2057; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2058; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The denominator is the i24 element right after the numerator; the expected
; loads above place it 4 bytes further on.
2059  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
2060  %num = load i24, i24 addrspace(1) * %in
2061  %den = load i24, i24 addrspace(1) * %den_ptr
2062  %result = udiv i24 %num, %den
; Zero-extend the 24-bit quotient so a full dword is stored.
2063  %result.ext = zext i24 %result to i32
2064  store i32 %result.ext, i32 addrspace(1)* %out
2065  ret void
2066}
2067
; scalarize_mulhu_4xi32: loads one <4 x i32> from %in, divides every lane
; (unsigned) by the constant 53668 and stores the vector to %out. Note the
; argument order: %in comes first and %out second, the reverse of the scalar
; udiv tests in this file.
;
; The expected output contains no division sequence. Each of the four lanes is
; handled separately as: logical shift right by 2, multiply-high by the
; constant 0x1389c755 (printed as 327796565 in the redwood output), logical
; shift right by 10. The vector is still loaded and stored as a single
; 128-bit access.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand.
2068define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2069; SI-LABEL: scalarize_mulhu_4xi32:
2070; SI:       ; %bb.0:
2071; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2072; SI-NEXT:    s_mov_b32 s7, 0xf000
2073; SI-NEXT:    s_mov_b32 s6, -1
2074; SI-NEXT:    s_waitcnt lgkmcnt(0)
2075; SI-NEXT:    s_mov_b32 s4, s0
2076; SI-NEXT:    s_mov_b32 s5, s1
2077; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2078; SI-NEXT:    s_mov_b32 s0, 0x1389c755
2079; SI-NEXT:    s_mov_b32 s4, s2
2080; SI-NEXT:    s_mov_b32 s5, s3
2081; SI-NEXT:    s_waitcnt vmcnt(0)
2082; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2083; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2084; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2085; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2086; SI-NEXT:    v_mul_hi_u32 v0, v0, s0
2087; SI-NEXT:    v_mul_hi_u32 v1, v1, s0
2088; SI-NEXT:    v_mul_hi_u32 v2, v2, s0
2089; SI-NEXT:    v_mul_hi_u32 v3, v3, s0
2090; SI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2091; SI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2092; SI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2093; SI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2094; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2095; SI-NEXT:    s_endpgm
2096;
2097; VI-LABEL: scalarize_mulhu_4xi32:
2098; VI:       ; %bb.0:
2099; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2100; VI-NEXT:    s_mov_b32 s7, 0xf000
2101; VI-NEXT:    s_mov_b32 s6, -1
2102; VI-NEXT:    s_waitcnt lgkmcnt(0)
2103; VI-NEXT:    s_mov_b32 s4, s0
2104; VI-NEXT:    s_mov_b32 s5, s1
2105; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2106; VI-NEXT:    s_mov_b32 s0, 0x1389c755
2107; VI-NEXT:    s_mov_b32 s4, s2
2108; VI-NEXT:    s_mov_b32 s5, s3
2109; VI-NEXT:    s_waitcnt vmcnt(0)
2110; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2111; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2112; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2113; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2114; VI-NEXT:    v_mul_hi_u32 v0, v0, s0
2115; VI-NEXT:    v_mul_hi_u32 v1, v1, s0
2116; VI-NEXT:    v_mul_hi_u32 v2, v2, s0
2117; VI-NEXT:    v_mul_hi_u32 v3, v3, s0
2118; VI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2119; VI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2120; VI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2121; VI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2122; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2123; VI-NEXT:    s_endpgm
2124;
2125; GCN-LABEL: scalarize_mulhu_4xi32:
2126; GCN:       ; %bb.0:
2127; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2128; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2129; GCN-NEXT:    v_mov_b32_e32 v0, s0
2130; GCN-NEXT:    v_mov_b32_e32 v1, s1
2131; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2132; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2133; GCN-NEXT:    v_mov_b32_e32 v4, s2
2134; GCN-NEXT:    v_mov_b32_e32 v5, s3
2135; GCN-NEXT:    s_waitcnt vmcnt(0)
2136; GCN-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2137; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2138; GCN-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2139; GCN-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2140; GCN-NEXT:    v_mul_hi_u32 v0, v0, s0
2141; GCN-NEXT:    v_mul_hi_u32 v1, v1, s0
2142; GCN-NEXT:    v_mul_hi_u32 v2, v2, s0
2143; GCN-NEXT:    v_mul_hi_u32 v3, v3, s0
2144; GCN-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2145; GCN-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2146; GCN-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2147; GCN-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2148; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2149; GCN-NEXT:    s_endpgm
2150;
2151; GFX1030-LABEL: scalarize_mulhu_4xi32:
2152; GFX1030:       ; %bb.0:
2153; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2154; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
2155; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2156; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
2157; GFX1030-NEXT:    s_mov_b32 s0, 0x1389c755
2158; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2159; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2160; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2161; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2162; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2163; GFX1030-NEXT:    v_mul_hi_u32 v0, v0, s0
2164; GFX1030-NEXT:    v_mul_hi_u32 v1, v1, s0
2165; GFX1030-NEXT:    v_mul_hi_u32 v2, v2, s0
2166; GFX1030-NEXT:    v_mul_hi_u32 v3, v3, s0
2167; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2168; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2169; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2170; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2171; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
2172; GFX1030-NEXT:    s_endpgm
2173;
2174; EG-LABEL: scalarize_mulhu_4xi32:
2175; EG:       ; %bb.0:
2176; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2177; EG-NEXT:    TEX 0 @6
2178; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
2179; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2180; EG-NEXT:    CF_END
2181; EG-NEXT:    PAD
2182; EG-NEXT:    Fetch clause starting at 6:
2183; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2184; EG-NEXT:    ALU clause starting at 8:
2185; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2186; EG-NEXT:    ALU clause starting at 9:
2187; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
2188; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
2189; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2190; EG-NEXT:     MULHI * T0.Z, PV.W, literal.x,
2191; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2192; EG-NEXT:     LSHR T1.Z, T0.Y, literal.x,
2193; EG-NEXT:     LSHR T0.W, PS, literal.y,
2194; EG-NEXT:     MULHI * T0.Y, T1.W, literal.z,
2195; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
2196; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2197; EG-NEXT:     LSHR T0.Z, PS, literal.x,
2198; EG-NEXT:     LSHR T1.W, T0.X, literal.y,
2199; EG-NEXT:     MULHI * T0.X, PV.Z, literal.z,
2200; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2201; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2202; EG-NEXT:     LSHR T0.Y, PS, literal.x,
2203; EG-NEXT:     MULHI * T0.X, PV.W, literal.y,
2204; EG-NEXT:    10(1.401298e-44), 327796565(3.478022e-27)
2205; EG-NEXT:     LSHR T0.X, PS, literal.x,
2206; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
2207; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2208  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
; Splat constant divisor: every lane is divided by the same value, 53668.
2209  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2210  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
2211  ret void
2212}
2213
; test_udiv2 - unsigned divide of an i32 kernel argument by the constant 2.
; Every set of checks below expects the division to become one logical
; shift right by 1 (s_lshr_b32 on the AMDGCN targets, LSHR on r600); no
; multiply or reciprocal instruction appears in the expected output.
; The quotient is written with a volatile store to an undef addrspace(1)
; pointer.
; NOTE(review): the volatile store presumably exists only to keep the
; quotient live - confirm.
2214define amdgpu_kernel void @test_udiv2(i32 %p) {
2215; SI-LABEL: test_udiv2:
2216; SI:       ; %bb.0:
2217; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2218; SI-NEXT:    s_mov_b32 s3, 0xf000
2219; SI-NEXT:    s_mov_b32 s2, -1
2220; SI-NEXT:    s_waitcnt lgkmcnt(0)
2221; SI-NEXT:    s_lshr_b32 s0, s0, 1
2222; SI-NEXT:    v_mov_b32_e32 v0, s0
2223; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2224; SI-NEXT:    s_waitcnt vmcnt(0)
2225; SI-NEXT:    s_endpgm
2226;
2227; VI-LABEL: test_udiv2:
2228; VI:       ; %bb.0:
2229; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2230; VI-NEXT:    s_mov_b32 s3, 0xf000
2231; VI-NEXT:    s_mov_b32 s2, -1
2232; VI-NEXT:    s_waitcnt lgkmcnt(0)
2233; VI-NEXT:    s_lshr_b32 s0, s0, 1
2234; VI-NEXT:    v_mov_b32_e32 v0, s0
2235; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2236; VI-NEXT:    s_waitcnt vmcnt(0)
2237; VI-NEXT:    s_endpgm
2238;
2239; GCN-LABEL: test_udiv2:
2240; GCN:       ; %bb.0:
2241; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2242; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2243; GCN-NEXT:    s_lshr_b32 s0, s0, 1
2244; GCN-NEXT:    v_mov_b32_e32 v0, s0
2245; GCN-NEXT:    flat_store_dword v[0:1], v0
2246; GCN-NEXT:    s_waitcnt vmcnt(0)
2247; GCN-NEXT:    s_endpgm
2248;
2249; GFX1030-LABEL: test_udiv2:
2250; GFX1030:       ; %bb.0:
2251; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2252; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2253; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2254; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2255; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2256; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2257; GFX1030-NEXT:    s_endpgm
2258;
2259; EG-LABEL: test_udiv2:
2260; EG:       ; %bb.0:
2261; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2262; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2263; EG-NEXT:    CF_END
2264; EG-NEXT:    PAD
2265; EG-NEXT:    ALU clause starting at 4:
2266; EG-NEXT:     MOV T0.X, literal.x,
2267; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
2268; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2269  %i = udiv i32 %p, 2
2270  store volatile i32 %i, i32 addrspace(1)* undef
2271  ret void
2272}
2273
; test_udiv_3_mulhu - unsigned divide of an i32 kernel argument by the
; constant 3.
; Every set of checks below expects a multiply-high by 0xaaaaaaab followed
; by a logical shift right by 1; no reciprocal instruction appears.
;   * The SI, VI and GCN checks move the constant into a VGPR and use
;     v_mul_hi_u32 + v_lshrrev_b32.
;   * The GFX1030 checks stay scalar (s_mul_hi_u32 with the literal, then
;     s_lshr_b32) and copy the result into a VGPR only for the store.
;   * The EG checks use MULHI with the same 32-bit pattern printed as the
;     signed literal -1431655765, then LSHR by 1.
; The quotient is written with a volatile store to an undef addrspace(1)
; pointer.
2274define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2275; SI-LABEL: test_udiv_3_mulhu:
2276; SI:       ; %bb.0:
2277; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2278; SI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2279; SI-NEXT:    s_mov_b32 s3, 0xf000
2280; SI-NEXT:    s_mov_b32 s2, -1
2281; SI-NEXT:    s_waitcnt lgkmcnt(0)
2282; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
2283; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2284; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2285; SI-NEXT:    s_waitcnt vmcnt(0)
2286; SI-NEXT:    s_endpgm
2287;
2288; VI-LABEL: test_udiv_3_mulhu:
2289; VI:       ; %bb.0:
2290; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2291; VI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2292; VI-NEXT:    s_mov_b32 s3, 0xf000
2293; VI-NEXT:    s_mov_b32 s2, -1
2294; VI-NEXT:    s_waitcnt lgkmcnt(0)
2295; VI-NEXT:    v_mul_hi_u32 v0, s0, v0
2296; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2297; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2298; VI-NEXT:    s_waitcnt vmcnt(0)
2299; VI-NEXT:    s_endpgm
2300;
2301; GCN-LABEL: test_udiv_3_mulhu:
2302; GCN:       ; %bb.0:
2303; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2304; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2305; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2306; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
2307; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2308; GCN-NEXT:    flat_store_dword v[0:1], v0
2309; GCN-NEXT:    s_waitcnt vmcnt(0)
2310; GCN-NEXT:    s_endpgm
2311;
2312; GFX1030-LABEL: test_udiv_3_mulhu:
2313; GFX1030:       ; %bb.0:
2314; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2315; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2316; GFX1030-NEXT:    s_mul_hi_u32 s0, s0, 0xaaaaaaab
2317; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2318; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2319; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2320; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2321; GFX1030-NEXT:    s_endpgm
2322;
2323; EG-LABEL: test_udiv_3_mulhu:
2324; EG:       ; %bb.0:
2325; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2326; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2327; EG-NEXT:    CF_END
2328; EG-NEXT:    PAD
2329; EG-NEXT:    ALU clause starting at 4:
2330; EG-NEXT:     MULHI * T0.X, KC0[2].Y, literal.x,
2331; EG-NEXT:    -1431655765(-3.031649e-13), 0(0.000000e+00)
2332; EG-NEXT:     LSHR T0.X, PS, 1,
2333; EG-NEXT:     MOV * T1.X, literal.x,
2334; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2335   %i = udiv i32 %p, 3
2336   store volatile i32 %i, i32 addrspace(1)* undef
2337   ret void
2338}
2339
; fdiv_test_denormals - signed divide of two i8 values, each sign-extended
; to i32; the quotient is truncated back to i8 and stored.
; The dividend is loaded from the null addrspace(1) pointer, the divisor
; from %arg indexed by an undef offset, and the result is stored to the
; null addrspace(1) pointer.
;
; Every set of checks below expects a float-based expansion rather than an
; integer divide: both bytes are converted to f32, a reciprocal is taken
; (v_rcp_iflag_f32 on the AMDGCN targets, RECIP_IEEE on r600), the product
; is truncated, a multiply-add forms a remainder term whose magnitude is
; compared against the other converted operand, and a value derived from
; the operands' xor (arithmetic shift right by 30, or 1) is conditionally
; added to the converted quotient.
; The SI, VI and GCN checks expect v_mad_f32 for the multiply-add step,
; while the GFX1030 checks expect v_fma_f32.
; NOTE(review): the name suggests this guards f32 denormal handling in the
; expansion (the RUN lines pass -denormal-fp-math-f32 for several targets)
; - confirm the intent against the originating change.
2340define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
2341; SI-LABEL: fdiv_test_denormals:
2342; SI:       ; %bb.0: ; %bb
2343; SI-NEXT:    s_mov_b32 s0, 0
2344; SI-NEXT:    s_mov_b32 s3, 0xf000
2345; SI-NEXT:    s_mov_b32 s2, -1
2346; SI-NEXT:    s_mov_b32 s1, s0
2347; SI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2348; SI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2349; SI-NEXT:    s_waitcnt vmcnt(1)
2350; SI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2351; SI-NEXT:    s_waitcnt vmcnt(0)
2352; SI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2353; SI-NEXT:    v_xor_b32_e32 v0, v1, v0
2354; SI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2355; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2356; SI-NEXT:    v_or_b32_e32 v0, 1, v0
2357; SI-NEXT:    v_mul_f32_e32 v1, v3, v4
2358; SI-NEXT:    v_trunc_f32_e32 v1, v1
2359; SI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2360; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2361; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2362; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2363; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2364; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2365; SI-NEXT:    s_endpgm
2366;
2367; VI-LABEL: fdiv_test_denormals:
2368; VI:       ; %bb.0: ; %bb
2369; VI-NEXT:    s_mov_b32 s0, 0
2370; VI-NEXT:    s_mov_b32 s3, 0xf000
2371; VI-NEXT:    s_mov_b32 s2, -1
2372; VI-NEXT:    s_mov_b32 s1, s0
2373; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2374; VI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2375; VI-NEXT:    s_waitcnt vmcnt(1)
2376; VI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2377; VI-NEXT:    s_waitcnt vmcnt(0)
2378; VI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2379; VI-NEXT:    v_xor_b32_e32 v0, v1, v0
2380; VI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2381; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2382; VI-NEXT:    v_or_b32_e32 v0, 1, v0
2383; VI-NEXT:    v_mul_f32_e32 v1, v3, v4
2384; VI-NEXT:    v_trunc_f32_e32 v1, v1
2385; VI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2386; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2387; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2388; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2389; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
2390; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2391; VI-NEXT:    s_endpgm
2392;
2393; GCN-LABEL: fdiv_test_denormals:
2394; GCN:       ; %bb.0: ; %bb
2395; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
2396; GCN-NEXT:    v_mov_b32_e32 v0, 0
2397; GCN-NEXT:    v_mov_b32_e32 v1, 0
2398; GCN-NEXT:    flat_load_sbyte v3, v[0:1]
2399; GCN-NEXT:    s_waitcnt vmcnt(1)
2400; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v2
2401; GCN-NEXT:    s_waitcnt vmcnt(0)
2402; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v3
2403; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2404; GCN-NEXT:    v_xor_b32_e32 v2, v3, v2
2405; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2406; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
2407; GCN-NEXT:    v_mul_f32_e32 v3, v5, v6
2408; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2409; GCN-NEXT:    v_mad_f32 v5, -v3, v4, v5
2410; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2411; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2412; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
2413; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2414; GCN-NEXT:    flat_store_byte v[0:1], v2
2415; GCN-NEXT:    s_endpgm
2416;
2417; GFX1030-LABEL: fdiv_test_denormals:
2418; GFX1030:       ; %bb.0: ; %bb
2419; GFX1030-NEXT:    global_load_sbyte v2, v[0:1], off
2420; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
2421; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
2422; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
2423; GFX1030-NEXT:    s_waitcnt vmcnt(1)
2424; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
2425; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v4
2426; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2427; GFX1030-NEXT:    v_cvt_f32_i32_e32 v6, v3
2428; GFX1030-NEXT:    v_xor_b32_e32 v2, v3, v2
2429; GFX1030-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2430; GFX1030-NEXT:    v_mul_f32_e32 v5, v6, v5
2431; GFX1030-NEXT:    v_or_b32_e32 v2, 1, v2
2432; GFX1030-NEXT:    v_trunc_f32_e32 v3, v5
2433; GFX1030-NEXT:    v_fma_f32 v5, -v3, v4, v6
2434; GFX1030-NEXT:    v_cvt_i32_f32_e32 v3, v3
2435; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2436; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2437; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v3, v2
2438; GFX1030-NEXT:    global_store_byte v[0:1], v2, off
2439; GFX1030-NEXT:    s_endpgm
2440;
2441; EG-LABEL: fdiv_test_denormals:
2442; EG:       ; %bb.0: ; %bb
2443; EG-NEXT:    TEX 0 @6
2444; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
2445; EG-NEXT:    TEX 0 @8
2446; EG-NEXT:    ALU 25, @11, KC0[], KC1[]
2447; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2448; EG-NEXT:    CF_END
2449; EG-NEXT:    Fetch clause starting at 6:
2450; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
2451; EG-NEXT:    Fetch clause starting at 8:
2452; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2453; EG-NEXT:    ALU clause starting at 10:
2454; EG-NEXT:     MOV * T1.X, 0.0,
2455; EG-NEXT:    ALU clause starting at 11:
2456; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
2457; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2458; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
2459; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x,
2460; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
2461; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2462; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
2463; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
2464; EG-NEXT:     TRUNC T2.W, PV.W,
2465; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
2466; EG-NEXT:     ASHR T0.W, PS, literal.x,
2467; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2468; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
2469; EG-NEXT:     TRUNC T0.Z, T2.W,
2470; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
2471; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
2472; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
2473; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
2474; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
2475; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
2476; EG-NEXT:     MOV * T0.W, literal.x,
2477; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2478; EG-NEXT:     MOV T0.Y, 0.0,
2479; EG-NEXT:     MOV * T0.Z, 0.0,
2480; EG-NEXT:     MOV * T1.X, literal.x,
2481; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2482bb:
2483  %tmp = load i8, i8 addrspace(1)* null, align 1
2484  %tmp1 = sext i8 %tmp to i32
2485  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
2486  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
2487  %tmp4 = sext i8 %tmp3 to i32
2488  %tmp5 = sdiv i32 %tmp1, %tmp4
2489  %tmp6 = trunc i32 %tmp5 to i8
2490  store i8 %tmp6, i8 addrspace(1)* null, align 1
2491  ret void
2492}
2493
; v_test_udiv64_mulhi_fold - unsigned 64-bit divide of the argument by the
; constant 100000, in a non-kernel function (no amdgpu_kernel attribute;
; the AMDGCN checks end in s_setpc_b64 s[30:31] rather than s_endpgm).
;
; Constants that appear in the expected output:
;   0x186a0    = 100000 (the divisor)
;   0xfffe7960 = the low 32 bits of -100000
;   0x1869f    = 99999 (used in the unsigned less-than compares)
;   0x47c35000 = the f32 bit pattern of 100000.0
;
; The SI, VI and GCN checks begin with a run-time float reciprocal estimate
; of the divisor (v_madak_f32, v_rcp_f32, v_mul_f32, v_trunc_f32,
; v_cvt_u32_f32), refine it with two rounds of multiply-high / add-with-
; carry arithmetic, multiply it against the argument, and finish with
; compare/select corrections on the quotient.
; The GFX1030 checks contain no float instructions: they start from integer
; literals (0x346d900, 0x4237, 0xa9000000, 0xa7c5) and perform a single
; refinement round before the multiply against the argument.
; NOTE(review): presumably that constant-folded reciprocal is what "fold"
; in the test name refers to - confirm.
; The EG checks expect only CF_END and PAD for this function.
2494define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2495; SI-LABEL: v_test_udiv64_mulhi_fold:
2496; SI:       ; %bb.0:
2497; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498; SI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2499; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2500; SI-NEXT:    v_rcp_f32_e32 v2, v2
2501; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
2502; SI-NEXT:    v_mov_b32_e32 v8, 0
2503; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2504; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2505; SI-NEXT:    v_trunc_f32_e32 v3, v3
2506; SI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2507; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
2508; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
2509; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2510; SI-NEXT:    v_mul_lo_u32 v6, v3, s4
2511; SI-NEXT:    v_mul_lo_u32 v5, v2, s4
2512; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2513; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2514; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
2515; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
2516; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
2517; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
2518; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2519; SI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
2520; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
2521; SI-NEXT:    v_mul_lo_u32 v9, v3, v5
2522; SI-NEXT:    v_mul_hi_u32 v5, v3, v5
2523; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
2524; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
2525; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
2526; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2527; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2528; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2529; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2530; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2531; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
2532; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2533; SI-NEXT:    s_mov_b32 s4, 0x186a0
2534; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2535; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
2536; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
2537; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
2538; SI-NEXT:    v_mul_hi_u32 v9, v2, v4
2539; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
2540; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2541; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
2542; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
2543; SI-NEXT:    v_mul_lo_u32 v9, v3, v6
2544; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
2545; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
2546; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
2547; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v8, vcc
2548; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2549; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2550; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2551; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2552; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
2553; SI-NEXT:    v_mul_hi_u32 v5, v0, v2
2554; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
2555; SI-NEXT:    v_mul_hi_u32 v7, v1, v3
2556; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
2557; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2558; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2559; SI-NEXT:    v_mul_lo_u32 v6, v1, v2
2560; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
2561; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
2562; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
2563; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v8, vcc
2564; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2565; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2566; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
2567; SI-NEXT:    v_mul_hi_u32 v5, v2, s4
2568; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2569; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2570; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
2571; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
2572; SI-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
2573; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
2574; SI-NEXT:    s_mov_b32 s4, 0x1869f
2575; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
2576; SI-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
2577; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2578; SI-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
2579; SI-NEXT:    v_add_i32_e32 v5, vcc, 2, v2
2580; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
2581; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
2582; SI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2583; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
2584; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2585; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2586; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
2587; SI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2588; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
2589; SI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2590; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2591; SI-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
2592; SI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
2593; SI-NEXT:    s_setpc_b64 s[30:31]
2594;
2595; VI-LABEL: v_test_udiv64_mulhi_fold:
2596; VI:       ; %bb.0:
2597; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2598; VI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2599; VI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2600; VI-NEXT:    v_rcp_f32_e32 v2, v2
2601; VI-NEXT:    s_mov_b32 s6, 0xfffe7960
2602; VI-NEXT:    v_mov_b32_e32 v9, 0
2603; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2604; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2605; VI-NEXT:    v_trunc_f32_e32 v3, v3
2606; VI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2607; VI-NEXT:    v_cvt_u32_f32_e32 v6, v2
2608; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
2609; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2610; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2611; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2612; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2613; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
2614; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2615; VI-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
2616; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2617; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
2618; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2619; VI-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
2620; VI-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
2621; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2622; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2623; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2624; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2625; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2626; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2627; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2628; VI-NEXT:    s_mov_b32 s6, 0x186a0
2629; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2630; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2631; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2632; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
2633; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2634; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2635; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2636; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2637; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2638; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2639; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2640; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2641; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2642; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2643; VI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2644; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2645; VI-NEXT:    v_mul_hi_u32 v6, v0, v4
2646; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2647; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2648; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2649; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2650; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2651; VI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2652; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2653; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2654; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2655; VI-NEXT:    v_mul_lo_u32 v6, v5, s6
2656; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
2657; VI-NEXT:    s_mov_b32 s4, 0x1869f
2658; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2659; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2660; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2661; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
2662; VI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2663; VI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2664; VI-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2665; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2666; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2667; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2668; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2669; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2670; VI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2671; VI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2672; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2673; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2674; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2675; VI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2676; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2677; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2678; VI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2679; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2680; VI-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2681; VI-NEXT:    s_setpc_b64 s[30:31]
2682;
2683; GCN-LABEL: v_test_udiv64_mulhi_fold:
2684; GCN:       ; %bb.0:
2685; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2686; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2687; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2688; GCN-NEXT:    v_rcp_f32_e32 v2, v2
2689; GCN-NEXT:    s_mov_b32 s6, 0xfffe7960
2690; GCN-NEXT:    v_mov_b32_e32 v9, 0
2691; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2692; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2693; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2694; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2695; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v2
2696; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
2697; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2698; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2699; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2700; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2701; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
2702; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2703; GCN-NEXT:    v_add_u32_e32 v10, vcc, v5, v3
2704; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2705; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v4, vcc
2706; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2707; GCN-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
2708; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v11, v3, vcc
2709; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2710; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2711; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2712; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2713; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2714; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2715; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2716; GCN-NEXT:    s_mov_b32 s6, 0x186a0
2717; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2718; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2719; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2720; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
2721; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2722; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2723; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2724; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2725; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2726; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2727; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2728; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2729; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2730; GCN-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2731; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2732; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2733; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
2734; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2735; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2736; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2737; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2738; GCN-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2739; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2740; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2741; GCN-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2742; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2743; GCN-NEXT:    v_mul_lo_u32 v6, v5, s6
2744; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
2745; GCN-NEXT:    s_mov_b32 s4, 0x1869f
2746; GCN-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2747; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2748; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2749; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
2750; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2751; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2752; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2753; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2754; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2755; GCN-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2756; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2757; GCN-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2758; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2759; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2760; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2761; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2762; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2763; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2764; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2765; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2766; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2767; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2768; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2769; GCN-NEXT:    s_setpc_b64 s[30:31]
2770;
2771; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2772; GFX1030:       ; %bb.0:
2773; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2774; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2775; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
2776; GFX1030-NEXT:    s_mov_b32 s5, 0xfffe7960
2777; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
2778; GFX1030-NEXT:    s_addc_u32 s6, 0, 0
2779; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
2780; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
2781; GFX1030-NEXT:    s_addc_u32 s4, s6, 0xa7c5
2782; GFX1030-NEXT:    v_mul_hi_u32 v3, v2, s5
2783; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, s5
2784; GFX1030-NEXT:    s_mul_i32 s5, s4, s5
2785; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v2
2786; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v4
2787; GFX1030-NEXT:    v_mul_hi_u32 v8, s4, v4
2788; GFX1030-NEXT:    v_mul_lo_u32 v4, s4, v4
2789; GFX1030-NEXT:    v_add_nc_u32_e32 v3, s5, v3
2790; GFX1030-NEXT:    v_mul_lo_u32 v6, v2, v3
2791; GFX1030-NEXT:    v_mul_hi_u32 v7, v2, v3
2792; GFX1030-NEXT:    v_mul_hi_u32 v9, s4, v3
2793; GFX1030-NEXT:    v_mul_lo_u32 v3, s4, v3
2794; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v6
2795; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo
2796; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, v4
2797; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
2798; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
2799; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, v3
2800; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
2801; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v3
2802; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo
2803; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v5
2804; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], s4, v1, v5, 0
2805; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, v0, v6, 0
2806; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], s4, v1, v6, 0
2807; GFX1030-NEXT:    s_mov_b32 s4, 0x186a0
2808; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
2809; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2810; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2811; GFX1030-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
2812; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
2813; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
2814; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
2815; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s5, v4, s4, 0
2816; GFX1030-NEXT:    v_mul_lo_u32 v6, v5, s4
2817; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2818; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v6
2819; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2820; GFX1030-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s4
2821; GFX1030-NEXT:    s_mov_b32 s4, 0x1869f
2822; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
2823; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v2
2824; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
2825; GFX1030-NEXT:    v_add_co_u32 v6, vcc_lo, v4, 2
2826; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
2827; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v0
2828; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
2829; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2830; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2831; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4
2832; GFX1030-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
2833; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, 1
2834; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v5, vcc_lo
2835; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
2836; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
2837; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc_lo
2838; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2839; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
2840; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
2841; GFX1030-NEXT:    s_setpc_b64 s[30:31]
2842;
2843; EG-LABEL: v_test_udiv64_mulhi_fold:
2844; EG:       ; %bb.0:
2845; EG-NEXT:    CF_END
2846; EG-NEXT:    PAD
2847  %d = udiv i64 %arg, 100000
2848  ret i64 %d
2849}
2850