1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
7
; udiv_i32: unsigned 32-bit division where both operands are loaded through
; the addrspace(1) pointer %in. %a is read from %in itself and %b from the i32
; element immediately after it (getelementptr index 1); the quotient %a / %b is
; stored through %out.
; The SI/VI/GCN/GFX1030/EG check blocks are autogenerated (see the NOTE at the
; top of the file); each prefix corresponds to one of the RUN lines.
8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
9; SI-LABEL: udiv_i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_u32_e32 v2, v1
24; SI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
25; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
26; SI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
27; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
28; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
29; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
30; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
31; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
32; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
33; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
34; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v3, v0
35; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
36; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
37; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v1, v0
38; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
39; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
40; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
41; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
42; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: udiv_i32:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
48; VI-NEXT:    s_mov_b32 s7, 0xf000
49; VI-NEXT:    s_mov_b32 s6, -1
50; VI-NEXT:    s_mov_b32 s10, s6
51; VI-NEXT:    s_mov_b32 s11, s7
52; VI-NEXT:    s_waitcnt lgkmcnt(0)
53; VI-NEXT:    s_mov_b32 s8, s2
54; VI-NEXT:    s_mov_b32 s9, s3
55; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56; VI-NEXT:    s_mov_b32 s4, s0
57; VI-NEXT:    s_mov_b32 s5, s1
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    v_cvt_f32_u32_e32 v2, v1
60; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
61; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
62; VI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
63; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
64; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
65; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
66; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
67; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
68; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
69; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
70; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v3, v0
71; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
72; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
73; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v1, v0
74; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
75; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
76; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
77; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
78; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; VI-NEXT:    s_endpgm
80;
81; GCN-LABEL: udiv_i32:
82; GCN:       ; %bb.0:
83; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
84; GCN-NEXT:    s_waitcnt lgkmcnt(0)
85; GCN-NEXT:    v_mov_b32_e32 v0, s2
86; GCN-NEXT:    v_mov_b32_e32 v1, s3
87; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
88; GCN-NEXT:    s_waitcnt vmcnt(0)
89; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
90; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
91; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
92; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
93; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
94; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
95; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
96; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
97; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
98; GCN-NEXT:    v_mov_b32_e32 v2, s0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
101; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
102; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v0
103; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
104; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
105; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v1, v0
106; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
107; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
108; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
109; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
110; GCN-NEXT:    flat_store_dword v[2:3], v0
111; GCN-NEXT:    s_endpgm
112;
113; GFX1030-LABEL: udiv_i32:
114; GFX1030:       ; %bb.0:
115; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
117; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
119; GFX1030-NEXT:    s_waitcnt vmcnt(0)
120; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, v1
121; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, 0, v1
122; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
123; GFX1030-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
124; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
125; GFX1030-NEXT:    v_mul_lo_u32 v4, v4, v3
126; GFX1030-NEXT:    v_mul_hi_u32 v4, v3, v4
127; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v4
128; GFX1030-NEXT:    v_mul_hi_u32 v3, v0, v3
129; GFX1030-NEXT:    v_mul_lo_u32 v4, v3, v1
130; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v4
131; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
132; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v0, v1
133; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
134; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
135; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
136; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
137; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
138; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
139; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
140; GFX1030-NEXT:    s_endpgm
141;
142; EG-LABEL: udiv_i32:
143; EG:       ; %bb.0:
144; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
145; EG-NEXT:    TEX 0 @6
146; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
148; EG-NEXT:    CF_END
149; EG-NEXT:    PAD
150; EG-NEXT:    Fetch clause starting at 6:
151; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
152; EG-NEXT:    ALU clause starting at 8:
153; EG-NEXT:     MOV * T0.X, KC0[2].Z,
154; EG-NEXT:    ALU clause starting at 9:
155; EG-NEXT:     SUB_INT T0.W, 0.0, T0.Y,
156; EG-NEXT:     RECIP_UINT * T0.Z, T0.Y,
157; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
158; EG-NEXT:     MULHI * T0.W, T0.Z, PS,
159; EG-NEXT:     ADD_INT * T0.W, T0.Z, PS,
160; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
161; EG-NEXT:     MULLO_INT * T0.W, PS, T0.Y,
162; EG-NEXT:     SUB_INT * T0.W, T0.X, PS,
163; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
164; EG-NEXT:     SETGE_UINT T1.W, PV.W, T0.Y,
165; EG-NEXT:     SUB_INT * T2.W, PV.W, T0.Y,
166; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.W, PS,
167; EG-NEXT:     CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
168; EG-NEXT:     ADD_INT T2.W, PS, 1,
169; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.Y,
170; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
171; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
172; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: the divisor %b is the i32 that follows the dividend %a at %in.
173  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
174  %a = load i32, i32 addrspace(1)* %in
175  %b = load i32, i32 addrspace(1)* %b_ptr
176  %result = udiv i32 %a, %b
177  store i32 %result, i32 addrspace(1)* %out
178  ret void
179}
180
; s_udiv_i32: unsigned 32-bit division where the dividend %a and divisor %b are
; passed directly as kernel arguments rather than loaded through a pointer; the
; quotient %a / %b is stored through %out.
; The SI/VI/GCN/GFX1030/EG check blocks are autogenerated (see the NOTE at the
; top of the file); each prefix corresponds to one of the RUN lines.
181define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
182; SI-LABEL: s_udiv_i32:
183; SI:       ; %bb.0:
184; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
185; SI-NEXT:    s_mov_b32 s7, 0xf000
186; SI-NEXT:    s_mov_b32 s6, -1
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    v_cvt_f32_u32_e32 v0, s3
189; SI-NEXT:    s_sub_i32 s4, 0, s3
190; SI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
191; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
192; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
193; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
195; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
196; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
197; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
198; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
199; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
200; SI-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
201; SI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
202; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
203; SI-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
204; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
205; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
206; SI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
207; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
208; SI-NEXT:    s_waitcnt lgkmcnt(0)
209; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
210; SI-NEXT:    s_endpgm
211;
212; VI-LABEL: s_udiv_i32:
213; VI:       ; %bb.0:
214; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
215; VI-NEXT:    s_mov_b32 s7, 0xf000
216; VI-NEXT:    s_mov_b32 s6, -1
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
219; VI-NEXT:    s_sub_i32 s4, 0, s3
220; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
221; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
222; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
223; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
224; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
225; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
226; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
227; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
228; VI-NEXT:    v_mul_lo_u32 v1, v0, s3
229; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
230; VI-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
231; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
232; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
233; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
234; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
235; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
236; VI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
237; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
238; VI-NEXT:    s_waitcnt lgkmcnt(0)
239; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
240; VI-NEXT:    s_endpgm
241;
242; GCN-LABEL: s_udiv_i32:
243; GCN:       ; %bb.0:
244; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
245; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
246; GCN-NEXT:    s_waitcnt lgkmcnt(0)
247; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
248; GCN-NEXT:    s_sub_i32 s0, 0, s3
249; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
250; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
251; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
252; GCN-NEXT:    v_mul_lo_u32 v1, s0, v0
253; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
254; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
255; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
256; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
257; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
258; GCN-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
259; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
260; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
261; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
262; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
263; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
264; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
265; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
266; GCN-NEXT:    v_mov_b32_e32 v0, s4
267; GCN-NEXT:    v_mov_b32_e32 v1, s5
268; GCN-NEXT:    flat_store_dword v[0:1], v2
269; GCN-NEXT:    s_endpgm
270;
271; GFX1030-LABEL: s_udiv_i32:
272; GFX1030:       ; %bb.0:
273; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
274; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s1
276; GFX1030-NEXT:    s_sub_i32 s2, 0, s1
277; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
278; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
279; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
280; GFX1030-NEXT:    v_mul_lo_u32 v1, s2, v0
281; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
282; GFX1030-NEXT:    v_mul_hi_u32 v1, v0, v1
283; GFX1030-NEXT:    v_add_nc_u32_e32 v0, v0, v1
284; GFX1030-NEXT:    v_mul_hi_u32 v0, s0, v0
285; GFX1030-NEXT:    v_mul_lo_u32 v1, v0, s1
286; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
287; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
288; GFX1030-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
289; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
290; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
291; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
292; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
293; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
294; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
295; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
296; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1030-NEXT:    global_store_dword v3, v0, s[2:3]
298; GFX1030-NEXT:    s_endpgm
299;
300; EG-LABEL: s_udiv_i32:
301; EG:       ; %bb.0:
302; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
303; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
304; EG-NEXT:    CF_END
305; EG-NEXT:    PAD
306; EG-NEXT:    ALU clause starting at 4:
307; EG-NEXT:     SUB_INT T0.W, 0.0, KC0[2].W,
308; EG-NEXT:     RECIP_UINT * T0.X, KC0[2].W,
309; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
310; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
311; EG-NEXT:     ADD_INT * T0.W, T0.X, PS,
312; EG-NEXT:     MULHI * T0.X, KC0[2].Z, PV.W,
313; EG-NEXT:     MULLO_INT * T0.Y, PS, KC0[2].W,
314; EG-NEXT:     SUB_INT * T0.W, KC0[2].Z, PS,
315; EG-NEXT:     SUB_INT T0.Z, PV.W, KC0[2].W,
316; EG-NEXT:     SETGE_UINT T1.W, PV.W, KC0[2].W,
317; EG-NEXT:     ADD_INT * T2.W, T0.X, 1,
318; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.X, PS,
319; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
320; EG-NEXT:     SETGE_UINT T0.W, PS, KC0[2].W,
321; EG-NEXT:     ADD_INT * T1.W, PV.W, 1,
322; EG-NEXT:     CNDE_INT T0.X, PV.W, T2.W, PS,
323; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: divide the two kernel arguments and store the quotient.
325  %result = udiv i32 %a, %b
326  store i32 %result, i32 addrspace(1)* %out
327  ret void
328}
329
330
331; The code generated by udiv is long and complex and may change frequently.
332; The goal of the vector tests below is to make sure that ISel doesn't fail
333; when it gets a v2i32 or v4i32 udiv.
; udiv_v2i32: element-wise unsigned division of two <2 x i32> vectors loaded
; through the addrspace(1) pointer %in. %a is read from %in itself and %b from
; the <2 x i32> element immediately after it (getelementptr index 1); the
; quotient vector is stored through %out.
; The SI/VI/GCN/GFX1030/EG check blocks are autogenerated (see the NOTE at the
; top of the file); each prefix corresponds to one of the RUN lines.
334define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
335; SI-LABEL: udiv_v2i32:
336; SI:       ; %bb.0:
337; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
338; SI-NEXT:    s_mov_b32 s7, 0xf000
339; SI-NEXT:    s_mov_b32 s6, -1
340; SI-NEXT:    s_mov_b32 s10, s6
341; SI-NEXT:    s_mov_b32 s11, s7
342; SI-NEXT:    s_waitcnt lgkmcnt(0)
343; SI-NEXT:    s_mov_b32 s8, s2
344; SI-NEXT:    s_mov_b32 s9, s3
345; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
346; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
347; SI-NEXT:    s_mov_b32 s4, s0
348; SI-NEXT:    s_mov_b32 s5, s1
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    v_cvt_f32_u32_e32 v4, v2
351; SI-NEXT:    v_cvt_f32_u32_e32 v5, v3
352; SI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
353; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
354; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
355; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
356; SI-NEXT:    v_mul_f32_e32 v4, s2, v4
357; SI-NEXT:    v_mul_f32_e32 v5, s2, v5
358; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
359; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
360; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
361; SI-NEXT:    v_mul_lo_u32 v7, v7, v5
362; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
363; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
364; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
365; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
366; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
367; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
368; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
369; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
370; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
371; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
372; SI-NEXT:    v_subrev_i32_e32 v1, vcc, v8, v1
373; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
374; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
375; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
376; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
377; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
378; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
379; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
380; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
381; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
382; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
383; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
384; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
385; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
386; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
387; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
388; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
389; SI-NEXT:    s_endpgm
390;
391; VI-LABEL: udiv_v2i32:
392; VI:       ; %bb.0:
393; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
394; VI-NEXT:    s_mov_b32 s7, 0xf000
395; VI-NEXT:    s_mov_b32 s6, -1
396; VI-NEXT:    s_mov_b32 s10, s6
397; VI-NEXT:    s_mov_b32 s11, s7
398; VI-NEXT:    s_waitcnt lgkmcnt(0)
399; VI-NEXT:    s_mov_b32 s8, s2
400; VI-NEXT:    s_mov_b32 s9, s3
401; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
402; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
403; VI-NEXT:    s_mov_b32 s4, s0
404; VI-NEXT:    s_mov_b32 s5, s1
405; VI-NEXT:    s_waitcnt vmcnt(0)
406; VI-NEXT:    v_cvt_f32_u32_e32 v4, v2
407; VI-NEXT:    v_cvt_f32_u32_e32 v5, v3
408; VI-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
409; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
410; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
411; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
412; VI-NEXT:    v_mul_f32_e32 v4, s2, v4
413; VI-NEXT:    v_mul_f32_e32 v5, s2, v5
414; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
415; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
416; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
417; VI-NEXT:    v_mul_lo_u32 v7, v7, v5
418; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
419; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
420; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
421; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
422; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
423; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
424; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
425; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
426; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
427; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
428; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
429; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
430; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
431; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
432; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
433; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
434; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
435; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
436; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
437; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
438; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
439; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
440; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
441; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
442; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
443; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
444; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
445; VI-NEXT:    s_endpgm
446;
447; GCN-LABEL: udiv_v2i32:
448; GCN:       ; %bb.0:
449; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
450; GCN-NEXT:    s_waitcnt lgkmcnt(0)
451; GCN-NEXT:    v_mov_b32_e32 v0, s2
452; GCN-NEXT:    v_mov_b32_e32 v1, s3
453; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
454; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
455; GCN-NEXT:    s_waitcnt vmcnt(0)
456; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
457; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
458; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
459; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
460; GCN-NEXT:    v_mul_f32_e32 v4, s2, v4
461; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
462; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
463; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
464; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
465; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
466; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
467; GCN-NEXT:    v_mul_lo_u32 v8, v4, v7
468; GCN-NEXT:    v_mul_hi_u32 v9, v6, v5
469; GCN-NEXT:    v_mov_b32_e32 v4, s0
470; GCN-NEXT:    v_mov_b32_e32 v5, s1
471; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
472; GCN-NEXT:    v_add_u32_e32 v6, vcc, v9, v6
473; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
474; GCN-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
475; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
476; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
477; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
478; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
479; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
480; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
481; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
482; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
483; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
484; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
485; GCN-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
486; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
487; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
488; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
489; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
490; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
491; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v7
492; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
493; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
494; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
495; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
496; GCN-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
497; GCN-NEXT:    s_endpgm
498;
499; GFX1030-LABEL: udiv_v2i32:
500; GFX1030:       ; %bb.0:
501; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
502; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
503; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
504; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
506; GFX1030-NEXT:    s_waitcnt vmcnt(0)
507; GFX1030-NEXT:    v_cvt_f32_u32_e32 v5, v2
508; GFX1030-NEXT:    v_cvt_f32_u32_e32 v6, v3
509; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, 0, v2
510; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
511; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
512; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
513; GFX1030-NEXT:    v_mul_f32_e32 v5, s0, v5
514; GFX1030-NEXT:    v_mul_f32_e32 v6, s0, v6
515; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
516; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
517; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
518; GFX1030-NEXT:    v_mul_lo_u32 v8, v8, v6
519; GFX1030-NEXT:    v_mul_hi_u32 v7, v5, v7
520; GFX1030-NEXT:    v_mul_hi_u32 v8, v6, v8
521; GFX1030-NEXT:    v_add_nc_u32_e32 v5, v5, v7
522; GFX1030-NEXT:    v_add_nc_u32_e32 v6, v6, v8
523; GFX1030-NEXT:    v_mul_hi_u32 v5, v0, v5
524; GFX1030-NEXT:    v_mul_hi_u32 v6, v1, v6
525; GFX1030-NEXT:    v_mul_lo_u32 v7, v5, v2
526; GFX1030-NEXT:    v_mul_lo_u32 v8, v6, v3
527; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
528; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
529; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, v1, v8
530; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
531; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
532; GFX1030-NEXT:    v_sub_nc_u32_e32 v9, v1, v3
533; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v1, v3
534; GFX1030-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
535; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v0, v2
536; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s0
537; GFX1030-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
538; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
539; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
540; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
541; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
542; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc_lo
543; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v1, v3
544; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc_lo
545; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
546; GFX1030-NEXT:    s_endpgm
547;
548; EG-LABEL: udiv_v2i32:
549; EG:       ; %bb.0:
550; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
551; EG-NEXT:    TEX 1 @6
552; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
553; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
554; EG-NEXT:    CF_END
555; EG-NEXT:    PAD
556; EG-NEXT:    Fetch clause starting at 6:
557; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
558; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
559; EG-NEXT:    ALU clause starting at 10:
560; EG-NEXT:     MOV * T0.X, KC0[2].Z,
561; EG-NEXT:    ALU clause starting at 11:
562; EG-NEXT:     SUB_INT T0.W, 0.0, T1.Y,
563; EG-NEXT:     RECIP_UINT * T0.Z, T1.Y,
564; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
565; EG-NEXT:     SUB_INT T1.W, 0.0, T1.X,
566; EG-NEXT:     RECIP_UINT * T1.Z, T1.X,
567; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
568; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
569; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
570; EG-NEXT:     MULHI * T0.W, T0.Z, T0.W,
571; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
572; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
573; EG-NEXT:     MULHI * T0.W, T0.Y, PV.W,
574; EG-NEXT:     MULLO_INT * T1.Z, PS, T1.Y,
575; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
576; EG-NEXT:     MULLO_INT * T0.Y, T0.Z, T1.X,
577; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
578; EG-NEXT:     ADD_INT T1.Z, T0.W, 1,
579; EG-NEXT:     SETGE_UINT T2.W, PV.W, T1.Y,
580; EG-NEXT:     SUB_INT * T3.W, PV.W, T1.Y,
581; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
582; EG-NEXT:     CNDE_INT T2.Y, PV.W, T0.W, PV.Z, BS:VEC_021/SCL_122
583; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
584; EG-NEXT:     SETGE_UINT T0.W, PV.Y, T1.X,
585; EG-NEXT:     SUB_INT * T1.W, PV.Y, T1.X,
586; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS, BS:VEC_021/SCL_122
587; EG-NEXT:     CNDE_INT T0.Z, PV.W, T0.Z, PV.Z,
588; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
589; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.Y,
590; EG-NEXT:     CNDE_INT T1.Y, PS, T2.Y, PV.W,
591; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
592; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T1.X,
593; EG-NEXT:     CNDE_INT T1.X, PS, T0.Z, PV.W,
594; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
595; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: the divisor vector %b is the <2 x i32> that follows the
; dividend vector %a at %in.
596  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
597  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
598  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
599  %result = udiv <2 x i32> %a, %b
600  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
601  ret void
602}
603
604define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
605; SI-LABEL: udiv_v4i32:
606; SI:       ; %bb.0:
607; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
608; SI-NEXT:    s_mov_b32 s11, 0xf000
609; SI-NEXT:    s_mov_b32 s10, -1
610; SI-NEXT:    s_mov_b32 s6, s10
611; SI-NEXT:    s_mov_b32 s7, s11
612; SI-NEXT:    s_waitcnt lgkmcnt(0)
613; SI-NEXT:    s_mov_b32 s4, s2
614; SI-NEXT:    s_mov_b32 s5, s3
615; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
616; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
617; SI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
618; SI-NEXT:    s_mov_b32 s8, s0
619; SI-NEXT:    s_mov_b32 s9, s1
620; SI-NEXT:    s_waitcnt vmcnt(1)
621; SI-NEXT:    v_cvt_f32_u32_e32 v8, v0
622; SI-NEXT:    v_cvt_f32_u32_e32 v10, v1
623; SI-NEXT:    v_cvt_f32_u32_e32 v12, v2
624; SI-NEXT:    v_cvt_f32_u32_e32 v14, v3
625; SI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
626; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
627; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
628; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
629; SI-NEXT:    v_mul_f32_e32 v8, s2, v8
630; SI-NEXT:    v_mul_f32_e32 v10, s2, v10
631; SI-NEXT:    v_mul_f32_e32 v12, s2, v12
632; SI-NEXT:    v_mul_f32_e32 v14, s2, v14
633; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
634; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
635; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
636; SI-NEXT:    v_cvt_u32_f32_e32 v14, v14
637; SI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
638; SI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
639; SI-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
640; SI-NEXT:    v_sub_i32_e32 v15, vcc, 0, v3
641; SI-NEXT:    v_mul_lo_u32 v9, v9, v8
642; SI-NEXT:    v_mul_lo_u32 v11, v11, v10
643; SI-NEXT:    v_mul_lo_u32 v13, v13, v12
644; SI-NEXT:    v_mul_lo_u32 v15, v15, v14
645; SI-NEXT:    v_mul_hi_u32 v9, v8, v9
646; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
647; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
648; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
649; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
650; SI-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
651; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v12
652; SI-NEXT:    v_add_i32_e32 v11, vcc, v15, v14
653; SI-NEXT:    s_waitcnt vmcnt(0)
654; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
655; SI-NEXT:    v_mul_hi_u32 v9, v5, v9
656; SI-NEXT:    v_mul_hi_u32 v10, v6, v10
657; SI-NEXT:    v_mul_hi_u32 v11, v7, v11
658; SI-NEXT:    v_mul_lo_u32 v12, v8, v0
659; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
660; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
661; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
662; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
663; SI-NEXT:    v_subrev_i32_e32 v5, vcc, v14, v5
664; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v16, v6
665; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v18, v7
666; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
667; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
668; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
669; SI-NEXT:    v_add_i32_e32 v19, vcc, 1, v11
670; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
671; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
672; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
673; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
674; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
675; SI-NEXT:    v_subrev_i32_e32 v12, vcc, v0, v4
676; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
677; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
678; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
679; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
680; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
681; SI-NEXT:    v_subrev_i32_e32 v15, vcc, v3, v7
682; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
683; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
684; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
685; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
686; SI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
687; SI-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
688; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
689; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v11
690; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
691; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
692; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
693; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
694; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
695; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
696; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
697; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
698; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
699; SI-NEXT:    s_endpgm
700;
701; VI-LABEL: udiv_v4i32:
702; VI:       ; %bb.0:
703; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
704; VI-NEXT:    s_mov_b32 s11, 0xf000
705; VI-NEXT:    s_mov_b32 s10, -1
706; VI-NEXT:    s_mov_b32 s6, s10
707; VI-NEXT:    s_mov_b32 s7, s11
708; VI-NEXT:    s_waitcnt lgkmcnt(0)
709; VI-NEXT:    s_mov_b32 s4, s2
710; VI-NEXT:    s_mov_b32 s5, s3
711; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
712; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
713; VI-NEXT:    s_mov_b32 s2, 0x4f7ffffe
714; VI-NEXT:    s_mov_b32 s8, s0
715; VI-NEXT:    s_mov_b32 s9, s1
716; VI-NEXT:    s_waitcnt vmcnt(1)
717; VI-NEXT:    v_cvt_f32_u32_e32 v8, v0
718; VI-NEXT:    v_cvt_f32_u32_e32 v10, v1
719; VI-NEXT:    v_cvt_f32_u32_e32 v12, v2
720; VI-NEXT:    v_cvt_f32_u32_e32 v14, v3
721; VI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
722; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
723; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
724; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
725; VI-NEXT:    v_mul_f32_e32 v8, s2, v8
726; VI-NEXT:    v_mul_f32_e32 v10, s2, v10
727; VI-NEXT:    v_mul_f32_e32 v12, s2, v12
728; VI-NEXT:    v_mul_f32_e32 v14, s2, v14
729; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
730; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
731; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
732; VI-NEXT:    v_cvt_u32_f32_e32 v14, v14
733; VI-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
734; VI-NEXT:    v_sub_u32_e32 v11, vcc, 0, v1
735; VI-NEXT:    v_sub_u32_e32 v13, vcc, 0, v2
736; VI-NEXT:    v_sub_u32_e32 v15, vcc, 0, v3
737; VI-NEXT:    v_mul_lo_u32 v9, v9, v8
738; VI-NEXT:    v_mul_lo_u32 v11, v11, v10
739; VI-NEXT:    v_mul_lo_u32 v13, v13, v12
740; VI-NEXT:    v_mul_lo_u32 v15, v15, v14
741; VI-NEXT:    v_mul_hi_u32 v9, v8, v9
742; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
743; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
744; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
745; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
746; VI-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
747; VI-NEXT:    v_add_u32_e32 v10, vcc, v13, v12
748; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
749; VI-NEXT:    s_waitcnt vmcnt(0)
750; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
751; VI-NEXT:    v_mul_hi_u32 v9, v5, v9
752; VI-NEXT:    v_mul_hi_u32 v10, v6, v10
753; VI-NEXT:    v_mul_hi_u32 v11, v7, v11
754; VI-NEXT:    v_mul_lo_u32 v12, v8, v0
755; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
756; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
757; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
758; VI-NEXT:    v_subrev_u32_e32 v4, vcc, v12, v4
759; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
760; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
761; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v18, v7
762; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
763; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
764; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
765; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v11
766; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
767; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
768; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
769; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
770; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
771; VI-NEXT:    v_subrev_u32_e32 v12, vcc, v0, v4
772; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
773; VI-NEXT:    v_subrev_u32_e32 v13, vcc, v1, v5
774; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
775; VI-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
776; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
777; VI-NEXT:    v_subrev_u32_e32 v15, vcc, v3, v7
778; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
779; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
780; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
781; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
782; VI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
783; VI-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
784; VI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
785; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
786; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
787; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
788; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
789; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
790; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
791; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
792; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
793; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
794; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
795; VI-NEXT:    s_endpgm
796;
797; GCN-LABEL: udiv_v4i32:
798; GCN:       ; %bb.0:
799; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
800; GCN-NEXT:    s_waitcnt lgkmcnt(0)
801; GCN-NEXT:    s_add_u32 s4, s2, 16
802; GCN-NEXT:    s_addc_u32 s5, s3, 0
803; GCN-NEXT:    v_mov_b32_e32 v0, s4
804; GCN-NEXT:    v_mov_b32_e32 v1, s5
805; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
806; GCN-NEXT:    v_mov_b32_e32 v5, s3
807; GCN-NEXT:    v_mov_b32_e32 v4, s2
808; GCN-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
809; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
810; GCN-NEXT:    v_mov_b32_e32 v8, s0
811; GCN-NEXT:    v_mov_b32_e32 v9, s1
812; GCN-NEXT:    s_waitcnt vmcnt(1)
813; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v0
814; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v1
815; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v2
816; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v3
817; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
818; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
819; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
820; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
821; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
822; GCN-NEXT:    v_mul_f32_e32 v12, s2, v12
823; GCN-NEXT:    v_mul_f32_e32 v14, s2, v14
824; GCN-NEXT:    v_mul_f32_e32 v16, s2, v16
825; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
826; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
827; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
828; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
829; GCN-NEXT:    v_sub_u32_e32 v11, vcc, 0, v0
830; GCN-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
831; GCN-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
832; GCN-NEXT:    v_sub_u32_e32 v17, vcc, 0, v3
833; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
834; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
835; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
836; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
837; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
838; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
839; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
840; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
841; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
842; GCN-NEXT:    v_add_u32_e32 v11, vcc, v13, v12
843; GCN-NEXT:    v_add_u32_e32 v12, vcc, v15, v14
844; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
845; GCN-NEXT:    s_waitcnt vmcnt(0)
846; GCN-NEXT:    v_mul_hi_u32 v10, v4, v10
847; GCN-NEXT:    v_mul_hi_u32 v11, v5, v11
848; GCN-NEXT:    v_mul_hi_u32 v12, v6, v12
849; GCN-NEXT:    v_mul_hi_u32 v13, v7, v13
850; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
851; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
852; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
853; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
854; GCN-NEXT:    v_subrev_u32_e32 v4, vcc, v14, v4
855; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
856; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
857; GCN-NEXT:    v_subrev_u32_e32 v7, vcc, v19, v7
858; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
859; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
860; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
861; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
862; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
863; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
864; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
865; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
866; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
867; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v0, v4
868; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
869; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v1, v5
870; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
871; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
872; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
873; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v3, v7
874; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v15, s[0:1]
875; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
876; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[2:3]
877; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
878; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
879; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
880; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[6:7]
881; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
882; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
883; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v15, vcc
884; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
885; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v17, vcc
886; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
887; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc
888; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
889; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc
890; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
891; GCN-NEXT:    s_endpgm
892;
893; GFX1030-LABEL: udiv_v4i32:
894; GFX1030:       ; %bb.0:
895; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
896; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
897; GFX1030-NEXT:    s_mov_b32 s0, 0x4f7ffffe
898; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX1030-NEXT:    s_clause 0x1
900; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
901; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
902; GFX1030-NEXT:    s_waitcnt vmcnt(1)
903; GFX1030-NEXT:    v_cvt_f32_u32_e32 v9, v0
904; GFX1030-NEXT:    v_cvt_f32_u32_e32 v10, v1
905; GFX1030-NEXT:    v_cvt_f32_u32_e32 v11, v2
906; GFX1030-NEXT:    v_cvt_f32_u32_e32 v12, v3
907; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, 0, v0
908; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v9, v9
909; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v10, v10
910; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v11, v11
911; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v12, v12
912; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
913; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
914; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
915; GFX1030-NEXT:    v_mul_f32_e32 v9, s0, v9
916; GFX1030-NEXT:    v_mul_f32_e32 v10, s0, v10
917; GFX1030-NEXT:    v_mul_f32_e32 v11, s0, v11
918; GFX1030-NEXT:    v_mul_f32_e32 v12, s0, v12
919; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
920; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
921; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
922; GFX1030-NEXT:    v_cvt_u32_f32_e32 v12, v12
923; GFX1030-NEXT:    v_mul_lo_u32 v13, v13, v9
924; GFX1030-NEXT:    v_mul_lo_u32 v14, v14, v10
925; GFX1030-NEXT:    v_mul_lo_u32 v15, v15, v11
926; GFX1030-NEXT:    v_mul_lo_u32 v16, v16, v12
927; GFX1030-NEXT:    v_mul_hi_u32 v13, v9, v13
928; GFX1030-NEXT:    v_mul_hi_u32 v14, v10, v14
929; GFX1030-NEXT:    v_mul_hi_u32 v15, v11, v15
930; GFX1030-NEXT:    v_mul_hi_u32 v16, v12, v16
931; GFX1030-NEXT:    v_add_nc_u32_e32 v9, v9, v13
932; GFX1030-NEXT:    v_add_nc_u32_e32 v10, v10, v14
933; GFX1030-NEXT:    v_add_nc_u32_e32 v11, v11, v15
934; GFX1030-NEXT:    v_add_nc_u32_e32 v12, v12, v16
935; GFX1030-NEXT:    s_waitcnt vmcnt(0)
936; GFX1030-NEXT:    v_mul_hi_u32 v9, v4, v9
937; GFX1030-NEXT:    v_mul_hi_u32 v10, v5, v10
938; GFX1030-NEXT:    v_mul_hi_u32 v11, v6, v11
939; GFX1030-NEXT:    v_mul_hi_u32 v12, v7, v12
940; GFX1030-NEXT:    v_mul_lo_u32 v13, v9, v0
941; GFX1030-NEXT:    v_mul_lo_u32 v14, v10, v1
942; GFX1030-NEXT:    v_mul_lo_u32 v15, v11, v2
943; GFX1030-NEXT:    v_mul_lo_u32 v16, v12, v3
944; GFX1030-NEXT:    v_add_nc_u32_e32 v17, 1, v9
945; GFX1030-NEXT:    v_add_nc_u32_e32 v18, 1, v10
946; GFX1030-NEXT:    v_add_nc_u32_e32 v19, 1, v11
947; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v13
948; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v5, v14
949; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v6, v15
950; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v7, v16
951; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
952; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
953; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, v4, v0
954; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v5, v1
955; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, v5, v1
956; GFX1030-NEXT:    v_cmp_ge_u32_e64 s1, v6, v2
957; GFX1030-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc_lo
958; GFX1030-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
959; GFX1030-NEXT:    v_cndmask_b32_e64 v10, v10, v18, s0
960; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, v6, v2
961; GFX1030-NEXT:    v_cmp_ge_u32_e64 s2, v7, v3
962; GFX1030-NEXT:    v_add_nc_u32_e32 v14, 1, v9
963; GFX1030-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s0
964; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
965; GFX1030-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s1
966; GFX1030-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s2
967; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, v7, v3
968; GFX1030-NEXT:    v_add_nc_u32_e32 v15, 1, v10
969; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s1
970; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v9, v14, vcc_lo
971; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v5, v1
972; GFX1030-NEXT:    v_add_nc_u32_e32 v16, 1, v11
973; GFX1030-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
974; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
975; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v10, v15, vcc_lo
976; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v6, v2
977; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v11, v16, vcc_lo
978; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v7, v3
979; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc_lo
980; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
981; GFX1030-NEXT:    s_endpgm
982;
983; EG-LABEL: udiv_v4i32:
984; EG:       ; %bb.0:
985; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
986; EG-NEXT:    TEX 1 @6
987; EG-NEXT:    ALU 65, @11, KC0[CB0:0-32], KC1[]
988; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
989; EG-NEXT:    CF_END
990; EG-NEXT:    PAD
991; EG-NEXT:    Fetch clause starting at 6:
992; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
993; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
994; EG-NEXT:    ALU clause starting at 10:
995; EG-NEXT:     MOV * T0.X, KC0[2].Z,
996; EG-NEXT:    ALU clause starting at 11:
997; EG-NEXT:     SUB_INT T2.W, 0.0, T1.W,
998; EG-NEXT:     RECIP_UINT * T2.X, T1.W,
999; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1000; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
1001; EG-NEXT:     ADD_INT * T2.W, T2.X, PS,
1002; EG-NEXT:     MULHI * T2.X, T0.W, PV.W,
1003; EG-NEXT:     MULLO_INT * T2.Y, PS, T1.W,
1004; EG-NEXT:     SUB_INT T2.W, 0.0, T1.X,
1005; EG-NEXT:     RECIP_UINT * T2.Z, T1.X,
1006; EG-NEXT:     MULLO_INT * T2.W, PV.W, PS,
1007; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Y,
1008; EG-NEXT:     RECIP_UINT * T3.X, T1.Y,
1009; EG-NEXT:     MULLO_INT * T3.Y, PV.W, PS,
1010; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Z,
1011; EG-NEXT:     RECIP_UINT * T3.Z, T1.Z,
1012; EG-NEXT:     MULLO_INT * T3.W, PV.W, PS,
1013; EG-NEXT:     MULHI * T3.W, T3.Z, PS,
1014; EG-NEXT:     ADD_INT T3.W, T3.Z, PS,
1015; EG-NEXT:     MULHI * T3.Y, T3.X, T3.Y,
1016; EG-NEXT:     ADD_INT T4.W, T3.X, PS,
1017; EG-NEXT:     MULHI * T3.X, T0.Z, PV.W,
1018; EG-NEXT:     MULHI * T3.Y, T0.Y, PV.W,
1019; EG-NEXT:     MULLO_INT * T3.Z, PS, T1.Y,
1020; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1021; EG-NEXT:     MULLO_INT * T0.Y, T3.X, T1.Z,
1022; EG-NEXT:     SUB_INT T4.X, T0.Z, PS,
1023; EG-NEXT:     ADD_INT T0.Y, T3.Y, 1,
1024; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.Y,
1025; EG-NEXT:     SUB_INT T4.W, PV.W, T1.Y,
1026; EG-NEXT:     MULHI * T2.W, T2.Z, T2.W,
1027; EG-NEXT:     CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1028; EG-NEXT:     CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1029; EG-NEXT:     SETGE_UINT T0.Z, PV.X, T1.Z,
1030; EG-NEXT:     ADD_INT T2.W, T2.Z, PS,
1031; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.Y,
1032; EG-NEXT:     ADD_INT T6.X, T3.X, 1,
1033; EG-NEXT:     ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1034; EG-NEXT:     SETGE_UINT T2.Z, PS, T1.W,
1035; EG-NEXT:     SUB_INT T3.W, PS, T1.W,
1036; EG-NEXT:     MULHI * T2.W, T0.X, PV.W,
1037; EG-NEXT:     SUB_INT T7.X, T4.X, T1.Z,
1038; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1039; EG-NEXT:     CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1040; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1041; EG-NEXT:     MULLO_INT * T2.X, T2.W, T1.X,
1042; EG-NEXT:     ADD_INT T3.X, T0.W, 1,
1043; EG-NEXT:     ADD_INT T2.Y, T2.Z, 1,
1044; EG-NEXT:     SETGE_UINT T3.Z, T3.Y, T1.W,
1045; EG-NEXT:     SUB_INT T1.W, T0.X, PS, BS:VEC_201
1046; EG-NEXT:     CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1047; EG-NEXT:     SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1048; EG-NEXT:     ADD_INT T3.Y, T2.W, 1,
1049; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.X,
1050; EG-NEXT:     SUB_INT T3.W, PV.W, T1.X,
1051; EG-NEXT:     CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1052; EG-NEXT:     CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1053; EG-NEXT:     CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1054; EG-NEXT:     CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1055; EG-NEXT:     ADD_INT T0.W, T0.Y, 1,
1056; EG-NEXT:     SETGE_UINT * T1.W, T5.X, T1.Y,
1057; EG-NEXT:     CNDE_INT T4.Y, PS, T0.Y, PV.W,
1058; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
1059; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.X,
1060; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
1061; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1062; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1063  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1064  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
1065  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
1066  %result = udiv <4 x i32> %a, %b
1067  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1068  ret void
1069}
1070
; Unsigned 32-bit division of a loaded value by the power-of-two constant 16.
; Every expected sequence below computes the quotient with one logical shift
; right by 4; no multiply or reciprocal appears in the checked output.
define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_pow2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_pow2:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_pow2:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
  ; Only the dividend is loaded; the divisor is the immediate 16.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 16
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1155
; Unsigned 32-bit division of a loaded value by the even constant 34259182.
; The expected sequences below replace the divide with an unsigned
; multiply-high by 0xfabbd9c1 followed by a logical shift right by 25.
define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_even:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_even:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 25, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_even:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0xfabbd9c1, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_even:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    -88352319(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
  ; Only the dividend is loaded; the divisor is the immediate 34259182.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259182
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1249
; Unsigned 32-bit division of a loaded value by the odd constant 34259183.
; The expected sequences below replace the divide with an unsigned
; multiply-high by 0x7d5deca3 followed by a logical shift right by 24.
define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_odd:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0x7d5deca3
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_odd:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x7d5deca3, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_odd:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    2103307427(1.843675e+37), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
  ; Only the dividend is loaded; the divisor is the immediate 34259183.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259183
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1343
; Unsigned 8-bit division with both operands loaded from memory: the
; numerator is the byte at %in and the denominator is the byte immediately
; after it. The i8 quotient is zero-extended to i32 before being stored.
; The expected sequences below perform the divide in single-precision float
; (convert, reciprocal, multiply, truncate, then a compare-based +1
; correction) and mask the result to 8 bits.
define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
; GCN-NEXT:    v_trunc_f32_e32 v4, v4
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT:    v_mad_f32 v2, -v4, v3, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i8:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_cvt_f32_ubyte1_e32 v2, v1
; GFX1030-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
  ; The denominator lives one byte past the numerator.
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = udiv i8 %num, %den
  ; Widen the quotient so the store is a full dword.
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
1483
; v_udiv_i16 - unsigned i16 division with both operands loaded from memory.
; The numerator is read from %in and the denominator from the i16 element that
; follows it; the quotient is zero-extended to i32 and stored to %out.
; On the amdgcn run lines the expected code is a float-based expansion
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32, v_trunc_f32, v_mad_f32 or
; v_fma_f32, then a compare feeding an add-with-carry) followed by a mask with
; 0xffff. The r600 run line expects the analogous UINT_TO_FLT / RECIP_IEEE
; sequence ending in an AND_INT with 65535.
; The assertions are autogenerated - rerun utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s4, s2, 2
; GCN-NEXT:    s_addc_u32 s5, s3, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    v_mov_b32_e32 v1, s5
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
; GCN-NEXT:    v_trunc_f32_e32 v4, v4
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i16:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    s_clause 0x1
; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3] offset:2
; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(1)
; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
  ; The denominator is the i16 element at index 1 (byte offset 2 in the
  ; expected loads); the numerator is element 0.
  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %num = load i16, i16 addrspace(1) * %in
  %den = load i16, i16 addrspace(1) * %den_ptr
  %result = udiv i16 %num, %den
  ; Widen the quotient so a full dword is stored to %out.
  %result.ext = zext i16 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
1632
; v_udiv_i23 - unsigned division on the non-power-of-two integer type i23.
; The numerator is read from %in and the denominator from the i23 element that
; follows it; the quotient is zero-extended to i32 and stored to %out.
; In the expected output each i23 load is assembled from a 16-bit load plus an
; 8-bit load shifted left by 16 (offsets 0/2 for the numerator and 4/6 for the
; denominator), so consecutive i23 elements are 4 bytes apart.
; The division itself is expected to use the float-based expansion (convert to
; f32, reciprocal, multiply, truncate, mad/fma correction, compare and add),
; with the result masked to 23 bits (0x7fffff, i.e. 8388607 on r600).
; The assertions are autogenerated - rerun utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i23:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i23:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v1, v3, v1
; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i23:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s4, s2, 4
; GCN-NEXT:    s_addc_u32 s5, s3, 0
; GCN-NEXT:    s_add_u32 s6, s2, 2
; GCN-NEXT:    s_addc_u32 s7, s3, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s6
; GCN-NEXT:    v_mov_b32_e32 v1, s7
; GCN-NEXT:    s_add_u32 s6, s2, 6
; GCN-NEXT:    s_addc_u32 s7, s3, 0
; GCN-NEXT:    v_mov_b32_e32 v2, s6
; GCN-NEXT:    v_mov_b32_e32 v3, s7
; GCN-NEXT:    v_mov_b32_e32 v4, s4
; GCN-NEXT:    v_mov_b32_e32 v5, s5
; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
; GCN-NEXT:    flat_load_ushort v4, v[4:5]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
; GCN-NEXT:    flat_load_ushort v1, v[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(3)
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
; GCN-NEXT:    s_waitcnt vmcnt(2)
; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
; GCN-NEXT:    v_trunc_f32_e32 v4, v4
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i23:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    s_clause 0x3
; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(3)
; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(2)
; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(1)
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i23:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 20, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     UINT_TO_FLT * T0.X, PV.W,
; EG-NEXT:     OR_INT T0.W, T2.X, T1.W,
; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
; EG-NEXT:     UINT_TO_FLT * T0.Z, PV.W,
; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Y,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
  ; The denominator is the i23 element at index 1 (byte offset 4 in the
  ; expected loads); the numerator is element 0.
  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
  %num = load i23, i23 addrspace(1) * %in
  %den = load i23, i23 addrspace(1) * %den_ptr
  %result = udiv i23 %num, %den
  ; Widen the quotient so a full dword is stored to %out.
  %result.ext = zext i23 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
1829
; v_udiv_i24 - unsigned division on i24, one bit wider than the i23 case.
; The numerator is read from %in and the denominator from the i24 element that
; follows it; the quotient is zero-extended to i32 and stored to %out.
; In the expected output each i24 load is assembled from a 16-bit load plus an
; 8-bit load shifted left by 16 (offsets 0/2 for the numerator and 4/6 for the
; denominator).
; Here the checks expect the integer expansion rather than a float quotient -
; a reciprocal estimate scaled by the f32 constant 0x4f7ffffe and converted
; back to an integer, refined with mul_lo/mul_hi, followed by two conditional
; +1 corrections of the quotient. The amdgcn run lines finish with a mask of
; 0xffffff; the r600 checks listed below contain no final AND_INT.
; The assertions are autogenerated - rerun utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i24:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v1, v0
; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v2, v3, v2
; SI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
; SI-NEXT:    v_mul_hi_u32 v4, v1, v4
; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i24:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v1, v0
; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
; VI-NEXT:    v_mul_lo_u32 v4, v4, v1
; VI-NEXT:    v_mul_hi_u32 v4, v1, v4
; VI-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
; VI-NEXT:    v_mul_hi_u32 v1, v2, v1
; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
; VI-NEXT:    v_subrev_u32_e32 v2, vcc, v3, v2
; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v0, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i24:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s4, s2, 4
; GCN-NEXT:    s_addc_u32 s5, s3, 0
; GCN-NEXT:    s_add_u32 s6, s2, 2
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    s_addc_u32 s7, s3, 0
; GCN-NEXT:    v_mov_b32_e32 v1, s5
; GCN-NEXT:    s_add_u32 s4, s2, 6
; GCN-NEXT:    s_addc_u32 s5, s3, 0
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    v_mov_b32_e32 v3, s5
; GCN-NEXT:    flat_load_ubyte v4, v[2:3]
; GCN-NEXT:    flat_load_ushort v5, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s6
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v3, s7
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ubyte v2, v[2:3]
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(3)
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
; GCN-NEXT:    s_waitcnt vmcnt(2)
; GCN-NEXT:    v_or_b32_e32 v3, v5, v1
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v3
; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or_b32_e32 v2, v0, v2
; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT:    v_mul_lo_u32 v4, v4, v1
; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
; GCN-NEXT:    v_add_u32_e32 v0, vcc, v4, v1
; GCN-NEXT:    v_mul_hi_u32 v4, v2, v0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, v5, v2
; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v3, v2
; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i24:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    s_clause 0x3
; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(3)
; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(1)
; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v1
; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX1030-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v5
; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v2, v5
; GFX1030-NEXT:    v_mul_hi_u32 v2, v3, v2
; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, v1
; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v3, v1
; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i24:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 23, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
; EG-NEXT:     SUB_INT T1.W, 0.0, PV.W,
; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
; EG-NEXT:     LSHL T1.W, T3.X, literal.x,
; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
; EG-NEXT:     OR_INT * T1.W, T2.X, PV.W,
; EG-NEXT:     MULHI * T0.X, PS, PV.W,
; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
; EG-NEXT:     SUB_INT * T1.W, T1.W, PS,
; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.W, PS,
; EG-NEXT:     CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
; EG-NEXT:     ADD_INT T3.W, PS, 1,
; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The denominator is the i24 element at index 1 (byte offset 4 in the
  ; expected loads); the numerator is element 0.
  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
  %num = load i24, i24 addrspace(1) * %in
  %den = load i24, i24 addrspace(1) * %den_ptr
  %result = udiv i24 %num, %den
  ; Widen the quotient so a full dword is stored to %out.
  %result.ext = zext i24 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
2068
; scalarize_mulhu_4xi32 - <4 x i32> unsigned division by the splat constant
; 53668. Note the argument order - %in comes first and %out second.
; The checks expect no division sequence at all. Each of the four lanes is
; handled separately as a logical shift right by 2, a high multiply by
; 0x1389c755 (327796565) and a logical shift right by 10. 53668 = 4 * 13417,
; which is consistent with the initial shift by 2.
; The assertions are autogenerated - rerun utils/update_llc_test_checks.py
; rather than editing them by hand.
define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
; SI-LABEL: scalarize_mulhu_4xi32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s0, 0x1389c755
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT:    v_mul_hi_u32 v0, v0, s0
; SI-NEXT:    v_mul_hi_u32 v1, v1, s0
; SI-NEXT:    v_mul_hi_u32 v2, v2, s0
; SI-NEXT:    v_mul_hi_u32 v3, v3, s0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
; SI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
; SI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalarize_mulhu_4xi32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_mov_b32 s0, 0x1389c755
; VI-NEXT:    s_mov_b32 s4, s2
; VI-NEXT:    s_mov_b32 s5, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT:    v_mul_hi_u32 v0, v0, s0
; VI-NEXT:    v_mul_hi_u32 v1, v1, s0
; VI-NEXT:    v_mul_hi_u32 v2, v2, s0
; VI-NEXT:    v_mul_hi_u32 v3, v3, s0
; VI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
; VI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
; VI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: scalarize_mulhu_4xi32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
; GCN-NEXT:    v_mov_b32_e32 v4, s2
; GCN-NEXT:    v_mov_b32_e32 v5, s3
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; GCN-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
; GCN-NEXT:    v_mul_hi_u32 v0, v0, s0
; GCN-NEXT:    v_mul_hi_u32 v1, v1, s0
; GCN-NEXT:    v_mul_hi_u32 v2, v2, s0
; GCN-NEXT:    v_mul_hi_u32 v3, v3, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
; GCN-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: scalarize_mulhu_4xi32:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT:    s_mov_b32 s0, 0x1389c755
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
; GFX1030-NEXT:    v_mul_hi_u32 v0, v0, s0
; GFX1030-NEXT:    v_mul_hi_u32 v1, v1, s0
; GFX1030-NEXT:    v_mul_hi_u32 v2, v2, s0
; GFX1030-NEXT:    v_mul_hi_u32 v3, v3, s0
; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: scalarize_mulhu_4xi32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Y,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MULHI * T0.Z, PV.W, literal.x,
; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT:     LSHR T1.Z, T0.Y, literal.x,
; EG-NEXT:     LSHR T0.W, PS, literal.y,
; EG-NEXT:     MULHI * T0.Y, T1.W, literal.z,
; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.Z, PS, literal.x,
; EG-NEXT:     LSHR T1.W, T0.X, literal.y,
; EG-NEXT:     MULHI * T0.X, PV.Z, literal.z,
; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.Y, PS, literal.x,
; EG-NEXT:     MULHI * T0.X, PV.W, literal.y,
; EG-NEXT:    10(1.401298e-44), 327796565(3.478022e-27)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
  ; One 16-byte-aligned vector load, a udiv by a uniform constant vector, and
  ; one 16-byte-aligned vector store.
  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
2214
; Kernel test: unsigned i32 division of the kernel argument %p by the
; constant 2. For every RUN configuration the expected code is a single
; logical shift right by 1 (s_lshr_b32 on the amdgcn targets, LSHR on r600);
; no multiply or reciprocal sequence appears in the checks.
2215define amdgpu_kernel void @test_udiv2(i32 %p) {
2216; SI-LABEL: test_udiv2:
2217; SI:       ; %bb.0:
2218; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2219; SI-NEXT:    s_mov_b32 s3, 0xf000
2220; SI-NEXT:    s_mov_b32 s2, -1
2221; SI-NEXT:    s_waitcnt lgkmcnt(0)
2222; SI-NEXT:    s_lshr_b32 s0, s0, 1
2223; SI-NEXT:    v_mov_b32_e32 v0, s0
2224; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2225; SI-NEXT:    s_waitcnt vmcnt(0)
2226; SI-NEXT:    s_endpgm
2227;
2228; VI-LABEL: test_udiv2:
2229; VI:       ; %bb.0:
2230; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2231; VI-NEXT:    s_mov_b32 s3, 0xf000
2232; VI-NEXT:    s_mov_b32 s2, -1
2233; VI-NEXT:    s_waitcnt lgkmcnt(0)
2234; VI-NEXT:    s_lshr_b32 s0, s0, 1
2235; VI-NEXT:    v_mov_b32_e32 v0, s0
2236; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2237; VI-NEXT:    s_waitcnt vmcnt(0)
2238; VI-NEXT:    s_endpgm
2239;
2240; GCN-LABEL: test_udiv2:
2241; GCN:       ; %bb.0:
2242; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2243; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2244; GCN-NEXT:    s_lshr_b32 s0, s0, 1
2245; GCN-NEXT:    v_mov_b32_e32 v0, s0
2246; GCN-NEXT:    flat_store_dword v[0:1], v0
2247; GCN-NEXT:    s_waitcnt vmcnt(0)
2248; GCN-NEXT:    s_endpgm
2249;
2250; GFX1030-LABEL: test_udiv2:
2251; GFX1030:       ; %bb.0:
2252; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2253; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2254; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2255; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2256; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2257; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2258; GFX1030-NEXT:    s_endpgm
2259;
2260; EG-LABEL: test_udiv2:
2261; EG:       ; %bb.0:
2262; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2263; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2264; EG-NEXT:    CF_END
2265; EG-NEXT:    PAD
2266; EG-NEXT:    ALU clause starting at 4:
2267; EG-NEXT:     MOV T0.X, literal.x,
2268; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
2269; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Unsigned divide by the power-of-two constant 2.
2270  %i = udiv i32 %p, 2
; Volatile store of the quotient to an undef global pointer; presumably only
; there to give %i a use so the division is kept (NOTE(review): confirm intent).
2271  store volatile i32 %i, i32 addrspace(1)* undef
2272  ret void
2273}
2274
; Kernel test: unsigned i32 division of the kernel argument %p by the
; constant 3. The expected code for every RUN configuration multiplies by the
; constant 0xaaaaaaab and keeps the high 32 bits (v_mul_hi_u32, s_mul_hi_u32
; or MULHI), then shifts that result right by 1. The r600 checks print the
; same constant as the signed literal -1431655765.
2275define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2276; SI-LABEL: test_udiv_3_mulhu:
2277; SI:       ; %bb.0:
2278; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2279; SI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2280; SI-NEXT:    s_mov_b32 s3, 0xf000
2281; SI-NEXT:    s_mov_b32 s2, -1
2282; SI-NEXT:    s_waitcnt lgkmcnt(0)
2283; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
2284; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2285; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2286; SI-NEXT:    s_waitcnt vmcnt(0)
2287; SI-NEXT:    s_endpgm
2288;
2289; VI-LABEL: test_udiv_3_mulhu:
2290; VI:       ; %bb.0:
2291; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2292; VI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2293; VI-NEXT:    s_mov_b32 s3, 0xf000
2294; VI-NEXT:    s_mov_b32 s2, -1
2295; VI-NEXT:    s_waitcnt lgkmcnt(0)
2296; VI-NEXT:    v_mul_hi_u32 v0, s0, v0
2297; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2298; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2299; VI-NEXT:    s_waitcnt vmcnt(0)
2300; VI-NEXT:    s_endpgm
2301;
2302; GCN-LABEL: test_udiv_3_mulhu:
2303; GCN:       ; %bb.0:
2304; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2305; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2306; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2307; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
2308; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2309; GCN-NEXT:    flat_store_dword v[0:1], v0
2310; GCN-NEXT:    s_waitcnt vmcnt(0)
2311; GCN-NEXT:    s_endpgm
2312;
2313; GFX1030-LABEL: test_udiv_3_mulhu:
2314; GFX1030:       ; %bb.0:
2315; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2316; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX1030-NEXT:    s_mul_hi_u32 s0, s0, 0xaaaaaaab
2318; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2319; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2320; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2321; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2322; GFX1030-NEXT:    s_endpgm
2323;
2324; EG-LABEL: test_udiv_3_mulhu:
2325; EG:       ; %bb.0:
2326; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2327; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2328; EG-NEXT:    CF_END
2329; EG-NEXT:    PAD
2330; EG-NEXT:    ALU clause starting at 4:
2331; EG-NEXT:     MULHI * T0.X, KC0[2].Y, literal.x,
2332; EG-NEXT:    -1431655765(-3.031649e-13), 0(0.000000e+00)
2333; EG-NEXT:     LSHR T0.X, PS, 1,
2334; EG-NEXT:     MOV * T1.X, literal.x,
2335; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Unsigned divide by the non-power-of-two constant 3.
2336   %i = udiv i32 %p, 3
; Volatile store of the quotient to an undef global pointer; presumably only
; there to give %i a use so the division is kept (NOTE(review): confirm intent).
2337   store volatile i32 %i, i32 addrspace(1)* undef
2338   ret void
2339}
2340
; Kernel test: signed i32 division whose operands are both sign-extended i8
; loads, with the quotient truncated back to i8. In every RUN configuration
; the checks expect a floating-point sequence: both operands are converted to
; f32, the dividend is multiplied by a reciprocal of the divisor
; (v_rcp_iflag_f32 or RECIP_IEEE), the product is truncated, a
; multiply-add recomputes the remainder, and a compare/select adds a
; correction derived from the sign of (dividend xor divisor).
; The GFX1030 checks expect v_fma_f32 for the remainder step where the SI, VI
; and GCN checks expect v_mad_f32.
; NOTE(review): the function name suggests this guards denormal handling in
; that sequence; the RUN lines do differ in -denormal-fp-math-f32 - confirm.
2341define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
2342; SI-LABEL: fdiv_test_denormals:
2343; SI:       ; %bb.0: ; %bb
2344; SI-NEXT:    s_mov_b32 s0, 0
2345; SI-NEXT:    s_mov_b32 s3, 0xf000
2346; SI-NEXT:    s_mov_b32 s2, -1
2347; SI-NEXT:    s_mov_b32 s1, s0
2348; SI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2349; SI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2350; SI-NEXT:    s_waitcnt vmcnt(1)
2351; SI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2352; SI-NEXT:    s_waitcnt vmcnt(0)
2353; SI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2354; SI-NEXT:    v_xor_b32_e32 v0, v1, v0
2355; SI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2356; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2357; SI-NEXT:    v_or_b32_e32 v0, 1, v0
2358; SI-NEXT:    v_mul_f32_e32 v1, v3, v4
2359; SI-NEXT:    v_trunc_f32_e32 v1, v1
2360; SI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2361; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2362; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2363; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2364; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2365; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2366; SI-NEXT:    s_endpgm
2367;
2368; VI-LABEL: fdiv_test_denormals:
2369; VI:       ; %bb.0: ; %bb
2370; VI-NEXT:    s_mov_b32 s0, 0
2371; VI-NEXT:    s_mov_b32 s3, 0xf000
2372; VI-NEXT:    s_mov_b32 s2, -1
2373; VI-NEXT:    s_mov_b32 s1, s0
2374; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2375; VI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2376; VI-NEXT:    s_waitcnt vmcnt(1)
2377; VI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2378; VI-NEXT:    s_waitcnt vmcnt(0)
2379; VI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2380; VI-NEXT:    v_xor_b32_e32 v0, v1, v0
2381; VI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2382; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2383; VI-NEXT:    v_or_b32_e32 v0, 1, v0
2384; VI-NEXT:    v_mul_f32_e32 v1, v3, v4
2385; VI-NEXT:    v_trunc_f32_e32 v1, v1
2386; VI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2387; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2388; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2389; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2390; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
2391; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2392; VI-NEXT:    s_endpgm
2393;
2394; GCN-LABEL: fdiv_test_denormals:
2395; GCN:       ; %bb.0: ; %bb
2396; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
2397; GCN-NEXT:    v_mov_b32_e32 v0, 0
2398; GCN-NEXT:    v_mov_b32_e32 v1, 0
2399; GCN-NEXT:    flat_load_sbyte v3, v[0:1]
2400; GCN-NEXT:    s_waitcnt vmcnt(1)
2401; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v2
2402; GCN-NEXT:    s_waitcnt vmcnt(0)
2403; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v3
2404; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2405; GCN-NEXT:    v_xor_b32_e32 v2, v3, v2
2406; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2407; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
2408; GCN-NEXT:    v_mul_f32_e32 v3, v5, v6
2409; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2410; GCN-NEXT:    v_mad_f32 v5, -v3, v4, v5
2411; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2412; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2413; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
2414; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2415; GCN-NEXT:    flat_store_byte v[0:1], v2
2416; GCN-NEXT:    s_endpgm
2417;
2418; GFX1030-LABEL: fdiv_test_denormals:
2419; GFX1030:       ; %bb.0: ; %bb
2420; GFX1030-NEXT:    global_load_sbyte v2, v[0:1], off
2421; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
2422; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
2423; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
2424; GFX1030-NEXT:    s_waitcnt vmcnt(1)
2425; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
2426; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v4
2427; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2428; GFX1030-NEXT:    v_cvt_f32_i32_e32 v6, v3
2429; GFX1030-NEXT:    v_xor_b32_e32 v2, v3, v2
2430; GFX1030-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2431; GFX1030-NEXT:    v_mul_f32_e32 v5, v6, v5
2432; GFX1030-NEXT:    v_or_b32_e32 v2, 1, v2
2433; GFX1030-NEXT:    v_trunc_f32_e32 v3, v5
2434; GFX1030-NEXT:    v_fma_f32 v5, -v3, v4, v6
2435; GFX1030-NEXT:    v_cvt_i32_f32_e32 v3, v3
2436; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2437; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2438; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v3, v2
2439; GFX1030-NEXT:    global_store_byte v[0:1], v2, off
2440; GFX1030-NEXT:    s_endpgm
2441;
2442; EG-LABEL: fdiv_test_denormals:
2443; EG:       ; %bb.0: ; %bb
2444; EG-NEXT:    TEX 0 @6
2445; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
2446; EG-NEXT:    TEX 0 @8
2447; EG-NEXT:    ALU 25, @11, KC0[], KC1[]
2448; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2449; EG-NEXT:    CF_END
2450; EG-NEXT:    Fetch clause starting at 6:
2451; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
2452; EG-NEXT:    Fetch clause starting at 8:
2453; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2454; EG-NEXT:    ALU clause starting at 10:
2455; EG-NEXT:     MOV * T1.X, 0.0,
2456; EG-NEXT:    ALU clause starting at 11:
2457; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
2458; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2459; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
2460; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x,
2461; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
2462; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2463; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
2464; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
2465; EG-NEXT:     TRUNC T2.W, PV.W,
2466; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
2467; EG-NEXT:     ASHR T0.W, PS, literal.x,
2468; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2469; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
2470; EG-NEXT:     TRUNC T0.Z, T2.W,
2471; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
2472; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
2473; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
2474; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
2475; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
2476; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
2477; EG-NEXT:     MOV * T0.W, literal.x,
2478; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2479; EG-NEXT:     MOV T0.Y, 0.0,
2480; EG-NEXT:     MOV * T0.Z, 0.0,
2481; EG-NEXT:     MOV * T1.X, literal.x,
2482; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; The block label "bb" is echoed in the "%bb.0" check lines above.
2483bb:
; Dividend: i8 loaded from the null global (addrspace 1) pointer, then
; sign-extended to i32.
2484  %tmp = load i8, i8 addrspace(1)* null, align 1
2485  %tmp1 = sext i8 %tmp to i32
; Divisor: i8 loaded from %arg offset by an undef index, then sign-extended
; to i32.
2486  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
2487  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
2488  %tmp4 = sext i8 %tmp3 to i32
; Signed divide under test; the quotient is truncated to i8 and stored back
; to the null global pointer.
2489  %tmp5 = sdiv i32 %tmp1, %tmp4
2490  %tmp6 = trunc i32 %tmp5 to i8
2491  store i8 %tmp6, i8 addrspace(1)* null, align 1
2492  ret void
2493}
2494
; Non-kernel function test: unsigned i64 division of %arg by the constant
; 100000 (0x186a0 in the checks; 0xfffe7960 is its 32-bit negation).
; For the amdgcn configurations the checks expect a long inline expansion
; built from 32-bit multiplies (v_mul_lo_u32 / v_mul_hi_u32, or
; v_mad_u64_u32 where available) and add-with-carry chains, followed by a
; compare/select fix-up that picks between the estimate, estimate+1 and
; estimate+2, and a return through s_setpc_b64.
; The SI, VI and GCN checks begin with a v_rcp_f32 based estimate, whereas the
; GFX1030 checks begin from scalar constants instead.
; The r600 checks for this function contain only CF_END and PAD.
2495define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2496; SI-LABEL: v_test_udiv64_mulhi_fold:
2497; SI:       ; %bb.0:
2498; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2499; SI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2500; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2501; SI-NEXT:    v_rcp_f32_e32 v2, v2
2502; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
2503; SI-NEXT:    v_mov_b32_e32 v9, 0
2504; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2505; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2506; SI-NEXT:    v_trunc_f32_e32 v3, v3
2507; SI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2508; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
2509; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
2510; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2511; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
2512; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2513; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2514; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2515; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
2516; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
2517; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2518; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
2519; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2520; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
2521; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2522; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
2523; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
2524; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
2525; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
2526; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
2527; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2528; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2529; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2530; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2531; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2532; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
2533; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2534; SI-NEXT:    s_mov_b32 s4, 0x186a0
2535; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2536; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
2537; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
2538; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
2539; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2540; SI-NEXT:    v_mul_hi_u32 v10, v3, v4
2541; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2542; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
2543; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2544; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
2545; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
2546; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
2547; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
2548; SI-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
2549; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2550; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2551; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2552; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2553; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
2554; SI-NEXT:    v_mul_hi_u32 v5, v0, v2
2555; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
2556; SI-NEXT:    v_mul_hi_u32 v7, v1, v3
2557; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
2558; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2559; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2560; SI-NEXT:    v_mul_lo_u32 v6, v1, v2
2561; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
2562; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
2563; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
2564; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v9, vcc
2565; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2566; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2567; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
2568; SI-NEXT:    v_mul_hi_u32 v5, v2, s4
2569; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2570; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2571; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
2572; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
2573; SI-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
2574; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
2575; SI-NEXT:    s_mov_b32 s4, 0x1869f
2576; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
2577; SI-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
2578; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2579; SI-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
2580; SI-NEXT:    v_add_i32_e32 v5, vcc, 2, v2
2581; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
2582; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
2583; SI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2584; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
2585; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2586; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2587; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
2588; SI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2589; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
2590; SI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2591; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2592; SI-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
2593; SI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
2594; SI-NEXT:    s_setpc_b64 s[30:31]
2595;
2596; VI-LABEL: v_test_udiv64_mulhi_fold:
2597; VI:       ; %bb.0:
2598; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2599; VI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2600; VI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2601; VI-NEXT:    v_rcp_f32_e32 v2, v2
2602; VI-NEXT:    s_mov_b32 s6, 0xfffe7960
2603; VI-NEXT:    v_mov_b32_e32 v9, 0
2604; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2605; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2606; VI-NEXT:    v_trunc_f32_e32 v3, v3
2607; VI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2608; VI-NEXT:    v_cvt_u32_f32_e32 v6, v2
2609; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
2610; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2611; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2612; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2613; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v3
2614; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2615; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
2616; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2617; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2618; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2619; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2620; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2621; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2622; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2623; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2624; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2625; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2626; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2627; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2628; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2629; VI-NEXT:    s_mov_b32 s6, 0x186a0
2630; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2631; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2632; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2633; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
2634; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2635; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2636; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2637; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2638; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2639; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2640; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2641; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2642; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2643; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2644; VI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2645; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2646; VI-NEXT:    v_mul_hi_u32 v6, v0, v4
2647; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2648; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2649; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2650; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2651; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2652; VI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2653; VI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2654; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2655; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2656; VI-NEXT:    v_mul_lo_u32 v6, v5, s6
2657; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
2658; VI-NEXT:    s_mov_b32 s4, 0x1869f
2659; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2660; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2661; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2662; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
2663; VI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2664; VI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2665; VI-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2666; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2667; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2668; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2669; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2670; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2671; VI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2672; VI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2673; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2674; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2675; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2676; VI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2677; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2678; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2679; VI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2680; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2681; VI-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2682; VI-NEXT:    s_setpc_b64 s[30:31]
2683;
2684; GCN-LABEL: v_test_udiv64_mulhi_fold:
2685; GCN:       ; %bb.0:
2686; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2687; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2688; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2689; GCN-NEXT:    v_rcp_f32_e32 v2, v2
2690; GCN-NEXT:    s_mov_b32 s6, 0xfffe7960
2691; GCN-NEXT:    v_mov_b32_e32 v9, 0
2692; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2693; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2694; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2695; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2696; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v2
2697; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
2698; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2699; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2700; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2701; GCN-NEXT:    v_add_u32_e32 v5, vcc, v4, v3
2702; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2703; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
2704; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2705; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2706; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2707; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2708; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2709; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2710; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2711; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2712; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2713; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2714; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2715; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2716; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2717; GCN-NEXT:    s_mov_b32 s6, 0x186a0
2718; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2719; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2720; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2721; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
2722; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2723; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2724; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2725; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2726; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2727; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2728; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2729; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2730; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2731; GCN-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2732; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2733; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2734; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
2735; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2736; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2737; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2738; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2739; GCN-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2740; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2741; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v9, vcc
2742; GCN-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2743; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2744; GCN-NEXT:    v_mul_lo_u32 v6, v5, s6
2745; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0
2746; GCN-NEXT:    s_mov_b32 s4, 0x1869f
2747; GCN-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2748; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2749; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2750; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
2751; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2752; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2753; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2754; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2755; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2756; GCN-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2757; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2758; GCN-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2759; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2760; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2761; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2762; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2763; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2764; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2765; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2766; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2767; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2768; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2769; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2770; GCN-NEXT:    s_setpc_b64 s[30:31]
2771;
2772; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2773; GFX1030:       ; %bb.0:
2774; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2775; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2776; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
2777; GFX1030-NEXT:    s_mov_b32 s5, 0xfffe7960
2778; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
2779; GFX1030-NEXT:    s_addc_u32 s6, 0, 0
2780; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
2781; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
2782; GFX1030-NEXT:    s_addc_u32 s4, s6, 0xa7c5
2783; GFX1030-NEXT:    v_mul_hi_u32 v3, v2, s5
2784; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, s5
2785; GFX1030-NEXT:    s_mul_i32 s5, s4, s5
2786; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v2
2787; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v4
2788; GFX1030-NEXT:    v_mul_hi_u32 v8, s4, v4
2789; GFX1030-NEXT:    v_mul_lo_u32 v4, s4, v4
2790; GFX1030-NEXT:    v_add_nc_u32_e32 v3, s5, v3
2791; GFX1030-NEXT:    v_mul_lo_u32 v6, v2, v3
2792; GFX1030-NEXT:    v_mul_hi_u32 v7, v2, v3
2793; GFX1030-NEXT:    v_mul_hi_u32 v9, s4, v3
2794; GFX1030-NEXT:    v_mul_lo_u32 v3, s4, v3
2795; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v6
2796; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo
2797; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, v4
2798; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
2799; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
2800; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, v3
2801; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
2802; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v3
2803; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo
2804; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v5
2805; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], s4, v1, v5, 0
2806; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, v0, v6, 0
2807; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], s4, v1, v6, 0
2808; GFX1030-NEXT:    s_mov_b32 s4, 0x186a0
2809; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
2810; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2811; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2812; GFX1030-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
2813; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
2814; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
2815; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
2816; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s5, v4, s4, 0
2817; GFX1030-NEXT:    v_mul_lo_u32 v6, v5, s4
2818; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2819; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v6
2820; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2821; GFX1030-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s4
2822; GFX1030-NEXT:    s_mov_b32 s4, 0x1869f
2823; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
2824; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v2
2825; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
2826; GFX1030-NEXT:    v_add_co_u32 v6, vcc_lo, v4, 2
2827; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
2828; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s4, v0
2829; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
2830; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2831; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2832; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4
2833; GFX1030-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
2834; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, 1
2835; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v5, vcc_lo
2836; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
2837; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
2838; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc_lo
2839; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2840; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
2841; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
2842; GFX1030-NEXT:    s_setpc_b64 s[30:31]
2843;
2844; EG-LABEL: v_test_udiv64_mulhi_fold:
2845; EG:       ; %bb.0:
2846; EG-NEXT:    CF_END
2847; EG-NEXT:    PAD
; Unsigned 64-bit divide by the constant 100000; the quotient is the return
; value.
2848  %d = udiv i64 %arg, 100000
2849  ret i64 %d
2850}
2851