1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
7
; Unsigned 32-bit division with both operands read through a pointer. The
; dividend is loaded from %in, the divisor from the i32 element right after
; it, and the quotient is stored to %out. The autogenerated check lines below
; pin the instruction sequence expected from each llc invocation listed at
; the top of the file.
8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
9; SI-LABEL: udiv_i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_u32_e32 v2, v1
24; SI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
25; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
26; SI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
27; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
28; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
29; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
30; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
31; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
32; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
33; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
34; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v3, v0
35; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
36; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
37; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v1, v0
38; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
39; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
40; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
41; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
42; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: udiv_i32:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
48; VI-NEXT:    s_mov_b32 s7, 0xf000
49; VI-NEXT:    s_mov_b32 s6, -1
50; VI-NEXT:    s_mov_b32 s10, s6
51; VI-NEXT:    s_mov_b32 s11, s7
52; VI-NEXT:    s_waitcnt lgkmcnt(0)
53; VI-NEXT:    s_mov_b32 s8, s2
54; VI-NEXT:    s_mov_b32 s9, s3
55; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56; VI-NEXT:    s_mov_b32 s4, s0
57; VI-NEXT:    s_mov_b32 s5, s1
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    v_cvt_f32_u32_e32 v2, v1
60; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
61; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
62; VI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
63; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
64; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
65; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
66; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
67; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
68; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
69; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
70; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v3, v0
71; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
72; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
73; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v1, v0
74; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
75; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
76; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
77; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
78; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; VI-NEXT:    s_endpgm
80;
81; GCN-LABEL: udiv_i32:
82; GCN:       ; %bb.0:
83; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
84; GCN-NEXT:    s_waitcnt lgkmcnt(0)
85; GCN-NEXT:    v_mov_b32_e32 v0, s2
86; GCN-NEXT:    v_mov_b32_e32 v1, s3
87; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
88; GCN-NEXT:    s_waitcnt vmcnt(0)
89; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
90; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
91; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
92; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
93; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
94; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
95; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
96; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
97; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
98; GCN-NEXT:    v_mov_b32_e32 v2, s0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
101; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
102; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v0
103; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
104; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
105; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v1, v0
106; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
107; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
108; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
109; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
110; GCN-NEXT:    flat_store_dword v[2:3], v0
111; GCN-NEXT:    s_endpgm
112;
113; GFX1030-LABEL: udiv_i32:
114; GFX1030:       ; %bb.0:
115; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
117; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
119; GFX1030-NEXT:    s_waitcnt vmcnt(0)
120; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, v1
121; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, 0, v1
122; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
123; GFX1030-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
124; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
125; GFX1030-NEXT:    v_mul_lo_u32 v4, v4, v3
126; GFX1030-NEXT:    v_mul_hi_u32 v4, v3, v4
127; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v4
128; GFX1030-NEXT:    v_mul_hi_u32 v3, v0, v3
129; GFX1030-NEXT:    v_mul_lo_u32 v4, v3, v1
130; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v4
131; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
132; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v0, v1
133; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
134; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
135; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
136; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
137; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
138; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
139; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
140; GFX1030-NEXT:    s_endpgm
141;
142; EG-LABEL: udiv_i32:
143; EG:       ; %bb.0:
144; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
145; EG-NEXT:    TEX 0 @6
146; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
148; EG-NEXT:    CF_END
149; EG-NEXT:    PAD
150; EG-NEXT:    Fetch clause starting at 6:
151; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
152; EG-NEXT:    ALU clause starting at 8:
153; EG-NEXT:     MOV * T0.X, KC0[2].Z,
154; EG-NEXT:    ALU clause starting at 9:
155; EG-NEXT:     SUB_INT T0.W, 0.0, T0.Y,
156; EG-NEXT:     RECIP_UINT * T0.Z, T0.Y,
157; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
158; EG-NEXT:     MULHI * T0.W, T0.Z, PS,
159; EG-NEXT:     ADD_INT * T0.W, T0.Z, PS,
160; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
161; EG-NEXT:     MULLO_INT * T0.W, PS, T0.Y,
162; EG-NEXT:     SUB_INT * T0.W, T0.X, PS,
163; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
164; EG-NEXT:     SETGE_UINT T1.W, PV.W, T0.Y,
165; EG-NEXT:     SUB_INT * T2.W, PV.W, T0.Y,
166; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.W, PS,
167; EG-NEXT:     CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
168; EG-NEXT:     ADD_INT T2.W, PS, 1,
169; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.Y,
170; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
171; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
172; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test. %a and %b are adjacent i32 elements (%b_ptr is %in advanced
; by one element); the expected output above fetches both with one 64-bit load.
173  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
174  %a = load i32, i32 addrspace(1)* %in
175  %b = load i32, i32 addrspace(1)* %b_ptr
176  %result = udiv i32 %a, %b
177  store i32 %result, i32 addrspace(1)* %out
178  ret void
179}
180
; Unsigned 32-bit division whose dividend %a and divisor %b are passed
; directly as kernel arguments instead of being loaded through a pointer.
; Only the quotient goes to memory, via a store to %out. The autogenerated
; check lines below pin the instruction sequence expected from each llc
; invocation listed at the top of the file.
181define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
182; SI-LABEL: s_udiv_i32:
183; SI:       ; %bb.0:
184; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
185; SI-NEXT:    s_mov_b32 s7, 0xf000
186; SI-NEXT:    s_mov_b32 s6, -1
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    v_cvt_f32_u32_e32 v0, s3
189; SI-NEXT:    s_sub_i32 s4, 0, s3
190; SI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
191; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
192; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
193; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
195; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
196; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
197; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
198; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
199; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
200; SI-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
201; SI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
202; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
203; SI-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
204; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
205; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
206; SI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
207; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
208; SI-NEXT:    s_waitcnt lgkmcnt(0)
209; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
210; SI-NEXT:    s_endpgm
211;
212; VI-LABEL: s_udiv_i32:
213; VI:       ; %bb.0:
214; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
215; VI-NEXT:    s_mov_b32 s7, 0xf000
216; VI-NEXT:    s_mov_b32 s6, -1
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
219; VI-NEXT:    s_sub_i32 s4, 0, s3
220; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
221; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
222; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
223; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
224; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
225; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
226; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
227; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
228; VI-NEXT:    v_mul_lo_u32 v1, v0, s3
229; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
230; VI-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
231; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
232; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
233; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
234; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
235; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
236; VI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
237; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
238; VI-NEXT:    s_waitcnt lgkmcnt(0)
239; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
240; VI-NEXT:    s_endpgm
241;
242; GCN-LABEL: s_udiv_i32:
243; GCN:       ; %bb.0:
244; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
245; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
246; GCN-NEXT:    s_waitcnt lgkmcnt(0)
247; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
248; GCN-NEXT:    s_sub_i32 s0, 0, s3
249; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
250; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
251; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
252; GCN-NEXT:    v_mul_lo_u32 v1, s0, v0
253; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
254; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
255; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
256; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
257; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
258; GCN-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
259; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
260; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
261; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
262; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
263; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
264; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
265; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
266; GCN-NEXT:    v_mov_b32_e32 v0, s4
267; GCN-NEXT:    v_mov_b32_e32 v1, s5
268; GCN-NEXT:    flat_store_dword v[0:1], v2
269; GCN-NEXT:    s_endpgm
270;
271; GFX1030-LABEL: s_udiv_i32:
272; GFX1030:       ; %bb.0:
273; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
274; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s1
276; GFX1030-NEXT:    s_sub_i32 s2, 0, s1
277; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
278; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
279; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
280; GFX1030-NEXT:    v_mul_lo_u32 v1, s2, v0
281; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
282; GFX1030-NEXT:    v_mul_hi_u32 v1, v0, v1
283; GFX1030-NEXT:    v_add_nc_u32_e32 v0, v0, v1
284; GFX1030-NEXT:    v_mul_hi_u32 v0, s0, v0
285; GFX1030-NEXT:    v_mul_lo_u32 v1, v0, s1
286; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
287; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
288; GFX1030-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
289; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
290; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
291; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
292; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
293; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
294; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
295; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
296; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX1030-NEXT:    global_store_dword v3, v0, s[2:3]
298; GFX1030-NEXT:    s_endpgm
299;
300; EG-LABEL: s_udiv_i32:
301; EG:       ; %bb.0:
302; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
303; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
304; EG-NEXT:    CF_END
305; EG-NEXT:    PAD
306; EG-NEXT:    ALU clause starting at 4:
307; EG-NEXT:     SUB_INT T0.W, 0.0, KC0[2].W,
308; EG-NEXT:     RECIP_UINT * T0.X, KC0[2].W,
309; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
310; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
311; EG-NEXT:     ADD_INT * T0.W, T0.X, PS,
312; EG-NEXT:     MULHI * T0.X, KC0[2].Z, PV.W,
313; EG-NEXT:     MULLO_INT * T0.Y, PS, KC0[2].W,
314; EG-NEXT:     SUB_INT * T0.W, KC0[2].Z, PS,
315; EG-NEXT:     SUB_INT T0.Z, PV.W, KC0[2].W,
316; EG-NEXT:     SETGE_UINT T1.W, PV.W, KC0[2].W,
317; EG-NEXT:     ADD_INT * T2.W, T0.X, 1,
318; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.X, PS,
319; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
320; EG-NEXT:     SETGE_UINT T0.W, PS, KC0[2].W,
321; EG-NEXT:     ADD_INT * T1.W, PV.W, 1,
322; EG-NEXT:     CNDE_INT T0.X, PV.W, T2.W, PS,
323; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test. No loads here; the division consumes the two argument values
; as-is and the only memory access is the store of the quotient.
325  %result = udiv i32 %a, %b
326  store i32 %result, i32 addrspace(1)* %out
327  ret void
328}
329
330
331; The code generated by udiv is long and complex and may frequently
332; change. The goal of the vector tests below is to make sure that ISel
333; doesn't fail when it gets a v2i32 or v4i32 udiv.
; Element-wise unsigned division of two <2 x i32> vectors read through a
; pointer. The dividend vector is loaded from %in, the divisor vector from
; the <2 x i32> element right after it, and the quotient vector is stored to
; %out. The autogenerated check lines below pin the instruction sequence
; expected from each llc invocation listed at the top of the file.
334define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
335; SI-LABEL: udiv_v2i32:
336; SI:       ; %bb.0:
337; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
338; SI-NEXT:    s_mov_b32 s7, 0xf000
339; SI-NEXT:    s_mov_b32 s6, -1
340; SI-NEXT:    s_mov_b32 s10, s6
341; SI-NEXT:    s_mov_b32 s11, s7
342; SI-NEXT:    s_waitcnt lgkmcnt(0)
343; SI-NEXT:    s_mov_b32 s8, s2
344; SI-NEXT:    s_mov_b32 s9, s3
345; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
346; SI-NEXT:    s_mov_b32 s4, s0
347; SI-NEXT:    s_mov_b32 s5, s1
348; SI-NEXT:    s_waitcnt vmcnt(0)
349; SI-NEXT:    v_cvt_f32_u32_e32 v4, v2
350; SI-NEXT:    v_cvt_f32_u32_e32 v5, v3
351; SI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
352; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
353; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
354; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
355; SI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
356; SI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
357; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
358; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
359; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
360; SI-NEXT:    v_mul_lo_u32 v7, v7, v5
361; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
362; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
363; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
364; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
365; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
366; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
367; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
368; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
369; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
370; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
371; SI-NEXT:    v_subrev_i32_e32 v1, vcc, v8, v1
372; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
373; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
374; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
375; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
376; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
377; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
378; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
379; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
380; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
381; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
382; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
383; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
384; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
385; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
386; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
387; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: udiv_v2i32:
391; VI:       ; %bb.0:
392; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
393; VI-NEXT:    s_mov_b32 s7, 0xf000
394; VI-NEXT:    s_mov_b32 s6, -1
395; VI-NEXT:    s_mov_b32 s10, s6
396; VI-NEXT:    s_mov_b32 s11, s7
397; VI-NEXT:    s_waitcnt lgkmcnt(0)
398; VI-NEXT:    s_mov_b32 s8, s2
399; VI-NEXT:    s_mov_b32 s9, s3
400; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
401; VI-NEXT:    s_mov_b32 s4, s0
402; VI-NEXT:    s_mov_b32 s5, s1
403; VI-NEXT:    s_waitcnt vmcnt(0)
404; VI-NEXT:    v_cvt_f32_u32_e32 v4, v2
405; VI-NEXT:    v_cvt_f32_u32_e32 v5, v3
406; VI-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
407; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
408; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
409; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
410; VI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
411; VI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
412; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
413; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
414; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
415; VI-NEXT:    v_mul_lo_u32 v7, v7, v5
416; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
417; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
418; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
419; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
420; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
421; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
422; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
423; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
424; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
425; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
426; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
427; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
428; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
429; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
430; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
431; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
432; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
433; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
434; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
435; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
436; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
437; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
438; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
439; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
440; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
441; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
442; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
443; VI-NEXT:    s_endpgm
444;
445; GCN-LABEL: udiv_v2i32:
446; GCN:       ; %bb.0:
447; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
448; GCN-NEXT:    s_waitcnt lgkmcnt(0)
449; GCN-NEXT:    v_mov_b32_e32 v0, s2
450; GCN-NEXT:    v_mov_b32_e32 v1, s3
451; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
452; GCN-NEXT:    s_waitcnt vmcnt(0)
453; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
454; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
455; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
456; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
457; GCN-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
458; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
459; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
460; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
461; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
462; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
463; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
464; GCN-NEXT:    v_mul_lo_u32 v8, v4, v7
465; GCN-NEXT:    v_mul_hi_u32 v9, v6, v5
466; GCN-NEXT:    v_mov_b32_e32 v4, s0
467; GCN-NEXT:    v_mov_b32_e32 v5, s1
468; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
469; GCN-NEXT:    v_add_u32_e32 v6, vcc, v9, v6
470; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
471; GCN-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
472; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
473; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
474; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
475; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
476; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
477; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
478; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
479; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
480; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
481; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
482; GCN-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
483; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
484; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
485; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
486; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
487; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
488; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v7
489; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
490; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
491; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
492; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
493; GCN-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
494; GCN-NEXT:    s_endpgm
495;
496; GFX1030-LABEL: udiv_v2i32:
497; GFX1030:       ; %bb.0:
498; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
499; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
500; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
502; GFX1030-NEXT:    s_waitcnt vmcnt(0)
503; GFX1030-NEXT:    v_cvt_f32_u32_e32 v5, v2
504; GFX1030-NEXT:    v_cvt_f32_u32_e32 v6, v3
505; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, 0, v2
506; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
507; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
508; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
509; GFX1030-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
510; GFX1030-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
511; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
512; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
513; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
514; GFX1030-NEXT:    v_mul_lo_u32 v8, v8, v6
515; GFX1030-NEXT:    v_mul_hi_u32 v7, v5, v7
516; GFX1030-NEXT:    v_mul_hi_u32 v8, v6, v8
517; GFX1030-NEXT:    v_add_nc_u32_e32 v5, v5, v7
518; GFX1030-NEXT:    v_add_nc_u32_e32 v6, v6, v8
519; GFX1030-NEXT:    v_mul_hi_u32 v5, v0, v5
520; GFX1030-NEXT:    v_mul_hi_u32 v6, v1, v6
521; GFX1030-NEXT:    v_mul_lo_u32 v7, v5, v2
522; GFX1030-NEXT:    v_mul_lo_u32 v8, v6, v3
523; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
524; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
525; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, v1, v8
526; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
527; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
528; GFX1030-NEXT:    v_sub_nc_u32_e32 v9, v1, v3
529; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v1, v3
530; GFX1030-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
531; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v0, v2
532; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s0
533; GFX1030-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
534; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
535; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
536; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
537; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
538; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc_lo
539; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v1, v3
540; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc_lo
541; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
542; GFX1030-NEXT:    s_endpgm
543;
544; EG-LABEL: udiv_v2i32:
545; EG:       ; %bb.0:
546; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
547; EG-NEXT:    TEX 0 @6
548; EG-NEXT:    ALU 33, @9, KC0[CB0:0-32], KC1[]
549; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
550; EG-NEXT:    CF_END
551; EG-NEXT:    PAD
552; EG-NEXT:    Fetch clause starting at 6:
553; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
554; EG-NEXT:    ALU clause starting at 8:
555; EG-NEXT:     MOV * T0.X, KC0[2].Z,
556; EG-NEXT:    ALU clause starting at 9:
557; EG-NEXT:     SUB_INT T1.W, 0.0, T0.W,
558; EG-NEXT:     RECIP_UINT * T1.X, T0.W,
559; EG-NEXT:     MULLO_INT * T1.Y, PV.W, PS,
560; EG-NEXT:     SUB_INT T1.W, 0.0, T0.Z,
561; EG-NEXT:     RECIP_UINT * T1.Z, T0.Z,
562; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
563; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
564; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
565; EG-NEXT:     MULHI * T1.Y, T1.X, T1.Y,
566; EG-NEXT:     ADD_INT T2.W, T1.X, PS,
567; EG-NEXT:     MULHI * T1.X, T0.X, PV.W,
568; EG-NEXT:     MULHI * T1.Y, T0.Y, PV.W,
569; EG-NEXT:     MULLO_INT * T1.Z, PS, T0.W,
570; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
571; EG-NEXT:     MULLO_INT * T0.Y, T1.X, T0.Z,
572; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
573; EG-NEXT:     ADD_INT T1.Z, T1.Y, 1,
574; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
575; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
576; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
577; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
578; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
579; EG-NEXT:     SETGE_UINT T1.W, PV.Y, T0.Z,
580; EG-NEXT:     SUB_INT * T2.W, PV.Y, T0.Z,
581; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS,
582; EG-NEXT:     CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
583; EG-NEXT:     ADD_INT T1.W, PV.Y, 1,
584; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
585; EG-NEXT:     CNDE_INT T1.Y, PS, T1.Y, PV.W,
586; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
587; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T0.Z,
588; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
589; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
590; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test. %a and %b are adjacent <2 x i32> elements (%b_ptr is %in
; advanced by one vector); the expected output above fetches both with one
; 128-bit load.
591  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
592  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
593  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
594  %result = udiv <2 x i32> %a, %b
595  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
596  ret void
597}
598
599define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
600; SI-LABEL: udiv_v4i32:
601; SI:       ; %bb.0:
602; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
603; SI-NEXT:    s_mov_b32 s11, 0xf000
604; SI-NEXT:    s_mov_b32 s10, -1
605; SI-NEXT:    s_mov_b32 s6, s10
606; SI-NEXT:    s_mov_b32 s7, s11
607; SI-NEXT:    s_waitcnt lgkmcnt(0)
608; SI-NEXT:    s_mov_b32 s4, s2
609; SI-NEXT:    s_mov_b32 s5, s3
610; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
611; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
612; SI-NEXT:    s_mov_b32 s8, s0
613; SI-NEXT:    s_mov_b32 s9, s1
614; SI-NEXT:    s_waitcnt vmcnt(1)
615; SI-NEXT:    v_cvt_f32_u32_e32 v8, v0
616; SI-NEXT:    v_cvt_f32_u32_e32 v10, v1
617; SI-NEXT:    v_cvt_f32_u32_e32 v12, v2
618; SI-NEXT:    v_cvt_f32_u32_e32 v14, v3
619; SI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
620; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
621; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
622; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
623; SI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
624; SI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
625; SI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
626; SI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
627; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
628; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
629; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
630; SI-NEXT:    v_cvt_u32_f32_e32 v14, v14
631; SI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
632; SI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
633; SI-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
634; SI-NEXT:    v_sub_i32_e32 v15, vcc, 0, v3
635; SI-NEXT:    v_mul_lo_u32 v9, v9, v8
636; SI-NEXT:    v_mul_lo_u32 v11, v11, v10
637; SI-NEXT:    v_mul_lo_u32 v13, v13, v12
638; SI-NEXT:    v_mul_lo_u32 v15, v15, v14
639; SI-NEXT:    v_mul_hi_u32 v9, v8, v9
640; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
641; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
642; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
643; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
644; SI-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
645; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v12
646; SI-NEXT:    v_add_i32_e32 v11, vcc, v15, v14
647; SI-NEXT:    s_waitcnt vmcnt(0)
648; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
649; SI-NEXT:    v_mul_hi_u32 v9, v5, v9
650; SI-NEXT:    v_mul_hi_u32 v10, v6, v10
651; SI-NEXT:    v_mul_hi_u32 v11, v7, v11
652; SI-NEXT:    v_mul_lo_u32 v12, v8, v0
653; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
654; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
655; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
656; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
657; SI-NEXT:    v_subrev_i32_e32 v5, vcc, v14, v5
658; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v16, v6
659; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v18, v7
660; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
661; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
662; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
663; SI-NEXT:    v_add_i32_e32 v19, vcc, 1, v11
664; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
665; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
666; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
667; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
668; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
669; SI-NEXT:    v_subrev_i32_e32 v12, vcc, v0, v4
670; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
671; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
672; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
673; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
674; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
675; SI-NEXT:    v_subrev_i32_e32 v15, vcc, v3, v7
676; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
677; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
678; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
679; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
680; SI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
681; SI-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
682; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
683; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v11
684; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
685; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
686; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
687; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
688; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
689; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
690; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
691; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
692; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
693; SI-NEXT:    s_endpgm
694;
695; VI-LABEL: udiv_v4i32:
696; VI:       ; %bb.0:
697; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
698; VI-NEXT:    s_mov_b32 s11, 0xf000
699; VI-NEXT:    s_mov_b32 s10, -1
700; VI-NEXT:    s_mov_b32 s6, s10
701; VI-NEXT:    s_mov_b32 s7, s11
702; VI-NEXT:    s_waitcnt lgkmcnt(0)
703; VI-NEXT:    s_mov_b32 s4, s2
704; VI-NEXT:    s_mov_b32 s5, s3
705; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
706; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
707; VI-NEXT:    s_mov_b32 s8, s0
708; VI-NEXT:    s_mov_b32 s9, s1
709; VI-NEXT:    s_waitcnt vmcnt(1)
710; VI-NEXT:    v_cvt_f32_u32_e32 v8, v0
711; VI-NEXT:    v_cvt_f32_u32_e32 v10, v1
712; VI-NEXT:    v_cvt_f32_u32_e32 v12, v2
713; VI-NEXT:    v_cvt_f32_u32_e32 v14, v3
714; VI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
715; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
716; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
717; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
718; VI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
719; VI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
720; VI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
721; VI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
722; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
723; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
724; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
725; VI-NEXT:    v_cvt_u32_f32_e32 v14, v14
726; VI-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
727; VI-NEXT:    v_sub_u32_e32 v11, vcc, 0, v1
728; VI-NEXT:    v_sub_u32_e32 v13, vcc, 0, v2
729; VI-NEXT:    v_sub_u32_e32 v15, vcc, 0, v3
730; VI-NEXT:    v_mul_lo_u32 v9, v9, v8
731; VI-NEXT:    v_mul_lo_u32 v11, v11, v10
732; VI-NEXT:    v_mul_lo_u32 v13, v13, v12
733; VI-NEXT:    v_mul_lo_u32 v15, v15, v14
734; VI-NEXT:    v_mul_hi_u32 v9, v8, v9
735; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
736; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
737; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
738; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
739; VI-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
740; VI-NEXT:    v_add_u32_e32 v10, vcc, v13, v12
741; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
742; VI-NEXT:    s_waitcnt vmcnt(0)
743; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
744; VI-NEXT:    v_mul_hi_u32 v9, v5, v9
745; VI-NEXT:    v_mul_hi_u32 v10, v6, v10
746; VI-NEXT:    v_mul_hi_u32 v11, v7, v11
747; VI-NEXT:    v_mul_lo_u32 v12, v8, v0
748; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
749; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
750; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
751; VI-NEXT:    v_subrev_u32_e32 v4, vcc, v12, v4
752; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
753; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
754; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v18, v7
755; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
756; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
757; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
758; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v11
759; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
760; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
761; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
762; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
763; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
764; VI-NEXT:    v_subrev_u32_e32 v12, vcc, v0, v4
765; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
766; VI-NEXT:    v_subrev_u32_e32 v13, vcc, v1, v5
767; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
768; VI-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
769; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
770; VI-NEXT:    v_subrev_u32_e32 v15, vcc, v3, v7
771; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
772; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
773; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
774; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
775; VI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
776; VI-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
777; VI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
778; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
779; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
780; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
781; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
782; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
783; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
784; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
785; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
786; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
787; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
788; VI-NEXT:    s_endpgm
789;
790; GCN-LABEL: udiv_v4i32:
791; GCN:       ; %bb.0:
792; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
793; GCN-NEXT:    s_waitcnt lgkmcnt(0)
794; GCN-NEXT:    s_add_u32 s4, s2, 16
795; GCN-NEXT:    s_addc_u32 s5, s3, 0
796; GCN-NEXT:    v_mov_b32_e32 v0, s4
797; GCN-NEXT:    v_mov_b32_e32 v1, s5
798; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
799; GCN-NEXT:    v_mov_b32_e32 v5, s3
800; GCN-NEXT:    v_mov_b32_e32 v4, s2
801; GCN-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
802; GCN-NEXT:    v_mov_b32_e32 v8, s0
803; GCN-NEXT:    v_mov_b32_e32 v9, s1
804; GCN-NEXT:    s_waitcnt vmcnt(1)
805; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v0
806; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v1
807; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v2
808; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v3
809; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
810; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
811; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
812; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
813; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
814; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
815; GCN-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
816; GCN-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
817; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
818; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
819; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
820; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
821; GCN-NEXT:    v_sub_u32_e32 v11, vcc, 0, v0
822; GCN-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
823; GCN-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
824; GCN-NEXT:    v_sub_u32_e32 v17, vcc, 0, v3
825; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
826; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
827; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
828; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
829; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
830; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
831; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
832; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
833; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
834; GCN-NEXT:    v_add_u32_e32 v11, vcc, v13, v12
835; GCN-NEXT:    v_add_u32_e32 v12, vcc, v15, v14
836; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
837; GCN-NEXT:    s_waitcnt vmcnt(0)
838; GCN-NEXT:    v_mul_hi_u32 v10, v4, v10
839; GCN-NEXT:    v_mul_hi_u32 v11, v5, v11
840; GCN-NEXT:    v_mul_hi_u32 v12, v6, v12
841; GCN-NEXT:    v_mul_hi_u32 v13, v7, v13
842; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
843; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
844; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
845; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
846; GCN-NEXT:    v_subrev_u32_e32 v4, vcc, v14, v4
847; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
848; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
849; GCN-NEXT:    v_subrev_u32_e32 v7, vcc, v19, v7
850; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
851; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
852; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
853; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
854; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
855; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
856; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
857; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
858; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
859; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v0, v4
860; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
861; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v1, v5
862; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
863; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
864; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
865; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v3, v7
866; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v15, s[0:1]
867; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
868; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[2:3]
869; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
870; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
871; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
872; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[6:7]
873; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
874; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
875; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v15, vcc
876; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
877; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v17, vcc
878; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
879; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc
880; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
881; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc
882; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
883; GCN-NEXT:    s_endpgm
884;
885; GFX1030-LABEL: udiv_v4i32:
886; GFX1030:       ; %bb.0:
887; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
888; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
889; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX1030-NEXT:    s_clause 0x1
891; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
892; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
893; GFX1030-NEXT:    s_waitcnt vmcnt(1)
894; GFX1030-NEXT:    v_cvt_f32_u32_e32 v9, v0
895; GFX1030-NEXT:    v_cvt_f32_u32_e32 v10, v1
896; GFX1030-NEXT:    v_cvt_f32_u32_e32 v11, v2
897; GFX1030-NEXT:    v_cvt_f32_u32_e32 v12, v3
898; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, 0, v0
899; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v9, v9
900; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v10, v10
901; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v11, v11
902; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v12, v12
903; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
904; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
905; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
906; GFX1030-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
907; GFX1030-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
908; GFX1030-NEXT:    v_mul_f32_e32 v11, 0x4f7ffffe, v11
909; GFX1030-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
910; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
911; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
912; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
913; GFX1030-NEXT:    v_cvt_u32_f32_e32 v12, v12
914; GFX1030-NEXT:    v_mul_lo_u32 v13, v13, v9
915; GFX1030-NEXT:    v_mul_lo_u32 v14, v14, v10
916; GFX1030-NEXT:    v_mul_lo_u32 v15, v15, v11
917; GFX1030-NEXT:    v_mul_lo_u32 v16, v16, v12
918; GFX1030-NEXT:    v_mul_hi_u32 v13, v9, v13
919; GFX1030-NEXT:    v_mul_hi_u32 v14, v10, v14
920; GFX1030-NEXT:    v_mul_hi_u32 v15, v11, v15
921; GFX1030-NEXT:    v_mul_hi_u32 v16, v12, v16
922; GFX1030-NEXT:    v_add_nc_u32_e32 v9, v9, v13
923; GFX1030-NEXT:    v_add_nc_u32_e32 v10, v10, v14
924; GFX1030-NEXT:    v_add_nc_u32_e32 v11, v11, v15
925; GFX1030-NEXT:    v_add_nc_u32_e32 v12, v12, v16
926; GFX1030-NEXT:    s_waitcnt vmcnt(0)
927; GFX1030-NEXT:    v_mul_hi_u32 v9, v4, v9
928; GFX1030-NEXT:    v_mul_hi_u32 v10, v5, v10
929; GFX1030-NEXT:    v_mul_hi_u32 v11, v6, v11
930; GFX1030-NEXT:    v_mul_hi_u32 v12, v7, v12
931; GFX1030-NEXT:    v_mul_lo_u32 v13, v9, v0
932; GFX1030-NEXT:    v_mul_lo_u32 v14, v10, v1
933; GFX1030-NEXT:    v_mul_lo_u32 v15, v11, v2
934; GFX1030-NEXT:    v_mul_lo_u32 v16, v12, v3
935; GFX1030-NEXT:    v_add_nc_u32_e32 v17, 1, v9
936; GFX1030-NEXT:    v_add_nc_u32_e32 v18, 1, v10
937; GFX1030-NEXT:    v_add_nc_u32_e32 v19, 1, v11
938; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v13
939; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v5, v14
940; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v6, v15
941; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v7, v16
942; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
943; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
944; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, v4, v0
945; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v5, v1
946; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, v5, v1
947; GFX1030-NEXT:    v_cmp_ge_u32_e64 s1, v6, v2
948; GFX1030-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc_lo
949; GFX1030-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
950; GFX1030-NEXT:    v_cndmask_b32_e64 v10, v10, v18, s0
951; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, v6, v2
952; GFX1030-NEXT:    v_cmp_ge_u32_e64 s2, v7, v3
953; GFX1030-NEXT:    v_add_nc_u32_e32 v14, 1, v9
954; GFX1030-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s0
955; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
956; GFX1030-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s1
957; GFX1030-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s2
958; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, v7, v3
959; GFX1030-NEXT:    v_add_nc_u32_e32 v15, 1, v10
960; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s1
961; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v9, v14, vcc_lo
962; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v5, v1
963; GFX1030-NEXT:    v_add_nc_u32_e32 v16, 1, v11
964; GFX1030-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
965; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
966; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v10, v15, vcc_lo
967; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v6, v2
968; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v11, v16, vcc_lo
969; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v7, v3
970; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc_lo
971; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
972; GFX1030-NEXT:    s_endpgm
973;
974; EG-LABEL: udiv_v4i32:
975; EG:       ; %bb.0:
976; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
977; EG-NEXT:    TEX 1 @6
978; EG-NEXT:    ALU 65, @11, KC0[CB0:0-32], KC1[]
979; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
980; EG-NEXT:    CF_END
981; EG-NEXT:    PAD
982; EG-NEXT:    Fetch clause starting at 6:
983; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
984; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
985; EG-NEXT:    ALU clause starting at 10:
986; EG-NEXT:     MOV * T0.X, KC0[2].Z,
987; EG-NEXT:    ALU clause starting at 11:
988; EG-NEXT:     SUB_INT T2.W, 0.0, T1.W,
989; EG-NEXT:     RECIP_UINT * T2.X, T1.W,
990; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
991; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
992; EG-NEXT:     ADD_INT * T2.W, T2.X, PS,
993; EG-NEXT:     MULHI * T2.X, T0.W, PV.W,
994; EG-NEXT:     MULLO_INT * T2.Y, PS, T1.W,
995; EG-NEXT:     SUB_INT T2.W, 0.0, T1.X,
996; EG-NEXT:     RECIP_UINT * T2.Z, T1.X,
997; EG-NEXT:     MULLO_INT * T2.W, PV.W, PS,
998; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Y,
999; EG-NEXT:     RECIP_UINT * T3.X, T1.Y,
1000; EG-NEXT:     MULLO_INT * T3.Y, PV.W, PS,
1001; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Z,
1002; EG-NEXT:     RECIP_UINT * T3.Z, T1.Z,
1003; EG-NEXT:     MULLO_INT * T3.W, PV.W, PS,
1004; EG-NEXT:     MULHI * T3.W, T3.Z, PS,
1005; EG-NEXT:     ADD_INT T3.W, T3.Z, PS,
1006; EG-NEXT:     MULHI * T3.Y, T3.X, T3.Y,
1007; EG-NEXT:     ADD_INT T4.W, T3.X, PS,
1008; EG-NEXT:     MULHI * T3.X, T0.Z, PV.W,
1009; EG-NEXT:     MULHI * T3.Y, T0.Y, PV.W,
1010; EG-NEXT:     MULLO_INT * T3.Z, PS, T1.Y,
1011; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1012; EG-NEXT:     MULLO_INT * T0.Y, T3.X, T1.Z,
1013; EG-NEXT:     SUB_INT T4.X, T0.Z, PS,
1014; EG-NEXT:     ADD_INT T0.Y, T3.Y, 1,
1015; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.Y,
1016; EG-NEXT:     SUB_INT T4.W, PV.W, T1.Y,
1017; EG-NEXT:     MULHI * T2.W, T2.Z, T2.W,
1018; EG-NEXT:     CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1019; EG-NEXT:     CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1020; EG-NEXT:     SETGE_UINT T0.Z, PV.X, T1.Z,
1021; EG-NEXT:     ADD_INT T2.W, T2.Z, PS,
1022; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.Y,
1023; EG-NEXT:     ADD_INT T6.X, T3.X, 1,
1024; EG-NEXT:     ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1025; EG-NEXT:     SETGE_UINT T2.Z, PS, T1.W,
1026; EG-NEXT:     SUB_INT T3.W, PS, T1.W,
1027; EG-NEXT:     MULHI * T2.W, T0.X, PV.W,
1028; EG-NEXT:     SUB_INT T7.X, T4.X, T1.Z,
1029; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1030; EG-NEXT:     CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1031; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1032; EG-NEXT:     MULLO_INT * T2.X, T2.W, T1.X,
1033; EG-NEXT:     ADD_INT T3.X, T0.W, 1,
1034; EG-NEXT:     ADD_INT T2.Y, T2.Z, 1,
1035; EG-NEXT:     SETGE_UINT T3.Z, T3.Y, T1.W,
1036; EG-NEXT:     SUB_INT T1.W, T0.X, PS, BS:VEC_201
1037; EG-NEXT:     CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1038; EG-NEXT:     SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1039; EG-NEXT:     ADD_INT T3.Y, T2.W, 1,
1040; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.X,
1041; EG-NEXT:     SUB_INT T3.W, PV.W, T1.X,
1042; EG-NEXT:     CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1043; EG-NEXT:     CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1044; EG-NEXT:     CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1045; EG-NEXT:     CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1046; EG-NEXT:     ADD_INT T0.W, T0.Y, 1,
1047; EG-NEXT:     SETGE_UINT * T1.W, T5.X, T1.Y,
1048; EG-NEXT:     CNDE_INT T4.Y, PS, T0.Y, PV.W,
1049; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
1050; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.X,
1051; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
1052; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1053; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1054  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1055  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
1056  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
1057  %result = udiv <4 x i32> %a, %b
1058  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1059  ret void
1060}
1061
; Unsigned 32-bit division by a constant power of two (16).
; The checks below expect every target to emit a single logical shift right
; by 4 on the loaded dividend, with no reciprocal or multiply-high sequence.
define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_pow2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_pow2:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_pow2:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
  ; Only the dividend is read from memory; the divisor is the immediate 16.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 16
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1146
; Unsigned 32-bit division by the even constant 34259182.
; The checks below expect a multiply-high by the constant 0xfabbd9c1
; followed by a logical shift right by 25, with no reciprocal expansion.
; The r600 output prints the same 32-bit pattern as the signed literal
; -88352319.
define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_even:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_even:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0xfabbd9c1
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 25, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_even:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0xfabbd9c1, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_even:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    -88352319(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
  ; Only the dividend is read from memory; the divisor is an immediate.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259182
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1240
; Unsigned 32-bit division by the odd constant 34259183.
; The checks below expect a multiply-high by the constant 0x7d5deca3
; followed by a logical shift right by 24, with no reciprocal expansion.
; The r600 output prints the same constant in decimal, 2103307427.
define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: udiv_i32_div_k_odd:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s2, 0x7d5deca3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_dword v0, v[0:1]
; GCN-NEXT:    s_mov_b32 s2, 0x7d5deca3
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_odd:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x7d5deca3, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: udiv_i32_div_k_odd:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
; EG-NEXT:    2103307427(1.843675e+37), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PS, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
  ; Only the dividend is read from memory; the divisor is an immediate.
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259183
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
1334
; Unsigned 8-bit division of two adjacent bytes: numerator at %in[0],
; denominator at %in[1]; the quotient is zero-extended and stored as i32.
; The checks below expect a float-based expansion rather than the integer
; reciprocal sequence: both operands are converted to f32, the numerator is
; multiplied by the reciprocal of the denominator and truncated, the
; remainder is formed with a multiply-add, and the quotient is bumped by one
; when |remainder| >= denominator before being masked with 0xff.
; On the two amdhsa run lines the byte loads are combined into one ushort
; load and unpacked with v_cvt_f32_ubyte1 / v_cvt_f32_ubyte0; gfx1030 uses
; v_fma_f32 where the older targets use v_mad_f32.
define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; SI-LABEL: v_udiv_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_udiv_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GCN-LABEL: v_udiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    flat_load_ushort v2, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
; GCN-NEXT:    v_trunc_f32_e32 v4, v4
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT:    v_mad_f32 v2, -v4, v3, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GCN-NEXT:    flat_store_dword v[0:1], v2
; GCN-NEXT:    s_endpgm
;
; GFX1030-LABEL: v_udiv_i8:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    v_cvt_f32_ubyte1_e32 v2, v1
; GFX1030-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT:    s_endpgm
;
; EG-LABEL: v_udiv_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT:     TRUNC * T0.W, PV.W,
; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
  ; The denominator lives one byte past the numerator.
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1)* %in
  %den = load i8, i8 addrspace(1)* %den_ptr
  %result = udiv i8 %num, %den
  ; Widen to i32 so the store is a full dword on every target.
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}
1474
; v_udiv_i16: unsigned 16-bit division with both operands loaded from memory.
; The numerator is read from %in and the denominator from the next i16
; element; the i16 quotient is zero-extended to i32 and stored to %out.
; The checked output for every run line follows the same shape: convert both
; operands to float, take the reciprocal of the denominator, multiply,
; truncate, compute the remainder with a negated multiply-add, add one to the
; converted quotient when |remainder| >= denominator, then mask to 16 bits.
1475define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
1476; SI-LABEL: v_udiv_i16:
1477; SI:       ; %bb.0:
1478; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1479; SI-NEXT:    s_mov_b32 s7, 0xf000
1480; SI-NEXT:    s_mov_b32 s6, -1
1481; SI-NEXT:    s_mov_b32 s10, s6
1482; SI-NEXT:    s_mov_b32 s11, s7
1483; SI-NEXT:    s_waitcnt lgkmcnt(0)
1484; SI-NEXT:    s_mov_b32 s8, s2
1485; SI-NEXT:    s_mov_b32 s9, s3
1486; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1487; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1488; SI-NEXT:    s_mov_b32 s4, s0
1489; SI-NEXT:    s_mov_b32 s5, s1
1490; SI-NEXT:    s_waitcnt vmcnt(1)
1491; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1492; SI-NEXT:    s_waitcnt vmcnt(0)
1493; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1494; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1495; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1496; SI-NEXT:    v_trunc_f32_e32 v2, v2
1497; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1498; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1499; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1500; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1501; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1502; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1503; SI-NEXT:    s_endpgm
1504;
1505; VI-LABEL: v_udiv_i16:
1506; VI:       ; %bb.0:
1507; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1508; VI-NEXT:    s_mov_b32 s7, 0xf000
1509; VI-NEXT:    s_mov_b32 s6, -1
1510; VI-NEXT:    s_mov_b32 s10, s6
1511; VI-NEXT:    s_mov_b32 s11, s7
1512; VI-NEXT:    s_waitcnt lgkmcnt(0)
1513; VI-NEXT:    s_mov_b32 s8, s2
1514; VI-NEXT:    s_mov_b32 s9, s3
1515; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1516; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1517; VI-NEXT:    s_mov_b32 s4, s0
1518; VI-NEXT:    s_mov_b32 s5, s1
1519; VI-NEXT:    s_waitcnt vmcnt(1)
1520; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1521; VI-NEXT:    s_waitcnt vmcnt(0)
1522; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1523; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1524; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1525; VI-NEXT:    v_trunc_f32_e32 v2, v2
1526; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1527; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1528; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1529; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1530; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1531; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1532; VI-NEXT:    s_endpgm
1533;
1534; GCN-LABEL: v_udiv_i16:
1535; GCN:       ; %bb.0:
1536; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1537; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1538; GCN-NEXT:    s_add_u32 s4, s2, 2
1539; GCN-NEXT:    s_addc_u32 s5, s3, 0
1540; GCN-NEXT:    v_mov_b32_e32 v0, s4
1541; GCN-NEXT:    v_mov_b32_e32 v1, s5
1542; GCN-NEXT:    flat_load_ushort v2, v[0:1]
1543; GCN-NEXT:    v_mov_b32_e32 v0, s2
1544; GCN-NEXT:    v_mov_b32_e32 v1, s3
1545; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1546; GCN-NEXT:    v_mov_b32_e32 v1, s1
1547; GCN-NEXT:    s_waitcnt vmcnt(1)
1548; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1549; GCN-NEXT:    s_waitcnt vmcnt(0)
1550; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1551; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1552; GCN-NEXT:    v_mov_b32_e32 v0, s0
1553; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1554; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1555; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1556; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1557; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1558; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1559; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1560; GCN-NEXT:    flat_store_dword v[0:1], v2
1561; GCN-NEXT:    s_endpgm
1562;
1563; GFX1030-LABEL: v_udiv_i16:
1564; GFX1030:       ; %bb.0:
1565; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1566; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1567; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX1030-NEXT:    s_clause 0x1
1569; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3] offset:2
1570; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3]
1571; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1572; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1573; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1574; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1575; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1576; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1577; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1578; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1579; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1580; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1581; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1582; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1583; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1584; GFX1030-NEXT:    s_endpgm
1585;
1586; EG-LABEL: v_udiv_i16:
1587; EG:       ; %bb.0:
1588; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1589; EG-NEXT:    TEX 1 @6
1590; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1591; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1592; EG-NEXT:    CF_END
1593; EG-NEXT:    PAD
1594; EG-NEXT:    Fetch clause starting at 6:
1595; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1596; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1597; EG-NEXT:    ALU clause starting at 10:
1598; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1599; EG-NEXT:    ALU clause starting at 11:
1600; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1601; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1602; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1603; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1604; EG-NEXT:     TRUNC * T0.W, PV.W,
1605; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1606; EG-NEXT:     TRUNC * T0.W, PV.W,
1607; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1608; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1609; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1610; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1611; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1612; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1613; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1614; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
; %den_ptr addresses the i16 element immediately after %in (index 1), which is
; why the checked loads above use byte offsets 0 (numerator) and 2 (denominator).
1615  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
1616  %num = load i16, i16 addrspace(1) * %in
1617  %den = load i16, i16 addrspace(1) * %den_ptr
1618  %result = udiv i16 %num, %den
1619  %result.ext = zext i16 %result to i32
1620  store i32 %result.ext, i32 addrspace(1)* %out
1621  ret void
1622}
1623
; v_udiv_i23: unsigned division on the non-power-of-two integer type i23.
; The numerator is read from %in and the denominator from the next i23
; element; the quotient is zero-extended to i32 and stored to %out.
; In the checked output each operand is assembled from a 16-bit load plus an
; 8-bit load (byte offsets 0/2 for the numerator, 4/6 for the denominator)
; combined with shift-left-16 and OR. The division itself uses the same
; float-reciprocal sequence as the i16 test, and the result is masked with
; 0x7fffff (8388607), i.e. 23 bits.
1624define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1625; SI-LABEL: v_udiv_i23:
1626; SI:       ; %bb.0:
1627; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1628; SI-NEXT:    s_mov_b32 s7, 0xf000
1629; SI-NEXT:    s_mov_b32 s6, -1
1630; SI-NEXT:    s_mov_b32 s10, s6
1631; SI-NEXT:    s_mov_b32 s11, s7
1632; SI-NEXT:    s_waitcnt lgkmcnt(0)
1633; SI-NEXT:    s_mov_b32 s8, s2
1634; SI-NEXT:    s_mov_b32 s9, s3
1635; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1636; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1637; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1638; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1639; SI-NEXT:    s_mov_b32 s4, s0
1640; SI-NEXT:    s_mov_b32 s5, s1
1641; SI-NEXT:    s_waitcnt vmcnt(3)
1642; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1643; SI-NEXT:    s_waitcnt vmcnt(2)
1644; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1645; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1646; SI-NEXT:    s_waitcnt vmcnt(1)
1647; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1648; SI-NEXT:    s_waitcnt vmcnt(0)
1649; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1650; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1651; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1652; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1653; SI-NEXT:    v_trunc_f32_e32 v2, v2
1654; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1655; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1656; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1657; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1658; SI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1659; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1660; SI-NEXT:    s_endpgm
1661;
1662; VI-LABEL: v_udiv_i23:
1663; VI:       ; %bb.0:
1664; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1665; VI-NEXT:    s_mov_b32 s7, 0xf000
1666; VI-NEXT:    s_mov_b32 s6, -1
1667; VI-NEXT:    s_mov_b32 s10, s6
1668; VI-NEXT:    s_mov_b32 s11, s7
1669; VI-NEXT:    s_waitcnt lgkmcnt(0)
1670; VI-NEXT:    s_mov_b32 s8, s2
1671; VI-NEXT:    s_mov_b32 s9, s3
1672; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1673; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1674; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1675; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1676; VI-NEXT:    s_mov_b32 s4, s0
1677; VI-NEXT:    s_mov_b32 s5, s1
1678; VI-NEXT:    s_waitcnt vmcnt(3)
1679; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1680; VI-NEXT:    s_waitcnt vmcnt(2)
1681; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1682; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1683; VI-NEXT:    s_waitcnt vmcnt(1)
1684; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1685; VI-NEXT:    s_waitcnt vmcnt(0)
1686; VI-NEXT:    v_or_b32_e32 v1, v3, v1
1687; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1688; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1689; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1690; VI-NEXT:    v_trunc_f32_e32 v2, v2
1691; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1692; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1693; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1694; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1695; VI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1696; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1697; VI-NEXT:    s_endpgm
1698;
1699; GCN-LABEL: v_udiv_i23:
1700; GCN:       ; %bb.0:
1701; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1702; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1703; GCN-NEXT:    s_add_u32 s4, s2, 4
1704; GCN-NEXT:    s_addc_u32 s5, s3, 0
1705; GCN-NEXT:    s_add_u32 s6, s2, 2
1706; GCN-NEXT:    s_addc_u32 s7, s3, 0
1707; GCN-NEXT:    v_mov_b32_e32 v0, s6
1708; GCN-NEXT:    v_mov_b32_e32 v1, s7
1709; GCN-NEXT:    s_add_u32 s6, s2, 6
1710; GCN-NEXT:    s_addc_u32 s7, s3, 0
1711; GCN-NEXT:    v_mov_b32_e32 v2, s6
1712; GCN-NEXT:    v_mov_b32_e32 v3, s7
1713; GCN-NEXT:    v_mov_b32_e32 v4, s4
1714; GCN-NEXT:    v_mov_b32_e32 v5, s5
1715; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
1716; GCN-NEXT:    flat_load_ushort v4, v[4:5]
1717; GCN-NEXT:    v_mov_b32_e32 v2, s2
1718; GCN-NEXT:    v_mov_b32_e32 v3, s3
1719; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
1720; GCN-NEXT:    flat_load_ushort v1, v[2:3]
1721; GCN-NEXT:    s_waitcnt vmcnt(3)
1722; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
1723; GCN-NEXT:    s_waitcnt vmcnt(2)
1724; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
1725; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1726; GCN-NEXT:    s_waitcnt vmcnt(1)
1727; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1728; GCN-NEXT:    s_waitcnt vmcnt(0)
1729; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
1730; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1731; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1732; GCN-NEXT:    v_mov_b32_e32 v0, s0
1733; GCN-NEXT:    v_mov_b32_e32 v1, s1
1734; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1735; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1736; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1737; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1738; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1739; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1740; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v2
1741; GCN-NEXT:    flat_store_dword v[0:1], v2
1742; GCN-NEXT:    s_endpgm
1743;
1744; GFX1030-LABEL: v_udiv_i23:
1745; GFX1030:       ; %bb.0:
1746; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1747; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1748; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1749; GFX1030-NEXT:    s_clause 0x3
1750; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1751; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1752; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1753; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1754; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1755; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1756; GFX1030-NEXT:    s_waitcnt vmcnt(2)
1757; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1758; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1759; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1760; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1761; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1762; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
1763; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1764; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1765; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1766; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1767; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1768; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1769; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1770; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1771; GFX1030-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
1772; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1773; GFX1030-NEXT:    s_endpgm
1774;
1775; EG-LABEL: v_udiv_i23:
1776; EG:       ; %bb.0:
1777; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1778; EG-NEXT:    TEX 3 @6
1779; EG-NEXT:    ALU 20, @15, KC0[CB0:0-32], KC1[]
1780; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1781; EG-NEXT:    CF_END
1782; EG-NEXT:    PAD
1783; EG-NEXT:    Fetch clause starting at 6:
1784; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1785; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1786; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1787; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1788; EG-NEXT:    ALU clause starting at 14:
1789; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1790; EG-NEXT:    ALU clause starting at 15:
1791; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1792; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1793; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1794; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1795; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1796; EG-NEXT:     UINT_TO_FLT * T0.X, PV.W,
1797; EG-NEXT:     OR_INT T0.W, T2.X, T1.W,
1798; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1799; EG-NEXT:     UINT_TO_FLT * T0.Z, PV.W,
1800; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Y,
1801; EG-NEXT:     TRUNC * T0.W, PV.W,
1802; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
1803; EG-NEXT:     TRUNC * T0.W, PV.W,
1804; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
1805; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1806; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1807; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1808; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1809; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1810; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1811; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
; %den_ptr addresses the i23 element at index 1; the checked loads above place
; it 4 bytes after %in.
1812  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1813  %num = load i23, i23 addrspace(1) * %in
1814  %den = load i23, i23 addrspace(1) * %den_ptr
1815  %result = udiv i23 %num, %den
1816  %result.ext = zext i23 %result to i32
1817  store i32 %result.ext, i32 addrspace(1)* %out
1818  ret void
1819}
1820
; v_udiv_i24: unsigned division on i24 operands loaded from memory.
; The numerator is read from %in and the denominator from the next i24
; element; the quotient is zero-extended to i32 and stored to %out.
; Operands are assembled exactly as in the i23 test (16-bit + 8-bit loads at
; byte offsets 0/2 and 4/6, shift-left-16, OR). Unlike the i8/i16/i23 tests,
; the checked output here is the integer expansion: a reciprocal estimate
; (rcp scaled by 0x4f7ffffe on the amdgcn targets, RECIP_UINT on r600),
; one mul_lo/mul_hi refinement step, a mul_hi to form the quotient estimate,
; and two compare-and-select correction steps.
; NOTE(review): presumably 24-bit operands are too wide for the float-based
; sequence used by the narrower tests - confirm against the AMDGPU lowering.
1821define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1822; SI-LABEL: v_udiv_i24:
1823; SI:       ; %bb.0:
1824; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1825; SI-NEXT:    s_mov_b32 s7, 0xf000
1826; SI-NEXT:    s_mov_b32 s6, -1
1827; SI-NEXT:    s_mov_b32 s10, s6
1828; SI-NEXT:    s_mov_b32 s11, s7
1829; SI-NEXT:    s_waitcnt lgkmcnt(0)
1830; SI-NEXT:    s_mov_b32 s8, s2
1831; SI-NEXT:    s_mov_b32 s9, s3
1832; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1833; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1834; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1835; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1836; SI-NEXT:    s_mov_b32 s4, s0
1837; SI-NEXT:    s_mov_b32 s5, s1
1838; SI-NEXT:    s_waitcnt vmcnt(3)
1839; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1840; SI-NEXT:    s_waitcnt vmcnt(2)
1841; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1842; SI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1843; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1844; SI-NEXT:    s_waitcnt vmcnt(1)
1845; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1846; SI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1847; SI-NEXT:    s_waitcnt vmcnt(0)
1848; SI-NEXT:    v_or_b32_e32 v2, v3, v2
1849; SI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1850; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1851; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
1852; SI-NEXT:    v_mul_hi_u32 v4, v1, v4
1853; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1854; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
1855; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
1856; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1857; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
1858; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1859; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1860; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
1861; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1862; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
1863; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1864; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1865; SI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1866; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1867; SI-NEXT:    s_endpgm
1868;
1869; VI-LABEL: v_udiv_i24:
1870; VI:       ; %bb.0:
1871; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1872; VI-NEXT:    s_mov_b32 s7, 0xf000
1873; VI-NEXT:    s_mov_b32 s6, -1
1874; VI-NEXT:    s_mov_b32 s10, s6
1875; VI-NEXT:    s_mov_b32 s11, s7
1876; VI-NEXT:    s_waitcnt lgkmcnt(0)
1877; VI-NEXT:    s_mov_b32 s8, s2
1878; VI-NEXT:    s_mov_b32 s9, s3
1879; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1880; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1881; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1882; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1883; VI-NEXT:    s_mov_b32 s4, s0
1884; VI-NEXT:    s_mov_b32 s5, s1
1885; VI-NEXT:    s_waitcnt vmcnt(3)
1886; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1887; VI-NEXT:    s_waitcnt vmcnt(2)
1888; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1889; VI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1890; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
1891; VI-NEXT:    s_waitcnt vmcnt(1)
1892; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1893; VI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1894; VI-NEXT:    s_waitcnt vmcnt(0)
1895; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1896; VI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1897; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1898; VI-NEXT:    v_mul_lo_u32 v4, v4, v1
1899; VI-NEXT:    v_mul_hi_u32 v4, v1, v4
1900; VI-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
1901; VI-NEXT:    v_mul_hi_u32 v1, v2, v1
1902; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
1903; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
1904; VI-NEXT:    v_subrev_u32_e32 v2, vcc, v3, v2
1905; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1906; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1907; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v0, v2
1908; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1909; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
1910; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1911; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1912; VI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1913; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1914; VI-NEXT:    s_endpgm
1915;
1916; GCN-LABEL: v_udiv_i24:
1917; GCN:       ; %bb.0:
1918; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1919; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1920; GCN-NEXT:    s_add_u32 s4, s2, 4
1921; GCN-NEXT:    s_addc_u32 s5, s3, 0
1922; GCN-NEXT:    s_add_u32 s6, s2, 2
1923; GCN-NEXT:    v_mov_b32_e32 v0, s4
1924; GCN-NEXT:    s_addc_u32 s7, s3, 0
1925; GCN-NEXT:    v_mov_b32_e32 v1, s5
1926; GCN-NEXT:    s_add_u32 s4, s2, 6
1927; GCN-NEXT:    s_addc_u32 s5, s3, 0
1928; GCN-NEXT:    v_mov_b32_e32 v2, s4
1929; GCN-NEXT:    v_mov_b32_e32 v3, s5
1930; GCN-NEXT:    flat_load_ubyte v4, v[2:3]
1931; GCN-NEXT:    flat_load_ushort v5, v[0:1]
1932; GCN-NEXT:    v_mov_b32_e32 v2, s6
1933; GCN-NEXT:    v_mov_b32_e32 v0, s2
1934; GCN-NEXT:    v_mov_b32_e32 v3, s7
1935; GCN-NEXT:    v_mov_b32_e32 v1, s3
1936; GCN-NEXT:    flat_load_ubyte v2, v[2:3]
1937; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1938; GCN-NEXT:    s_waitcnt vmcnt(3)
1939; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
1940; GCN-NEXT:    s_waitcnt vmcnt(2)
1941; GCN-NEXT:    v_or_b32_e32 v3, v5, v1
1942; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v3
1943; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
1944; GCN-NEXT:    s_waitcnt vmcnt(1)
1945; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1946; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1947; GCN-NEXT:    s_waitcnt vmcnt(0)
1948; GCN-NEXT:    v_or_b32_e32 v2, v0, v2
1949; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1950; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1951; GCN-NEXT:    v_mul_lo_u32 v4, v4, v1
1952; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1953; GCN-NEXT:    v_add_u32_e32 v0, vcc, v4, v1
1954; GCN-NEXT:    v_mul_hi_u32 v4, v2, v0
1955; GCN-NEXT:    v_mov_b32_e32 v0, s0
1956; GCN-NEXT:    v_mov_b32_e32 v1, s1
1957; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
1958; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
1959; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, v5, v2
1960; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
1961; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
1962; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v3, v2
1963; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1964; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
1965; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
1966; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
1967; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
1968; GCN-NEXT:    flat_store_dword v[0:1], v2
1969; GCN-NEXT:    s_endpgm
1970;
1971; GFX1030-LABEL: v_udiv_i24:
1972; GFX1030:       ; %bb.0:
1973; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1974; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1975; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1976; GFX1030-NEXT:    s_clause 0x3
1977; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1978; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1979; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1980; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1981; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1982; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1983; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1984; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1985; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1986; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1987; GFX1030-NEXT:    v_or_b32_e32 v3, v4, v3
1988; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v1
1989; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
1990; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1991; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1992; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
1993; GFX1030-NEXT:    v_mul_lo_u32 v5, v5, v2
1994; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v5
1995; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v2, v5
1996; GFX1030-NEXT:    v_mul_hi_u32 v2, v3, v2
1997; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, v1
1998; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
1999; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2000; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v3, v1
2001; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2002; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
2003; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
2004; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2005; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2006; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
2007; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
2008; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
2009; GFX1030-NEXT:    s_endpgm
2010;
2011; EG-LABEL: v_udiv_i24:
2012; EG:       ; %bb.0:
2013; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
2014; EG-NEXT:    TEX 3 @6
2015; EG-NEXT:    ALU 23, @15, KC0[CB0:0-32], KC1[]
2016; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2017; EG-NEXT:    CF_END
2018; EG-NEXT:    PAD
2019; EG-NEXT:    Fetch clause starting at 6:
2020; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
2021; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
2022; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
2023; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
2024; EG-NEXT:    ALU clause starting at 14:
2025; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2026; EG-NEXT:    ALU clause starting at 15:
2027; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
2028; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2029; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
2030; EG-NEXT:     SUB_INT T1.W, 0.0, PV.W,
2031; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2032; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
2033; EG-NEXT:     LSHL T1.W, T3.X, literal.x,
2034; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2035; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2036; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2037; EG-NEXT:     OR_INT * T1.W, T2.X, PV.W,
2038; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2039; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2040; EG-NEXT:     SUB_INT * T1.W, T1.W, PS,
2041; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2042; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
2043; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
2044; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.W, PS,
2045; EG-NEXT:     CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
2046; EG-NEXT:     ADD_INT T3.W, PS, 1,
2047; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2048; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
2049; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2050; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; %den_ptr addresses the i24 element at index 1; the checked loads above place
; it 4 bytes after %in.
2051  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
2052  %num = load i24, i24 addrspace(1) * %in
2053  %den = load i24, i24 addrspace(1) * %den_ptr
2054  %result = udiv i24 %num, %den
2055  %result.ext = zext i24 %result to i32
2056  store i32 %result.ext, i32 addrspace(1)* %out
2057  ret void
2058}
2059
; scalarize_mulhu_4xi32: unsigned division of a <4 x i32> vector by the
; constant splat 53668. Note the argument order: %in is the first kernel
; argument and %out the second (the reverse of the v_udiv_* tests above).
; The checked output contains no division sequence. Each lane is handled
; separately as: logical shift right by 2, high-half multiply by the magic
; constant 0x1389c755 (327796565), then logical shift right by 10. The four
; results are written back with a single 128-bit store.
2060define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2061; SI-LABEL: scalarize_mulhu_4xi32:
2062; SI:       ; %bb.0:
2063; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2064; SI-NEXT:    s_mov_b32 s7, 0xf000
2065; SI-NEXT:    s_mov_b32 s6, -1
2066; SI-NEXT:    s_waitcnt lgkmcnt(0)
2067; SI-NEXT:    s_mov_b32 s4, s0
2068; SI-NEXT:    s_mov_b32 s5, s1
2069; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2070; SI-NEXT:    s_mov_b32 s0, 0x1389c755
2071; SI-NEXT:    s_mov_b32 s4, s2
2072; SI-NEXT:    s_mov_b32 s5, s3
2073; SI-NEXT:    s_waitcnt vmcnt(0)
2074; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2075; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2076; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2077; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2078; SI-NEXT:    v_mul_hi_u32 v0, v0, s0
2079; SI-NEXT:    v_mul_hi_u32 v1, v1, s0
2080; SI-NEXT:    v_mul_hi_u32 v2, v2, s0
2081; SI-NEXT:    v_mul_hi_u32 v3, v3, s0
2082; SI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2083; SI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2084; SI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2085; SI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2086; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2087; SI-NEXT:    s_endpgm
2088;
2089; VI-LABEL: scalarize_mulhu_4xi32:
2090; VI:       ; %bb.0:
2091; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2092; VI-NEXT:    s_mov_b32 s7, 0xf000
2093; VI-NEXT:    s_mov_b32 s6, -1
2094; VI-NEXT:    s_waitcnt lgkmcnt(0)
2095; VI-NEXT:    s_mov_b32 s4, s0
2096; VI-NEXT:    s_mov_b32 s5, s1
2097; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2098; VI-NEXT:    s_mov_b32 s0, 0x1389c755
2099; VI-NEXT:    s_mov_b32 s4, s2
2100; VI-NEXT:    s_mov_b32 s5, s3
2101; VI-NEXT:    s_waitcnt vmcnt(0)
2102; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2103; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2104; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2105; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2106; VI-NEXT:    v_mul_hi_u32 v0, v0, s0
2107; VI-NEXT:    v_mul_hi_u32 v1, v1, s0
2108; VI-NEXT:    v_mul_hi_u32 v2, v2, s0
2109; VI-NEXT:    v_mul_hi_u32 v3, v3, s0
2110; VI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2111; VI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2112; VI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2113; VI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2114; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2115; VI-NEXT:    s_endpgm
2116;
2117; GCN-LABEL: scalarize_mulhu_4xi32:
2118; GCN:       ; %bb.0:
2119; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2120; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2121; GCN-NEXT:    v_mov_b32_e32 v0, s0
2122; GCN-NEXT:    v_mov_b32_e32 v1, s1
2123; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2124; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2125; GCN-NEXT:    v_mov_b32_e32 v4, s2
2126; GCN-NEXT:    v_mov_b32_e32 v5, s3
2127; GCN-NEXT:    s_waitcnt vmcnt(0)
2128; GCN-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2129; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2130; GCN-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2131; GCN-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2132; GCN-NEXT:    v_mul_hi_u32 v0, v0, s0
2133; GCN-NEXT:    v_mul_hi_u32 v1, v1, s0
2134; GCN-NEXT:    v_mul_hi_u32 v2, v2, s0
2135; GCN-NEXT:    v_mul_hi_u32 v3, v3, s0
2136; GCN-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2137; GCN-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2138; GCN-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2139; GCN-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2140; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2141; GCN-NEXT:    s_endpgm
2142;
2143; GFX1030-LABEL: scalarize_mulhu_4xi32:
2144; GFX1030:       ; %bb.0:
2145; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2146; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
2147; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
2149; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2150; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2151; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2152; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2153; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2154; GFX1030-NEXT:    v_mul_hi_u32 v0, 0x1389c755, v0
2155; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x1389c755, v1
2156; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x1389c755, v2
2157; GFX1030-NEXT:    v_mul_hi_u32 v3, 0x1389c755, v3
2158; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2159; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2160; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2161; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2162; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
2163; GFX1030-NEXT:    s_endpgm
2164;
2165; EG-LABEL: scalarize_mulhu_4xi32:
2166; EG:       ; %bb.0:
2167; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2168; EG-NEXT:    TEX 0 @6
2169; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
2170; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2171; EG-NEXT:    CF_END
2172; EG-NEXT:    PAD
2173; EG-NEXT:    Fetch clause starting at 6:
2174; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2175; EG-NEXT:    ALU clause starting at 8:
2176; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2177; EG-NEXT:    ALU clause starting at 9:
2178; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
2179; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
2180; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2181; EG-NEXT:     MULHI * T0.Z, PV.W, literal.x,
2182; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2183; EG-NEXT:     LSHR T1.Z, T0.Y, literal.x,
2184; EG-NEXT:     LSHR T0.W, PS, literal.y,
2185; EG-NEXT:     MULHI * T0.Y, T1.W, literal.z,
2186; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
2187; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2188; EG-NEXT:     LSHR T0.Z, PS, literal.x,
2189; EG-NEXT:     LSHR T1.W, T0.X, literal.y,
2190; EG-NEXT:     MULHI * T0.X, PV.Z, literal.z,
2191; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2192; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2193; EG-NEXT:     LSHR T0.Y, PS, literal.x,
2194; EG-NEXT:     MULHI * T0.X, PV.W, literal.y,
2195; EG-NEXT:    10(1.401298e-44), 327796565(3.478022e-27)
2196; EG-NEXT:     LSHR T0.X, PS, literal.x,
2197; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
2198; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
; Load the whole vector, divide every lane by the same constant, store it back.
2199  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
2200  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2201  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
2202  ret void
2203}
2204
; test_udiv2 - unsigned 32-bit division of the kernel argument %p by the
; constant 2. The quotient is written with a volatile store through an undef
; addrspace(1) pointer.
; Every check block below expects the division to appear as one logical shift
; right by 1 (s_lshr_b32 on the amdgcn runs, LSHR on the r600 run); none of
; them contains a reciprocal or multiply-high sequence.
2205define amdgpu_kernel void @test_udiv2(i32 %p) {
2206; SI-LABEL: test_udiv2:
2207; SI:       ; %bb.0:
2208; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2209; SI-NEXT:    s_mov_b32 s3, 0xf000
2210; SI-NEXT:    s_mov_b32 s2, -1
2211; SI-NEXT:    s_waitcnt lgkmcnt(0)
2212; SI-NEXT:    s_lshr_b32 s0, s0, 1
2213; SI-NEXT:    v_mov_b32_e32 v0, s0
2214; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2215; SI-NEXT:    s_waitcnt vmcnt(0)
2216; SI-NEXT:    s_endpgm
2217;
2218; VI-LABEL: test_udiv2:
2219; VI:       ; %bb.0:
2220; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2221; VI-NEXT:    s_mov_b32 s3, 0xf000
2222; VI-NEXT:    s_mov_b32 s2, -1
2223; VI-NEXT:    s_waitcnt lgkmcnt(0)
2224; VI-NEXT:    s_lshr_b32 s0, s0, 1
2225; VI-NEXT:    v_mov_b32_e32 v0, s0
2226; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2227; VI-NEXT:    s_waitcnt vmcnt(0)
2228; VI-NEXT:    s_endpgm
2229;
2230; GCN-LABEL: test_udiv2:
2231; GCN:       ; %bb.0:
2232; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2233; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2234; GCN-NEXT:    s_lshr_b32 s0, s0, 1
2235; GCN-NEXT:    v_mov_b32_e32 v0, s0
2236; GCN-NEXT:    flat_store_dword v[0:1], v0
2237; GCN-NEXT:    s_waitcnt vmcnt(0)
2238; GCN-NEXT:    s_endpgm
2239;
2240; GFX1030-LABEL: test_udiv2:
2241; GFX1030:       ; %bb.0:
2242; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2243; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2244; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2245; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2246; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2247; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2248; GFX1030-NEXT:    s_endpgm
2249;
2250; EG-LABEL: test_udiv2:
2251; EG:       ; %bb.0:
2252; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2253; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2254; EG-NEXT:    CF_END
2255; EG-NEXT:    PAD
2256; EG-NEXT:    ALU clause starting at 4:
2257; EG-NEXT:     MOV T0.X, literal.x,
2258; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
2259; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2260  %i = udiv i32 %p, 2
2261  store volatile i32 %i, i32 addrspace(1)* undef
2262  ret void
2263}
2264
; test_udiv_3_mulhu - unsigned 32-bit division of the kernel argument %p by
; the constant 3, stored with a volatile store through an undef addrspace(1)
; pointer.
; The checks expect a multiply-high by 0xaaaaaaab followed by a logical shift
; right by 1. The verde, tonga and fiji runs use the vector v_mul_hi_u32 with
; the constant materialized in v0; the gfx1030 run uses the scalar
; s_mul_hi_u32 with the constant as an immediate. The r600 run prints the same
; 32-bit pattern as the signed literal -1431655765 and uses MULHI + LSHR.
2265define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2266; SI-LABEL: test_udiv_3_mulhu:
2267; SI:       ; %bb.0:
2268; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2269; SI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2270; SI-NEXT:    s_mov_b32 s3, 0xf000
2271; SI-NEXT:    s_mov_b32 s2, -1
2272; SI-NEXT:    s_waitcnt lgkmcnt(0)
2273; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
2274; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2275; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2276; SI-NEXT:    s_waitcnt vmcnt(0)
2277; SI-NEXT:    s_endpgm
2278;
2279; VI-LABEL: test_udiv_3_mulhu:
2280; VI:       ; %bb.0:
2281; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2282; VI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2283; VI-NEXT:    s_mov_b32 s3, 0xf000
2284; VI-NEXT:    s_mov_b32 s2, -1
2285; VI-NEXT:    s_waitcnt lgkmcnt(0)
2286; VI-NEXT:    v_mul_hi_u32 v0, s0, v0
2287; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2288; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2289; VI-NEXT:    s_waitcnt vmcnt(0)
2290; VI-NEXT:    s_endpgm
2291;
2292; GCN-LABEL: test_udiv_3_mulhu:
2293; GCN:       ; %bb.0:
2294; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2295; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2296; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2297; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
2298; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2299; GCN-NEXT:    flat_store_dword v[0:1], v0
2300; GCN-NEXT:    s_waitcnt vmcnt(0)
2301; GCN-NEXT:    s_endpgm
2302;
2303; GFX1030-LABEL: test_udiv_3_mulhu:
2304; GFX1030:       ; %bb.0:
2305; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2306; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX1030-NEXT:    s_mul_hi_u32 s0, s0, 0xaaaaaaab
2308; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2309; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2310; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2311; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2312; GFX1030-NEXT:    s_endpgm
2313;
2314; EG-LABEL: test_udiv_3_mulhu:
2315; EG:       ; %bb.0:
2316; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2317; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2318; EG-NEXT:    CF_END
2319; EG-NEXT:    PAD
2320; EG-NEXT:    ALU clause starting at 4:
2321; EG-NEXT:     MULHI * T0.X, KC0[2].Y, literal.x,
2322; EG-NEXT:    -1431655765(-3.031649e-13), 0(0.000000e+00)
2323; EG-NEXT:     LSHR T0.X, PS, 1,
2324; EG-NEXT:     MOV * T1.X, literal.x,
2325; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2326   %i = udiv i32 %p, 3
2327   store volatile i32 %i, i32 addrspace(1)* undef
2328   ret void
2329}
2330
; fdiv_test_denormals - signed division of two sign-extended i8 values.
; The dividend is loaded from the null addrspace(1) pointer, the divisor from
; %arg offset by an undef index; the i32 quotient is truncated to i8 and
; stored back to the null pointer.
; The checks expect the sdiv to be carried out in single-precision float:
; both operands are converted with an int-to-float convert, the divisor's
; reciprocal is taken (v_rcp_iflag_f32 / RECIP_IEEE), the product is
; truncated, and a multiply-add plus an absolute-value compare selects a
; correction term built from the sign of (dividend xor divisor) or'ed with 1.
; The verde, tonga and fiji runs use v_mad_f32 for the multiply-add while the
; gfx1030 run uses v_fma_f32.
; NOTE(review): the test name suggests the point is behaviour under the
; different -denormal-fp-math-f32 settings on the run lines - confirm against
; the commit that introduced it.
2331define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
2332; SI-LABEL: fdiv_test_denormals:
2333; SI:       ; %bb.0: ; %bb
2334; SI-NEXT:    s_mov_b32 s0, 0
2335; SI-NEXT:    s_mov_b32 s3, 0xf000
2336; SI-NEXT:    s_mov_b32 s2, -1
2337; SI-NEXT:    s_mov_b32 s1, s0
2338; SI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2339; SI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2340; SI-NEXT:    s_waitcnt vmcnt(1)
2341; SI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2342; SI-NEXT:    s_waitcnt vmcnt(0)
2343; SI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2344; SI-NEXT:    v_xor_b32_e32 v0, v1, v0
2345; SI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2346; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2347; SI-NEXT:    v_or_b32_e32 v0, 1, v0
2348; SI-NEXT:    v_mul_f32_e32 v1, v3, v4
2349; SI-NEXT:    v_trunc_f32_e32 v1, v1
2350; SI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2351; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2352; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2353; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2354; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2355; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2356; SI-NEXT:    s_endpgm
2357;
2358; VI-LABEL: fdiv_test_denormals:
2359; VI:       ; %bb.0: ; %bb
2360; VI-NEXT:    s_mov_b32 s0, 0
2361; VI-NEXT:    s_mov_b32 s3, 0xf000
2362; VI-NEXT:    s_mov_b32 s2, -1
2363; VI-NEXT:    s_mov_b32 s1, s0
2364; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2365; VI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2366; VI-NEXT:    s_waitcnt vmcnt(1)
2367; VI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2368; VI-NEXT:    s_waitcnt vmcnt(0)
2369; VI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2370; VI-NEXT:    v_xor_b32_e32 v0, v1, v0
2371; VI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2372; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2373; VI-NEXT:    v_or_b32_e32 v0, 1, v0
2374; VI-NEXT:    v_mul_f32_e32 v1, v3, v4
2375; VI-NEXT:    v_trunc_f32_e32 v1, v1
2376; VI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2377; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2378; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2379; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2380; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
2381; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2382; VI-NEXT:    s_endpgm
2383;
2384; GCN-LABEL: fdiv_test_denormals:
2385; GCN:       ; %bb.0: ; %bb
2386; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
2387; GCN-NEXT:    v_mov_b32_e32 v0, 0
2388; GCN-NEXT:    v_mov_b32_e32 v1, 0
2389; GCN-NEXT:    flat_load_sbyte v3, v[0:1]
2390; GCN-NEXT:    s_waitcnt vmcnt(1)
2391; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v2
2392; GCN-NEXT:    s_waitcnt vmcnt(0)
2393; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v3
2394; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2395; GCN-NEXT:    v_xor_b32_e32 v2, v3, v2
2396; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2397; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
2398; GCN-NEXT:    v_mul_f32_e32 v3, v5, v6
2399; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2400; GCN-NEXT:    v_mad_f32 v5, -v3, v4, v5
2401; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2402; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2403; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
2404; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2405; GCN-NEXT:    flat_store_byte v[0:1], v2
2406; GCN-NEXT:    s_endpgm
2407;
2408; GFX1030-LABEL: fdiv_test_denormals:
2409; GFX1030:       ; %bb.0: ; %bb
2410; GFX1030-NEXT:    global_load_sbyte v2, v[0:1], off
2411; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
2412; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
2413; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
2414; GFX1030-NEXT:    s_waitcnt vmcnt(1)
2415; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
2416; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v4
2417; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2418; GFX1030-NEXT:    v_cvt_f32_i32_e32 v6, v3
2419; GFX1030-NEXT:    v_xor_b32_e32 v2, v3, v2
2420; GFX1030-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2421; GFX1030-NEXT:    v_mul_f32_e32 v5, v6, v5
2422; GFX1030-NEXT:    v_or_b32_e32 v2, 1, v2
2423; GFX1030-NEXT:    v_trunc_f32_e32 v3, v5
2424; GFX1030-NEXT:    v_fma_f32 v5, -v3, v4, v6
2425; GFX1030-NEXT:    v_cvt_i32_f32_e32 v3, v3
2426; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2427; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2428; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v3, v2
2429; GFX1030-NEXT:    global_store_byte v[0:1], v2, off
2430; GFX1030-NEXT:    s_endpgm
2431;
2432; EG-LABEL: fdiv_test_denormals:
2433; EG:       ; %bb.0: ; %bb
2434; EG-NEXT:    TEX 0 @6
2435; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
2436; EG-NEXT:    TEX 0 @8
2437; EG-NEXT:    ALU 25, @11, KC0[], KC1[]
2438; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2439; EG-NEXT:    CF_END
2440; EG-NEXT:    Fetch clause starting at 6:
2441; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
2442; EG-NEXT:    Fetch clause starting at 8:
2443; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2444; EG-NEXT:    ALU clause starting at 10:
2445; EG-NEXT:     MOV * T1.X, 0.0,
2446; EG-NEXT:    ALU clause starting at 11:
2447; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
2448; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2449; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
2450; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x,
2451; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
2452; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2453; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
2454; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
2455; EG-NEXT:     TRUNC T2.W, PV.W,
2456; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
2457; EG-NEXT:     ASHR T0.W, PS, literal.x,
2458; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2459; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
2460; EG-NEXT:     TRUNC T0.Z, T2.W,
2461; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
2462; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
2463; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
2464; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
2465; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
2466; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
2467; EG-NEXT:     MOV * T0.W, literal.x,
2468; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2469; EG-NEXT:     MOV T0.Y, 0.0,
2470; EG-NEXT:     MOV * T0.Z, 0.0,
2471; EG-NEXT:     MOV * T1.X, literal.x,
2472; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2473bb:
2474  %tmp = load i8, i8 addrspace(1)* null, align 1
2475  %tmp1 = sext i8 %tmp to i32
2476  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
2477  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
2478  %tmp4 = sext i8 %tmp3 to i32
2479  %tmp5 = sdiv i32 %tmp1, %tmp4
2480  %tmp6 = trunc i32 %tmp5 to i8
2481  store i8 %tmp6, i8 addrspace(1)* null, align 1
2482  ret void
2483}
2484
; v_test_udiv64_mulhi_fold - unsigned 64-bit division of %arg by the constant
; 100000 (0x186a0), returned directly. This is a plain function rather than an
; amdgpu_kernel, so the amdgcn checks end in s_setpc_b64 s[30:31].
; Constants that recur in the checks: 0x186a0 is the divisor, 0xfffe7960 is
; its 32-bit two's-complement negation, and 0x1869f is divisor - 1 (used in
; the final unsigned compares that choose between quotient, quotient + 1 and
; quotient + 2).
; The verde, tonga and fiji checks start from a float reciprocal (v_rcp_f32)
; and contain two multiply sequences against 0xfffe7960 before the dividend
; (v0/v1) is multiplied in. The gfx1030 checks contain no v_rcp_f32; they
; start from scalar immediate constants and contain one such multiply
; sequence. The r600 check block contains only CF_END and PAD.
; NOTE(review): the name suggests this guards a mulhi constant-folding
; combine - confirm against the commit that introduced it.
2485define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2486; SI-LABEL: v_test_udiv64_mulhi_fold:
2487; SI:       ; %bb.0:
2488; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2489; SI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2490; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2491; SI-NEXT:    v_rcp_f32_e32 v2, v2
2492; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
2493; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2494; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2495; SI-NEXT:    v_trunc_f32_e32 v3, v3
2496; SI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2497; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
2498; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
2499; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2500; SI-NEXT:    v_mul_lo_u32 v6, v3, s4
2501; SI-NEXT:    v_mul_lo_u32 v5, v2, s4
2502; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2503; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2504; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
2505; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
2506; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2507; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
2508; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2509; SI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
2510; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2511; SI-NEXT:    v_mul_lo_u32 v8, v3, v5
2512; SI-NEXT:    v_mul_hi_u32 v5, v3, v5
2513; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
2514; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
2515; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
2516; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2517; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2518; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2519; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2520; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2521; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
2522; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2523; SI-NEXT:    s_mov_b32 s4, 0x186a0
2524; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2525; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
2526; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
2527; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
2528; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2529; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
2530; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2531; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
2532; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2533; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
2534; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
2535; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
2536; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
2537; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
2538; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2539; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2540; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2541; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2542; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
2543; SI-NEXT:    v_mul_hi_u32 v5, v0, v2
2544; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
2545; SI-NEXT:    v_mul_hi_u32 v7, v1, v3
2546; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
2547; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2548; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2549; SI-NEXT:    v_mul_lo_u32 v6, v1, v2
2550; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
2551; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
2552; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
2553; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
2554; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2555; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2556; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
2557; SI-NEXT:    v_mul_hi_u32 v5, v2, s4
2558; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2559; SI-NEXT:    s_mov_b32 s4, 0x1869f
2560; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2561; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
2562; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
2563; SI-NEXT:    v_subrev_i32_e32 v4, vcc, 0x186a0, v0
2564; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
2565; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
2566; SI-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
2567; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2568; SI-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
2569; SI-NEXT:    v_add_i32_e32 v5, vcc, 2, v2
2570; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
2571; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
2572; SI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2573; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
2574; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2575; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2576; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
2577; SI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2578; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
2579; SI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2580; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2581; SI-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
2582; SI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
2583; SI-NEXT:    s_setpc_b64 s[30:31]
2584;
2585; VI-LABEL: v_test_udiv64_mulhi_fold:
2586; VI:       ; %bb.0:
2587; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2588; VI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2589; VI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2590; VI-NEXT:    v_rcp_f32_e32 v2, v2
2591; VI-NEXT:    s_mov_b32 s6, 0xfffe7960
2592; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2593; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2594; VI-NEXT:    v_trunc_f32_e32 v3, v3
2595; VI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2596; VI-NEXT:    v_cvt_u32_f32_e32 v6, v2
2597; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
2598; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2599; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2600; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2601; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2602; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
2603; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2604; VI-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
2605; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2606; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2607; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2608; VI-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
2609; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2610; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2611; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2612; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2613; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2614; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2615; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2616; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2617; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2618; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2619; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2620; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
2621; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2622; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2623; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
2624; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2625; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2626; VI-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
2627; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2628; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2629; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2630; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2631; VI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2632; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2633; VI-NEXT:    v_mul_hi_u32 v6, v0, v4
2634; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2635; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2636; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2637; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2638; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2639; VI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2640; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2641; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2642; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2643; VI-NEXT:    s_mov_b32 s4, 0x186a0
2644; VI-NEXT:    v_mul_lo_u32 v6, v5, s4
2645; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2646; VI-NEXT:    s_mov_b32 s4, 0x1869f
2647; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2648; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2649; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2650; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2651; VI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2652; VI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2653; VI-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2654; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2655; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2656; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2657; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2658; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2659; VI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2660; VI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2661; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2662; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2663; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2664; VI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2665; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2666; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2667; VI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2668; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2669; VI-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2670; VI-NEXT:    s_setpc_b64 s[30:31]
2671;
2672; GCN-LABEL: v_test_udiv64_mulhi_fold:
2673; GCN:       ; %bb.0:
2674; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2675; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2676; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2677; GCN-NEXT:    v_rcp_f32_e32 v2, v2
2678; GCN-NEXT:    s_mov_b32 s6, 0xfffe7960
2679; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2680; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2681; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2682; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2683; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v2
2684; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
2685; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2686; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2687; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2688; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2689; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
2690; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2691; GCN-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
2692; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2693; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2694; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2695; GCN-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
2696; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2697; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2698; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2699; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2700; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2701; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2702; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2703; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2704; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2705; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2706; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2707; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
2708; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2709; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2710; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
2711; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2712; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2713; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
2714; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2715; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2716; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2717; GCN-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2718; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2719; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2720; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
2721; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2722; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2723; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2724; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2725; GCN-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2726; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2727; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2728; GCN-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2729; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2730; GCN-NEXT:    s_mov_b32 s4, 0x186a0
2731; GCN-NEXT:    v_mul_lo_u32 v6, v5, s4
2732; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2733; GCN-NEXT:    s_mov_b32 s4, 0x1869f
2734; GCN-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2735; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2736; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2737; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2738; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2739; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2740; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2741; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2742; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2743; GCN-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2744; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2745; GCN-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2746; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2747; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2748; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2749; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2750; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2751; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2752; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2753; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2754; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2755; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2756; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2757; GCN-NEXT:    s_setpc_b64 s[30:31]
2758;
2759; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2760; GFX1030:       ; %bb.0:
2761; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2762; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2763; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
2764; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
2765; GFX1030-NEXT:    s_addc_u32 s5, 0, 0
2766; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
2767; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
2768; GFX1030-NEXT:    s_addc_u32 s4, s5, 0xa7c5
2769; GFX1030-NEXT:    v_mul_hi_u32 v3, 0xfffe7960, v2
2770; GFX1030-NEXT:    v_mul_lo_u32 v4, 0xfffe7960, v2
2771; GFX1030-NEXT:    s_mul_i32 s5, s4, 0xfffe7960
2772; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v2
2773; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v4
2774; GFX1030-NEXT:    v_mul_hi_u32 v8, s4, v4
2775; GFX1030-NEXT:    v_mul_lo_u32 v4, s4, v4
2776; GFX1030-NEXT:    v_add_nc_u32_e32 v3, s5, v3
2777; GFX1030-NEXT:    v_mul_lo_u32 v6, v2, v3
2778; GFX1030-NEXT:    v_mul_hi_u32 v7, v2, v3
2779; GFX1030-NEXT:    v_mul_hi_u32 v9, s4, v3
2780; GFX1030-NEXT:    v_mul_lo_u32 v3, s4, v3
2781; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v6
2782; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo
2783; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, v4
2784; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
2785; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
2786; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, v3
2787; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
2788; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v3
2789; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo
2790; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v5
2791; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], s4, v1, v5, 0
2792; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, v0, v6, 0
2793; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], s4, v1, v6, 0
2794; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
2795; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2796; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2797; GFX1030-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
2798; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
2799; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v6
2800; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
2801; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, 0x186a0, v5, 0
2802; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], s4, 0x186a0, v6, v[3:4]
2803; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2804; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2805; GFX1030-NEXT:    v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
2806; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
2807; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
2808; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
2809; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
2810; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, 2
2811; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
2812; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
2813; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2814; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2815; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4
2816; GFX1030-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
2817; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v5, 1
2818; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
2819; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
2820; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
2821; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc_lo
2822; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2823; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
2824; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
2825; GFX1030-NEXT:    s_setpc_b64 s[30:31]
2826;
2827; EG-LABEL: v_test_udiv64_mulhi_fold:
2828; EG:       ; %bb.0:
2829; EG-NEXT:    CF_END
2830; EG-NEXT:    PAD
2831  %d = udiv i64 %arg, 100000
2832  ret i64 %d
2833}
2834