1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
7
8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
9; SI-LABEL: udiv_i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_u32_e32 v2, v1
24; SI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
25; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
26; SI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
27; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
28; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
29; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
30; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
31; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
32; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
33; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
34; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v3, v0
35; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
36; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
37; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v1, v0
38; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
39; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
40; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
41; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
42; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: udiv_i32:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
48; VI-NEXT:    s_mov_b32 s7, 0xf000
49; VI-NEXT:    s_mov_b32 s6, -1
50; VI-NEXT:    s_mov_b32 s10, s6
51; VI-NEXT:    s_mov_b32 s11, s7
52; VI-NEXT:    s_waitcnt lgkmcnt(0)
53; VI-NEXT:    s_mov_b32 s8, s2
54; VI-NEXT:    s_mov_b32 s9, s3
55; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56; VI-NEXT:    s_mov_b32 s4, s0
57; VI-NEXT:    s_mov_b32 s5, s1
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    v_cvt_f32_u32_e32 v2, v1
60; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
61; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
62; VI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
63; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
64; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
65; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
66; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
67; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
68; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
69; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
70; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v3, v0
71; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
72; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
73; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v1, v0
74; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
75; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
76; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
77; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
78; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; VI-NEXT:    s_endpgm
80;
81; GCN-LABEL: udiv_i32:
82; GCN:       ; %bb.0:
83; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
84; GCN-NEXT:    s_waitcnt lgkmcnt(0)
85; GCN-NEXT:    v_mov_b32_e32 v0, s2
86; GCN-NEXT:    v_mov_b32_e32 v1, s3
87; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
88; GCN-NEXT:    s_waitcnt vmcnt(0)
89; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
90; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
91; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
92; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
93; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
94; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
95; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
96; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
97; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
98; GCN-NEXT:    v_mov_b32_e32 v2, s0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
101; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
102; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v0
103; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
104; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
105; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v1, v0
106; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
107; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
108; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
109; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
110; GCN-NEXT:    flat_store_dword v[2:3], v0
111; GCN-NEXT:    s_endpgm
112;
113; GFX1030-LABEL: udiv_i32:
114; GFX1030:       ; %bb.0:
115; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
117; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
119; GFX1030-NEXT:    s_waitcnt vmcnt(0)
120; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, v1
121; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, 0, v1
122; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
123; GFX1030-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
124; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
125; GFX1030-NEXT:    v_mul_lo_u32 v4, v4, v3
126; GFX1030-NEXT:    v_mul_hi_u32 v4, v3, v4
127; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v4
128; GFX1030-NEXT:    v_mul_hi_u32 v3, v0, v3
129; GFX1030-NEXT:    v_mul_lo_u32 v4, v3, v1
130; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v4
131; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
132; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v0, v1
133; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
134; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
135; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
136; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
137; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
138; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
139; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
140; GFX1030-NEXT:    s_endpgm
141;
142; EG-LABEL: udiv_i32:
143; EG:       ; %bb.0:
144; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
145; EG-NEXT:    TEX 0 @6
146; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
148; EG-NEXT:    CF_END
149; EG-NEXT:    PAD
150; EG-NEXT:    Fetch clause starting at 6:
151; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
152; EG-NEXT:    ALU clause starting at 8:
153; EG-NEXT:     MOV * T0.X, KC0[2].Z,
154; EG-NEXT:    ALU clause starting at 9:
155; EG-NEXT:     SUB_INT T0.W, 0.0, T0.Y,
156; EG-NEXT:     RECIP_UINT * T0.Z, T0.Y,
157; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
158; EG-NEXT:     MULHI * T0.W, T0.Z, PS,
159; EG-NEXT:     ADD_INT * T0.W, T0.Z, PS,
160; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
161; EG-NEXT:     MULLO_INT * T0.W, PS, T0.Y,
162; EG-NEXT:     SUB_INT * T0.W, T0.X, PS,
163; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
164; EG-NEXT:     SETGE_UINT T1.W, PV.W, T0.Y,
165; EG-NEXT:     SUB_INT * T2.W, PV.W, T0.Y,
166; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.W, PS,
167; EG-NEXT:     CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
168; EG-NEXT:     ADD_INT T2.W, PS, 1,
169; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.Y,
170; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
171; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
172; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Divide with both operands loaded from memory: %a is the i32 at %in and %b is
; the i32 one element past it (getelementptr index 1). The unsigned quotient
; %a / %b is stored to %out. The autogenerated checks above show the divide
; expanded inline around a reciprocal (v_rcp_iflag_f32 on the amdgcn runs,
; RECIP_UINT on the r600 run), followed by compare/select correction steps.
173  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
174  %a = load i32, i32 addrspace(1)* %in
175  %b = load i32, i32 addrspace(1)* %b_ptr
176  %result = udiv i32 %a, %b
177  store i32 %result, i32 addrspace(1)* %out
178  ret void
179}
180
181define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
182; SI-LABEL: s_udiv_i32:
183; SI:       ; %bb.0:
184; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
185; SI-NEXT:    s_mov_b32 s7, 0xf000
186; SI-NEXT:    s_mov_b32 s6, -1
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    v_cvt_f32_u32_e32 v0, s3
189; SI-NEXT:    s_sub_i32 s4, 0, s3
190; SI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
191; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
192; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
193; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
195; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
196; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
197; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
198; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
199; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
200; SI-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
201; SI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
202; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
203; SI-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
204; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
205; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
206; SI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
207; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
208; SI-NEXT:    s_waitcnt lgkmcnt(0)
209; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
210; SI-NEXT:    s_endpgm
211;
212; VI-LABEL: s_udiv_i32:
213; VI:       ; %bb.0:
214; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
215; VI-NEXT:    s_mov_b32 s7, 0xf000
216; VI-NEXT:    s_mov_b32 s6, -1
217; VI-NEXT:    s_waitcnt lgkmcnt(0)
218; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
219; VI-NEXT:    s_sub_i32 s4, 0, s3
220; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
221; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
222; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
223; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
224; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
225; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
226; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
227; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
228; VI-NEXT:    v_mul_lo_u32 v1, v0, s3
229; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
230; VI-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
231; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
232; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
233; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
234; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
235; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
236; VI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
237; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
238; VI-NEXT:    s_waitcnt lgkmcnt(0)
239; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
240; VI-NEXT:    s_endpgm
241;
242; GCN-LABEL: s_udiv_i32:
243; GCN:       ; %bb.0:
244; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
245; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
246; GCN-NEXT:    s_waitcnt lgkmcnt(0)
247; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
248; GCN-NEXT:    s_sub_i32 s0, 0, s3
249; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
250; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
251; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
252; GCN-NEXT:    v_mul_lo_u32 v1, s0, v0
253; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
254; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
255; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
256; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
257; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
258; GCN-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
259; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
260; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
261; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
262; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
263; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
264; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
265; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
266; GCN-NEXT:    v_mov_b32_e32 v0, s4
267; GCN-NEXT:    v_mov_b32_e32 v1, s5
268; GCN-NEXT:    flat_store_dword v[0:1], v2
269; GCN-NEXT:    s_endpgm
270;
271; GFX1030-LABEL: s_udiv_i32:
272; GFX1030:       ; %bb.0:
273; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
274; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
275; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s1
277; GFX1030-NEXT:    s_sub_i32 s3, 0, s1
278; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
279; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
280; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
281; GFX1030-NEXT:    v_readfirstlane_b32 s2, v0
282; GFX1030-NEXT:    s_mul_i32 s3, s3, s2
283; GFX1030-NEXT:    s_mul_hi_u32 s3, s2, s3
284; GFX1030-NEXT:    s_add_i32 s2, s2, s3
285; GFX1030-NEXT:    s_mul_hi_u32 s6, s0, s2
286; GFX1030-NEXT:    s_mul_i32 s2, s6, s1
287; GFX1030-NEXT:    s_sub_i32 s0, s0, s2
288; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
289; GFX1030-NEXT:    s_cmp_ge_u32 s0, s1
290; GFX1030-NEXT:    s_cselect_b32 vcc_lo, -1, 0
291; GFX1030-NEXT:    s_add_i32 s7, s6, 1
292; GFX1030-NEXT:    s_sub_i32 s4, s0, s1
293; GFX1030-NEXT:    v_mov_b32_e32 v0, s7
294; GFX1030-NEXT:    v_mov_b32_e32 v1, s4
295; GFX1030-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
296; GFX1030-NEXT:    v_cndmask_b32_e32 v1, s0, v1, vcc_lo
297; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
298; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
299; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
300; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX1030-NEXT:    global_store_dword v3, v0, s[2:3]
302; GFX1030-NEXT:    s_endpgm
303;
304; EG-LABEL: s_udiv_i32:
305; EG:       ; %bb.0:
306; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
307; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
308; EG-NEXT:    CF_END
309; EG-NEXT:    PAD
310; EG-NEXT:    ALU clause starting at 4:
311; EG-NEXT:     SUB_INT T0.W, 0.0, KC0[2].W,
312; EG-NEXT:     RECIP_UINT * T0.X, KC0[2].W,
313; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
314; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
315; EG-NEXT:     ADD_INT * T0.W, T0.X, PS,
316; EG-NEXT:     MULHI * T0.X, KC0[2].Z, PV.W,
317; EG-NEXT:     MULLO_INT * T0.Y, PS, KC0[2].W,
318; EG-NEXT:     SUB_INT * T0.W, KC0[2].Z, PS,
319; EG-NEXT:     SUB_INT T0.Z, PV.W, KC0[2].W,
320; EG-NEXT:     SETGE_UINT T1.W, PV.W, KC0[2].W,
321; EG-NEXT:     ADD_INT * T2.W, T0.X, 1,
322; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.X, PS,
323; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
324; EG-NEXT:     SETGE_UINT T0.W, PS, KC0[2].W,
325; EG-NEXT:     ADD_INT * T1.W, PV.W, 1,
326; EG-NEXT:     CNDE_INT T0.X, PV.W, T2.W, PS,
327; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
328; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Divide with both operands passed directly as kernel arguments (%a, %b); no
; loads appear in the IR. The unsigned quotient %a / %b is stored to %out. In
; the amdgcn checks above the arguments arrive via s_load_dwordx2 into SGPRs,
; and the gfx1030 run keeps part of the expansion in scalar instructions
; (s_mul_i32 / s_mul_hi_u32).
329  %result = udiv i32 %a, %b
330  store i32 %result, i32 addrspace(1)* %out
331  ret void
332}
333
334
335; The code generated by udiv is long and complex and may change frequently.
336; The goal of the vector tests below is to make sure ISel doesn't fail when
337; it gets a v2i32 or v4i32 udiv.
338define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
339; SI-LABEL: udiv_v2i32:
340; SI:       ; %bb.0:
341; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
342; SI-NEXT:    s_mov_b32 s7, 0xf000
343; SI-NEXT:    s_mov_b32 s6, -1
344; SI-NEXT:    s_mov_b32 s10, s6
345; SI-NEXT:    s_mov_b32 s11, s7
346; SI-NEXT:    s_waitcnt lgkmcnt(0)
347; SI-NEXT:    s_mov_b32 s8, s2
348; SI-NEXT:    s_mov_b32 s9, s3
349; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
350; SI-NEXT:    s_mov_b32 s4, s0
351; SI-NEXT:    s_mov_b32 s5, s1
352; SI-NEXT:    s_waitcnt vmcnt(0)
353; SI-NEXT:    v_cvt_f32_u32_e32 v4, v2
354; SI-NEXT:    v_cvt_f32_u32_e32 v5, v3
355; SI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
356; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
357; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
358; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
359; SI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
360; SI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
361; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
362; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
363; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
364; SI-NEXT:    v_mul_lo_u32 v7, v7, v5
365; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
366; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
367; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
368; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
369; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
370; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
371; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
372; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
373; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
374; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
375; SI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
376; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
377; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
378; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
379; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
380; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
381; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
382; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
383; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
384; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
385; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
386; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
387; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
388; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
389; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
390; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
391; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
392; SI-NEXT:    s_endpgm
393;
394; VI-LABEL: udiv_v2i32:
395; VI:       ; %bb.0:
396; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
397; VI-NEXT:    s_mov_b32 s7, 0xf000
398; VI-NEXT:    s_mov_b32 s6, -1
399; VI-NEXT:    s_mov_b32 s10, s6
400; VI-NEXT:    s_mov_b32 s11, s7
401; VI-NEXT:    s_waitcnt lgkmcnt(0)
402; VI-NEXT:    s_mov_b32 s8, s2
403; VI-NEXT:    s_mov_b32 s9, s3
404; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
405; VI-NEXT:    s_mov_b32 s4, s0
406; VI-NEXT:    s_mov_b32 s5, s1
407; VI-NEXT:    s_waitcnt vmcnt(0)
408; VI-NEXT:    v_cvt_f32_u32_e32 v4, v2
409; VI-NEXT:    v_cvt_f32_u32_e32 v5, v3
410; VI-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
411; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
412; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
413; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
414; VI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
415; VI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
416; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
417; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
418; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
419; VI-NEXT:    v_mul_lo_u32 v7, v7, v5
420; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
421; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
422; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
423; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
424; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
425; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
426; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
427; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
428; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
429; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
430; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
431; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
432; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
433; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
434; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
435; VI-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
436; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
437; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
438; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
439; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
440; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
441; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
442; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
443; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
444; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
445; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
446; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
447; VI-NEXT:    s_endpgm
448;
449; GCN-LABEL: udiv_v2i32:
450; GCN:       ; %bb.0:
451; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
452; GCN-NEXT:    s_waitcnt lgkmcnt(0)
453; GCN-NEXT:    v_mov_b32_e32 v0, s2
454; GCN-NEXT:    v_mov_b32_e32 v1, s3
455; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
456; GCN-NEXT:    s_waitcnt vmcnt(0)
457; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
458; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
459; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
460; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
461; GCN-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
462; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
463; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
464; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
465; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
466; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
467; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
468; GCN-NEXT:    v_mul_lo_u32 v8, v4, v7
469; GCN-NEXT:    v_mul_hi_u32 v9, v6, v5
470; GCN-NEXT:    v_mov_b32_e32 v4, s0
471; GCN-NEXT:    v_mov_b32_e32 v5, s1
472; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
473; GCN-NEXT:    v_add_u32_e32 v6, vcc, v9, v6
474; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
475; GCN-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
476; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
477; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
478; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
479; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
480; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
481; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
482; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
483; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
484; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
485; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
486; GCN-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
487; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
488; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
489; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
490; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
491; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
492; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v7
493; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
494; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
495; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
496; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
497; GCN-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
498; GCN-NEXT:    s_endpgm
499;
500; GFX1030-LABEL: udiv_v2i32:
501; GFX1030:       ; %bb.0:
502; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
503; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
504; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
506; GFX1030-NEXT:    s_waitcnt vmcnt(0)
507; GFX1030-NEXT:    v_cvt_f32_u32_e32 v5, v2
508; GFX1030-NEXT:    v_cvt_f32_u32_e32 v6, v3
509; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, 0, v2
510; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
511; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
512; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
513; GFX1030-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
514; GFX1030-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
515; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
516; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
517; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
518; GFX1030-NEXT:    v_mul_lo_u32 v8, v8, v6
519; GFX1030-NEXT:    v_mul_hi_u32 v7, v5, v7
520; GFX1030-NEXT:    v_mul_hi_u32 v8, v6, v8
521; GFX1030-NEXT:    v_add_nc_u32_e32 v5, v5, v7
522; GFX1030-NEXT:    v_add_nc_u32_e32 v6, v6, v8
523; GFX1030-NEXT:    v_mul_hi_u32 v5, v0, v5
524; GFX1030-NEXT:    v_mul_hi_u32 v6, v1, v6
525; GFX1030-NEXT:    v_mul_lo_u32 v7, v5, v2
526; GFX1030-NEXT:    v_mul_lo_u32 v8, v6, v3
527; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
528; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
529; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, v1, v8
530; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
531; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
532; GFX1030-NEXT:    v_sub_nc_u32_e32 v9, v1, v3
533; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v1, v3
534; GFX1030-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
535; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v0, v2
536; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s0
537; GFX1030-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
538; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
539; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
540; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
541; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
542; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc_lo
543; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v1, v3
544; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc_lo
545; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
546; GFX1030-NEXT:    s_endpgm
547;
548; EG-LABEL: udiv_v2i32:
549; EG:       ; %bb.0:
550; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
551; EG-NEXT:    TEX 0 @6
552; EG-NEXT:    ALU 33, @9, KC0[CB0:0-32], KC1[]
553; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
554; EG-NEXT:    CF_END
555; EG-NEXT:    PAD
556; EG-NEXT:    Fetch clause starting at 6:
557; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
558; EG-NEXT:    ALU clause starting at 8:
559; EG-NEXT:     MOV * T0.X, KC0[2].Z,
560; EG-NEXT:    ALU clause starting at 9:
561; EG-NEXT:     SUB_INT T1.W, 0.0, T0.W,
562; EG-NEXT:     RECIP_UINT * T1.X, T0.W,
563; EG-NEXT:     MULLO_INT * T1.Y, PV.W, PS,
564; EG-NEXT:     SUB_INT T1.W, 0.0, T0.Z,
565; EG-NEXT:     RECIP_UINT * T1.Z, T0.Z,
566; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
567; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
568; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
569; EG-NEXT:     MULHI * T1.Y, T1.X, T1.Y,
570; EG-NEXT:     ADD_INT T2.W, T1.X, PS,
571; EG-NEXT:     MULHI * T1.X, T0.X, PV.W,
572; EG-NEXT:     MULHI * T1.Y, T0.Y, PV.W,
573; EG-NEXT:     MULLO_INT * T1.Z, PS, T0.W,
574; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
575; EG-NEXT:     MULLO_INT * T0.Y, T1.X, T0.Z,
576; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
577; EG-NEXT:     ADD_INT T1.Z, T1.Y, 1,
578; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
579; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
580; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
581; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
582; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
583; EG-NEXT:     SETGE_UINT T1.W, PV.Y, T0.Z,
584; EG-NEXT:     SUB_INT * T2.W, PV.Y, T0.Z,
585; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS,
586; EG-NEXT:     CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
587; EG-NEXT:     ADD_INT T1.W, PV.Y, 1,
588; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
589; EG-NEXT:     CNDE_INT T1.Y, PS, T1.Y, PV.W,
590; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
591; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T0.Z,
592; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
593; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
594; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Two-element vector divide: %a is the <2 x i32> at %in and %b is the
; <2 x i32> one element past it (getelementptr index 1). The element-wise
; unsigned quotient is stored to %out. The checks above show the same
; reciprocal-based expansion as the scalar tests, emitted once per lane.
595  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
596  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
597  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
598  %result = udiv <2 x i32> %a, %b
599  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
600  ret void
601}
602
603define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
604; SI-LABEL: udiv_v4i32:
605; SI:       ; %bb.0:
606; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
607; SI-NEXT:    s_mov_b32 s11, 0xf000
608; SI-NEXT:    s_mov_b32 s10, -1
609; SI-NEXT:    s_mov_b32 s6, s10
610; SI-NEXT:    s_mov_b32 s7, s11
611; SI-NEXT:    s_waitcnt lgkmcnt(0)
612; SI-NEXT:    s_mov_b32 s4, s2
613; SI-NEXT:    s_mov_b32 s5, s3
614; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
615; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
616; SI-NEXT:    s_mov_b32 s8, s0
617; SI-NEXT:    s_mov_b32 s9, s1
618; SI-NEXT:    s_waitcnt vmcnt(1)
619; SI-NEXT:    v_cvt_f32_u32_e32 v8, v0
620; SI-NEXT:    v_cvt_f32_u32_e32 v10, v1
621; SI-NEXT:    v_cvt_f32_u32_e32 v12, v2
622; SI-NEXT:    v_cvt_f32_u32_e32 v14, v3
623; SI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
624; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
625; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
626; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
627; SI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
628; SI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
629; SI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
630; SI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
631; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
632; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
633; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
634; SI-NEXT:    v_cvt_u32_f32_e32 v14, v14
635; SI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
636; SI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
637; SI-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
638; SI-NEXT:    v_sub_i32_e32 v15, vcc, 0, v3
639; SI-NEXT:    v_mul_lo_u32 v9, v9, v8
640; SI-NEXT:    v_mul_lo_u32 v11, v11, v10
641; SI-NEXT:    v_mul_lo_u32 v13, v13, v12
642; SI-NEXT:    v_mul_lo_u32 v15, v15, v14
643; SI-NEXT:    v_mul_hi_u32 v9, v8, v9
644; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
645; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
646; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
647; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
648; SI-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
649; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v12
650; SI-NEXT:    v_add_i32_e32 v11, vcc, v15, v14
651; SI-NEXT:    s_waitcnt vmcnt(0)
652; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
653; SI-NEXT:    v_mul_hi_u32 v9, v5, v9
654; SI-NEXT:    v_mul_hi_u32 v10, v6, v10
655; SI-NEXT:    v_mul_hi_u32 v11, v7, v11
656; SI-NEXT:    v_mul_lo_u32 v12, v8, v0
657; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
658; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
659; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
660; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
661; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
662; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
663; SI-NEXT:    v_sub_i32_e32 v7, vcc, v7, v18
664; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
665; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
666; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
667; SI-NEXT:    v_add_i32_e32 v19, vcc, 1, v11
668; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
669; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
670; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
671; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
672; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
673; SI-NEXT:    v_subrev_i32_e32 v12, vcc, v0, v4
674; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
675; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
676; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
677; SI-NEXT:    v_sub_i32_e32 v14, vcc, v6, v2
678; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
679; SI-NEXT:    v_sub_i32_e32 v15, vcc, v7, v3
680; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
681; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
682; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
683; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
684; SI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
685; SI-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
686; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
687; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v11
688; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
689; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
690; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
691; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
692; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
693; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
694; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
695; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
696; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
697; SI-NEXT:    s_endpgm
698;
699; VI-LABEL: udiv_v4i32:
700; VI:       ; %bb.0:
701; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
702; VI-NEXT:    s_mov_b32 s11, 0xf000
703; VI-NEXT:    s_mov_b32 s10, -1
704; VI-NEXT:    s_mov_b32 s6, s10
705; VI-NEXT:    s_mov_b32 s7, s11
706; VI-NEXT:    s_waitcnt lgkmcnt(0)
707; VI-NEXT:    s_mov_b32 s4, s2
708; VI-NEXT:    s_mov_b32 s5, s3
709; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
710; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
711; VI-NEXT:    s_mov_b32 s8, s0
712; VI-NEXT:    s_mov_b32 s9, s1
713; VI-NEXT:    s_waitcnt vmcnt(1)
714; VI-NEXT:    v_cvt_f32_u32_e32 v8, v0
715; VI-NEXT:    v_cvt_f32_u32_e32 v10, v1
716; VI-NEXT:    v_cvt_f32_u32_e32 v12, v2
717; VI-NEXT:    v_cvt_f32_u32_e32 v14, v3
718; VI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
719; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
720; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
721; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
722; VI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
723; VI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
724; VI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
725; VI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
726; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
727; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
728; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
729; VI-NEXT:    v_cvt_u32_f32_e32 v14, v14
730; VI-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
731; VI-NEXT:    v_sub_u32_e32 v11, vcc, 0, v1
732; VI-NEXT:    v_sub_u32_e32 v13, vcc, 0, v2
733; VI-NEXT:    v_sub_u32_e32 v15, vcc, 0, v3
734; VI-NEXT:    v_mul_lo_u32 v9, v9, v8
735; VI-NEXT:    v_mul_lo_u32 v11, v11, v10
736; VI-NEXT:    v_mul_lo_u32 v13, v13, v12
737; VI-NEXT:    v_mul_lo_u32 v15, v15, v14
738; VI-NEXT:    v_mul_hi_u32 v9, v8, v9
739; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
740; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
741; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
742; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
743; VI-NEXT:    v_add_u32_e32 v9, vcc, v10, v11
744; VI-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
745; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
746; VI-NEXT:    s_waitcnt vmcnt(0)
747; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
748; VI-NEXT:    v_mul_hi_u32 v9, v5, v9
749; VI-NEXT:    v_mul_hi_u32 v10, v6, v10
750; VI-NEXT:    v_mul_hi_u32 v11, v7, v11
751; VI-NEXT:    v_mul_lo_u32 v12, v8, v0
752; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
753; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
754; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
755; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
756; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
757; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
758; VI-NEXT:    v_sub_u32_e32 v7, vcc, v7, v18
759; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
760; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
761; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
762; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v11
763; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
764; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
765; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
766; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
767; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
768; VI-NEXT:    v_subrev_u32_e32 v12, vcc, v0, v4
769; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
770; VI-NEXT:    v_subrev_u32_e32 v13, vcc, v1, v5
771; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
772; VI-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
773; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
774; VI-NEXT:    v_subrev_u32_e32 v15, vcc, v3, v7
775; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
776; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
777; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
778; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
779; VI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
780; VI-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
781; VI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
782; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
783; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
784; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
785; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
786; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
787; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
788; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
789; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
790; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
791; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
792; VI-NEXT:    s_endpgm
793;
794; GCN-LABEL: udiv_v4i32:
795; GCN:       ; %bb.0:
796; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
797; GCN-NEXT:    s_waitcnt lgkmcnt(0)
798; GCN-NEXT:    s_add_u32 s4, s2, 16
799; GCN-NEXT:    s_addc_u32 s5, s3, 0
800; GCN-NEXT:    v_mov_b32_e32 v0, s4
801; GCN-NEXT:    v_mov_b32_e32 v1, s5
802; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
803; GCN-NEXT:    v_mov_b32_e32 v5, s3
804; GCN-NEXT:    v_mov_b32_e32 v4, s2
805; GCN-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
806; GCN-NEXT:    v_mov_b32_e32 v8, s0
807; GCN-NEXT:    v_mov_b32_e32 v9, s1
808; GCN-NEXT:    s_waitcnt vmcnt(1)
809; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v0
810; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v1
811; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v2
812; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v3
813; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
814; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
815; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
816; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
817; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
818; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
819; GCN-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
820; GCN-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
821; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
822; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
823; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
824; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
825; GCN-NEXT:    v_sub_u32_e32 v11, vcc, 0, v0
826; GCN-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
827; GCN-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
828; GCN-NEXT:    v_sub_u32_e32 v17, vcc, 0, v3
829; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
830; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
831; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
832; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
833; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
834; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
835; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
836; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
837; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
838; GCN-NEXT:    v_add_u32_e32 v11, vcc, v12, v13
839; GCN-NEXT:    v_add_u32_e32 v12, vcc, v14, v15
840; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
841; GCN-NEXT:    s_waitcnt vmcnt(0)
842; GCN-NEXT:    v_mul_hi_u32 v10, v4, v10
843; GCN-NEXT:    v_mul_hi_u32 v11, v5, v11
844; GCN-NEXT:    v_mul_hi_u32 v12, v6, v12
845; GCN-NEXT:    v_mul_hi_u32 v13, v7, v13
846; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
847; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
848; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
849; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
850; GCN-NEXT:    v_sub_u32_e32 v4, vcc, v4, v14
851; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
852; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
853; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v19
854; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
855; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
856; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
857; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
858; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
859; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
860; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
861; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
862; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
863; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v0, v4
864; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
865; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v1, v5
866; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
867; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
868; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
869; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v3, v7
870; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v15, s[0:1]
871; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
872; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[2:3]
873; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
874; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
875; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
876; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[6:7]
877; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
878; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
879; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v15, vcc
880; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
881; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v17, vcc
882; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
883; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc
884; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
885; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc
886; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
887; GCN-NEXT:    s_endpgm
888;
889; GFX1030-LABEL: udiv_v4i32:
890; GFX1030:       ; %bb.0:
891; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
892; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
893; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
894; GFX1030-NEXT:    s_clause 0x1
895; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
896; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
897; GFX1030-NEXT:    s_waitcnt vmcnt(1)
898; GFX1030-NEXT:    v_cvt_f32_u32_e32 v9, v0
899; GFX1030-NEXT:    v_cvt_f32_u32_e32 v10, v1
900; GFX1030-NEXT:    v_cvt_f32_u32_e32 v11, v2
901; GFX1030-NEXT:    v_cvt_f32_u32_e32 v12, v3
902; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, 0, v0
903; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v9, v9
904; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v10, v10
905; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v11, v11
906; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v12, v12
907; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
908; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
909; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
910; GFX1030-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
911; GFX1030-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
912; GFX1030-NEXT:    v_mul_f32_e32 v11, 0x4f7ffffe, v11
913; GFX1030-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
914; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
915; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
916; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
917; GFX1030-NEXT:    v_cvt_u32_f32_e32 v12, v12
918; GFX1030-NEXT:    v_mul_lo_u32 v13, v13, v9
919; GFX1030-NEXT:    v_mul_lo_u32 v14, v14, v10
920; GFX1030-NEXT:    v_mul_lo_u32 v15, v15, v11
921; GFX1030-NEXT:    v_mul_lo_u32 v16, v16, v12
922; GFX1030-NEXT:    v_mul_hi_u32 v13, v9, v13
923; GFX1030-NEXT:    v_mul_hi_u32 v14, v10, v14
924; GFX1030-NEXT:    v_mul_hi_u32 v15, v11, v15
925; GFX1030-NEXT:    v_mul_hi_u32 v16, v12, v16
926; GFX1030-NEXT:    v_add_nc_u32_e32 v9, v9, v13
927; GFX1030-NEXT:    v_add_nc_u32_e32 v10, v10, v14
928; GFX1030-NEXT:    v_add_nc_u32_e32 v11, v11, v15
929; GFX1030-NEXT:    v_add_nc_u32_e32 v12, v12, v16
930; GFX1030-NEXT:    s_waitcnt vmcnt(0)
931; GFX1030-NEXT:    v_mul_hi_u32 v9, v4, v9
932; GFX1030-NEXT:    v_mul_hi_u32 v10, v5, v10
933; GFX1030-NEXT:    v_mul_hi_u32 v11, v6, v11
934; GFX1030-NEXT:    v_mul_hi_u32 v12, v7, v12
935; GFX1030-NEXT:    v_mul_lo_u32 v13, v9, v0
936; GFX1030-NEXT:    v_mul_lo_u32 v14, v10, v1
937; GFX1030-NEXT:    v_mul_lo_u32 v15, v11, v2
938; GFX1030-NEXT:    v_mul_lo_u32 v16, v12, v3
939; GFX1030-NEXT:    v_add_nc_u32_e32 v17, 1, v9
940; GFX1030-NEXT:    v_add_nc_u32_e32 v18, 1, v10
941; GFX1030-NEXT:    v_add_nc_u32_e32 v19, 1, v11
942; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v13
943; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v5, v14
944; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v6, v15
945; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v7, v16
946; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
947; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
948; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, v4, v0
949; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v5, v1
950; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, v5, v1
951; GFX1030-NEXT:    v_cmp_ge_u32_e64 s1, v6, v2
952; GFX1030-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc_lo
953; GFX1030-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
954; GFX1030-NEXT:    v_cndmask_b32_e64 v10, v10, v18, s0
955; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, v6, v2
956; GFX1030-NEXT:    v_cmp_ge_u32_e64 s2, v7, v3
957; GFX1030-NEXT:    v_add_nc_u32_e32 v14, 1, v9
958; GFX1030-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s0
959; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
960; GFX1030-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s1
961; GFX1030-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s2
962; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, v7, v3
963; GFX1030-NEXT:    v_add_nc_u32_e32 v15, 1, v10
964; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s1
965; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v9, v14, vcc_lo
966; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v5, v1
967; GFX1030-NEXT:    v_add_nc_u32_e32 v16, 1, v11
968; GFX1030-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
969; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
970; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v10, v15, vcc_lo
971; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v6, v2
972; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v11, v16, vcc_lo
973; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v7, v3
974; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc_lo
975; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
976; GFX1030-NEXT:    s_endpgm
977;
978; EG-LABEL: udiv_v4i32:
979; EG:       ; %bb.0:
980; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
981; EG-NEXT:    TEX 1 @6
982; EG-NEXT:    ALU 65, @11, KC0[CB0:0-32], KC1[]
983; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
984; EG-NEXT:    CF_END
985; EG-NEXT:    PAD
986; EG-NEXT:    Fetch clause starting at 6:
987; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
988; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
989; EG-NEXT:    ALU clause starting at 10:
990; EG-NEXT:     MOV * T0.X, KC0[2].Z,
991; EG-NEXT:    ALU clause starting at 11:
992; EG-NEXT:     SUB_INT T2.W, 0.0, T1.W,
993; EG-NEXT:     RECIP_UINT * T2.X, T1.W,
994; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
995; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
996; EG-NEXT:     ADD_INT * T2.W, T2.X, PS,
997; EG-NEXT:     MULHI * T2.X, T0.W, PV.W,
998; EG-NEXT:     MULLO_INT * T2.Y, PS, T1.W,
999; EG-NEXT:     SUB_INT T2.W, 0.0, T1.X,
1000; EG-NEXT:     RECIP_UINT * T2.Z, T1.X,
1001; EG-NEXT:     MULLO_INT * T2.W, PV.W, PS,
1002; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Y,
1003; EG-NEXT:     RECIP_UINT * T3.X, T1.Y,
1004; EG-NEXT:     MULLO_INT * T3.Y, PV.W, PS,
1005; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Z,
1006; EG-NEXT:     RECIP_UINT * T3.Z, T1.Z,
1007; EG-NEXT:     MULLO_INT * T3.W, PV.W, PS,
1008; EG-NEXT:     MULHI * T3.W, T3.Z, PS,
1009; EG-NEXT:     ADD_INT T3.W, T3.Z, PS,
1010; EG-NEXT:     MULHI * T3.Y, T3.X, T3.Y,
1011; EG-NEXT:     ADD_INT T4.W, T3.X, PS,
1012; EG-NEXT:     MULHI * T3.X, T0.Z, PV.W,
1013; EG-NEXT:     MULHI * T3.Y, T0.Y, PV.W,
1014; EG-NEXT:     MULLO_INT * T3.Z, PS, T1.Y,
1015; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1016; EG-NEXT:     MULLO_INT * T0.Y, T3.X, T1.Z,
1017; EG-NEXT:     SUB_INT T4.X, T0.Z, PS,
1018; EG-NEXT:     ADD_INT T0.Y, T3.Y, 1,
1019; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.Y,
1020; EG-NEXT:     SUB_INT T4.W, PV.W, T1.Y,
1021; EG-NEXT:     MULHI * T2.W, T2.Z, T2.W,
1022; EG-NEXT:     CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1023; EG-NEXT:     CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1024; EG-NEXT:     SETGE_UINT T0.Z, PV.X, T1.Z,
1025; EG-NEXT:     ADD_INT T2.W, T2.Z, PS,
1026; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.Y,
1027; EG-NEXT:     ADD_INT T6.X, T3.X, 1,
1028; EG-NEXT:     ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1029; EG-NEXT:     SETGE_UINT T2.Z, PS, T1.W,
1030; EG-NEXT:     SUB_INT T3.W, PS, T1.W,
1031; EG-NEXT:     MULHI * T2.W, T0.X, PV.W,
1032; EG-NEXT:     SUB_INT T7.X, T4.X, T1.Z,
1033; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1034; EG-NEXT:     CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1035; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1036; EG-NEXT:     MULLO_INT * T2.X, T2.W, T1.X,
1037; EG-NEXT:     ADD_INT T3.X, T0.W, 1,
1038; EG-NEXT:     ADD_INT T2.Y, T2.Z, 1,
1039; EG-NEXT:     SETGE_UINT T3.Z, T3.Y, T1.W,
1040; EG-NEXT:     SUB_INT T1.W, T0.X, PS, BS:VEC_201
1041; EG-NEXT:     CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1042; EG-NEXT:     SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1043; EG-NEXT:     ADD_INT T3.Y, T2.W, 1,
1044; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.X,
1045; EG-NEXT:     SUB_INT T3.W, PV.W, T1.X,
1046; EG-NEXT:     CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1047; EG-NEXT:     CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1048; EG-NEXT:     CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1049; EG-NEXT:     CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1050; EG-NEXT:     ADD_INT T0.W, T0.Y, 1,
1051; EG-NEXT:     SETGE_UINT * T1.W, T5.X, T1.Y,
1052; EG-NEXT:     CNDE_INT T4.Y, PS, T0.Y, PV.W,
1053; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
1054; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.X,
1055; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
1056; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1057; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1058  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1059  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
1060  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
1061  %result = udiv <4 x i32> %a, %b
1062  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1063  ret void
1064}
1065
; udiv_i32_div_pow2: unsigned division of a loaded i32 by the power-of-two
; constant 16.
;   %out - global (addrspace 1) pointer that receives the i32 quotient
;   %in  - global (addrspace 1) pointer to the i32 dividend
; The assertions below expect every run line to emit a single logical shift
; right by 4 (v_lshrrev_b32 on the amdgcn runs, LSHR on the r600 run) and no
; multiply or reciprocal sequence.
1066define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1067; SI-LABEL: udiv_i32_div_pow2:
1068; SI:       ; %bb.0:
1069; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1070; SI-NEXT:    s_mov_b32 s7, 0xf000
1071; SI-NEXT:    s_mov_b32 s6, -1
1072; SI-NEXT:    s_mov_b32 s10, s6
1073; SI-NEXT:    s_mov_b32 s11, s7
1074; SI-NEXT:    s_waitcnt lgkmcnt(0)
1075; SI-NEXT:    s_mov_b32 s8, s2
1076; SI-NEXT:    s_mov_b32 s9, s3
1077; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1078; SI-NEXT:    s_mov_b32 s4, s0
1079; SI-NEXT:    s_mov_b32 s5, s1
1080; SI-NEXT:    s_waitcnt vmcnt(0)
1081; SI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
1082; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1083; SI-NEXT:    s_endpgm
1084;
1085; VI-LABEL: udiv_i32_div_pow2:
1086; VI:       ; %bb.0:
1087; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1088; VI-NEXT:    s_mov_b32 s7, 0xf000
1089; VI-NEXT:    s_mov_b32 s6, -1
1090; VI-NEXT:    s_mov_b32 s10, s6
1091; VI-NEXT:    s_mov_b32 s11, s7
1092; VI-NEXT:    s_waitcnt lgkmcnt(0)
1093; VI-NEXT:    s_mov_b32 s8, s2
1094; VI-NEXT:    s_mov_b32 s9, s3
1095; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1096; VI-NEXT:    s_mov_b32 s4, s0
1097; VI-NEXT:    s_mov_b32 s5, s1
1098; VI-NEXT:    s_waitcnt vmcnt(0)
1099; VI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
1100; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1101; VI-NEXT:    s_endpgm
1102;
1103; GCN-LABEL: udiv_i32_div_pow2:
1104; GCN:       ; %bb.0:
1105; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1106; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1107; GCN-NEXT:    v_mov_b32_e32 v0, s2
1108; GCN-NEXT:    v_mov_b32_e32 v1, s3
1109; GCN-NEXT:    flat_load_dword v2, v[0:1]
1110; GCN-NEXT:    v_mov_b32_e32 v0, s0
1111; GCN-NEXT:    v_mov_b32_e32 v1, s1
1112; GCN-NEXT:    s_waitcnt vmcnt(0)
1113; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
1114; GCN-NEXT:    flat_store_dword v[0:1], v2
1115; GCN-NEXT:    s_endpgm
1116;
1117; GFX1030-LABEL: udiv_i32_div_pow2:
1118; GFX1030:       ; %bb.0:
1119; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1120; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1121; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1123; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1124; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
1125; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1126; GFX1030-NEXT:    s_endpgm
1127;
1128; EG-LABEL: udiv_i32_div_pow2:
1129; EG:       ; %bb.0:
1130; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1131; EG-NEXT:    TEX 0 @6
1132; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1133; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1134; EG-NEXT:    CF_END
1135; EG-NEXT:    PAD
1136; EG-NEXT:    Fetch clause starting at 6:
1137; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1138; EG-NEXT:    ALU clause starting at 8:
1139; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1140; EG-NEXT:    ALU clause starting at 9:
1141; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
1142; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1143; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
; Only the dividend comes from memory; the divisor is the immediate 16.
1145  %a = load i32, i32 addrspace(1)* %in
1146  %result = udiv i32 %a, 16
1147  store i32 %result, i32 addrspace(1)* %out
1148  ret void
1149}
1150
; udiv_i32_div_k_even: unsigned division of a loaded i32 by the even,
; non-power-of-two constant 34259182.
;   %out - global (addrspace 1) pointer that receives the i32 quotient
;   %in  - global (addrspace 1) pointer to the i32 dividend
; The assertions below expect the division to become a multiply-high by
; 0xfabbd9c1 followed by a logical shift right by 25. The r600 literal dump
; shows the same 32-bit multiplier printed as the signed value -88352319.
1151define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1152; SI-LABEL: udiv_i32_div_k_even:
1153; SI:       ; %bb.0:
1154; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1155; SI-NEXT:    s_mov_b32 s7, 0xf000
1156; SI-NEXT:    s_mov_b32 s6, -1
1157; SI-NEXT:    s_mov_b32 s10, s6
1158; SI-NEXT:    s_mov_b32 s11, s7
1159; SI-NEXT:    s_waitcnt lgkmcnt(0)
1160; SI-NEXT:    s_mov_b32 s8, s2
1161; SI-NEXT:    s_mov_b32 s9, s3
1162; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1163; SI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1164; SI-NEXT:    s_mov_b32 s4, s0
1165; SI-NEXT:    s_mov_b32 s5, s1
1166; SI-NEXT:    s_waitcnt vmcnt(0)
1167; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
1168; SI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
1169; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1170; SI-NEXT:    s_endpgm
1171;
1172; VI-LABEL: udiv_i32_div_k_even:
1173; VI:       ; %bb.0:
1174; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1175; VI-NEXT:    s_mov_b32 s7, 0xf000
1176; VI-NEXT:    s_mov_b32 s6, -1
1177; VI-NEXT:    s_mov_b32 s10, s6
1178; VI-NEXT:    s_mov_b32 s11, s7
1179; VI-NEXT:    s_waitcnt lgkmcnt(0)
1180; VI-NEXT:    s_mov_b32 s8, s2
1181; VI-NEXT:    s_mov_b32 s9, s3
1182; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1183; VI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1184; VI-NEXT:    s_mov_b32 s4, s0
1185; VI-NEXT:    s_mov_b32 s5, s1
1186; VI-NEXT:    s_waitcnt vmcnt(0)
1187; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
1188; VI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
1189; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1190; VI-NEXT:    s_endpgm
1191;
1192; GCN-LABEL: udiv_i32_div_k_even:
1193; GCN:       ; %bb.0:
1194; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1195; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1196; GCN-NEXT:    v_mov_b32_e32 v0, s2
1197; GCN-NEXT:    v_mov_b32_e32 v1, s3
1198; GCN-NEXT:    flat_load_dword v0, v[0:1]
1199; GCN-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1200; GCN-NEXT:    v_mov_b32_e32 v1, s1
1201; GCN-NEXT:    s_waitcnt vmcnt(0)
1202; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
1203; GCN-NEXT:    v_mov_b32_e32 v0, s0
1204; GCN-NEXT:    v_lshrrev_b32_e32 v2, 25, v2
1205; GCN-NEXT:    flat_store_dword v[0:1], v2
1206; GCN-NEXT:    s_endpgm
1207;
1208; GFX1030-LABEL: udiv_i32_div_k_even:
1209; GFX1030:       ; %bb.0:
1210; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1211; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1212; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1213; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1214; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1215; GFX1030-NEXT:    v_mul_hi_u32 v1, 0xfabbd9c1, v1
1216; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
1217; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1218; GFX1030-NEXT:    s_endpgm
1219;
1220; EG-LABEL: udiv_i32_div_k_even:
1221; EG:       ; %bb.0:
1222; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1223; EG-NEXT:    TEX 0 @6
1224; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1225; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1226; EG-NEXT:    CF_END
1227; EG-NEXT:    PAD
1228; EG-NEXT:    Fetch clause starting at 6:
1229; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1230; EG-NEXT:    ALU clause starting at 8:
1231; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1232; EG-NEXT:    ALU clause starting at 9:
1233; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
1234; EG-NEXT:    -88352319(-4.876880e+35), 0(0.000000e+00)
1235; EG-NEXT:     LSHR T0.X, PS, literal.x,
1236; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1237; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
; Only the dividend comes from memory; the divisor is an immediate.
1239  %a = load i32, i32 addrspace(1)* %in
1240  %result = udiv i32 %a, 34259182
1241  store i32 %result, i32 addrspace(1)* %out
1242  ret void
1243}
1244
; udiv_i32_div_k_odd: unsigned division of a loaded i32 by the odd constant
; 34259183.
;   %out - global (addrspace 1) pointer that receives the i32 quotient
;   %in  - global (addrspace 1) pointer to the i32 dividend
; The assertions below expect the division to become a multiply-high by
; 0x7d5deca3 (printed as 2103307427 in the r600 literal dump) followed by a
; logical shift right by 24.
1245define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1246; SI-LABEL: udiv_i32_div_k_odd:
1247; SI:       ; %bb.0:
1248; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1249; SI-NEXT:    s_mov_b32 s7, 0xf000
1250; SI-NEXT:    s_mov_b32 s6, -1
1251; SI-NEXT:    s_mov_b32 s10, s6
1252; SI-NEXT:    s_mov_b32 s11, s7
1253; SI-NEXT:    s_waitcnt lgkmcnt(0)
1254; SI-NEXT:    s_mov_b32 s8, s2
1255; SI-NEXT:    s_mov_b32 s9, s3
1256; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1257; SI-NEXT:    s_mov_b32 s2, 0x7d5deca3
1258; SI-NEXT:    s_mov_b32 s4, s0
1259; SI-NEXT:    s_mov_b32 s5, s1
1260; SI-NEXT:    s_waitcnt vmcnt(0)
1261; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
1262; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1263; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1264; SI-NEXT:    s_endpgm
1265;
1266; VI-LABEL: udiv_i32_div_k_odd:
1267; VI:       ; %bb.0:
1268; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1269; VI-NEXT:    s_mov_b32 s7, 0xf000
1270; VI-NEXT:    s_mov_b32 s6, -1
1271; VI-NEXT:    s_mov_b32 s10, s6
1272; VI-NEXT:    s_mov_b32 s11, s7
1273; VI-NEXT:    s_waitcnt lgkmcnt(0)
1274; VI-NEXT:    s_mov_b32 s8, s2
1275; VI-NEXT:    s_mov_b32 s9, s3
1276; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1277; VI-NEXT:    s_mov_b32 s2, 0x7d5deca3
1278; VI-NEXT:    s_mov_b32 s4, s0
1279; VI-NEXT:    s_mov_b32 s5, s1
1280; VI-NEXT:    s_waitcnt vmcnt(0)
1281; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
1282; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1283; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1284; VI-NEXT:    s_endpgm
1285;
1286; GCN-LABEL: udiv_i32_div_k_odd:
1287; GCN:       ; %bb.0:
1288; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1289; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1290; GCN-NEXT:    v_mov_b32_e32 v0, s2
1291; GCN-NEXT:    v_mov_b32_e32 v1, s3
1292; GCN-NEXT:    flat_load_dword v0, v[0:1]
1293; GCN-NEXT:    s_mov_b32 s2, 0x7d5deca3
1294; GCN-NEXT:    v_mov_b32_e32 v1, s1
1295; GCN-NEXT:    s_waitcnt vmcnt(0)
1296; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
1297; GCN-NEXT:    v_mov_b32_e32 v0, s0
1298; GCN-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1299; GCN-NEXT:    flat_store_dword v[0:1], v2
1300; GCN-NEXT:    s_endpgm
1301;
1302; GFX1030-LABEL: udiv_i32_div_k_odd:
1303; GFX1030:       ; %bb.0:
1304; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1305; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1306; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1307; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1308; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1309; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x7d5deca3, v1
1310; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1311; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1312; GFX1030-NEXT:    s_endpgm
1313;
1314; EG-LABEL: udiv_i32_div_k_odd:
1315; EG:       ; %bb.0:
1316; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1317; EG-NEXT:    TEX 0 @6
1318; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1319; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1320; EG-NEXT:    CF_END
1321; EG-NEXT:    PAD
1322; EG-NEXT:    Fetch clause starting at 6:
1323; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1324; EG-NEXT:    ALU clause starting at 8:
1325; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1326; EG-NEXT:    ALU clause starting at 9:
1327; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
1328; EG-NEXT:    2103307427(1.843675e+37), 0(0.000000e+00)
1329; EG-NEXT:     LSHR T0.X, PS, literal.x,
1330; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1331; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
; Only the dividend comes from memory; the divisor is an immediate.
1333  %a = load i32, i32 addrspace(1)* %in
1334  %result = udiv i32 %a, 34259183
1335  store i32 %result, i32 addrspace(1)* %out
1336  ret void
1337}
1338
; v_udiv_i8: unsigned 8-bit division where both operands are loaded from
; memory; the i8 quotient is zero-extended to i32 before being stored.
;   %out - global (addrspace 1) pointer that receives the i32 result
;   %in  - global (addrspace 1) pointer to the numerator byte; the
;          denominator is the byte at %in + 1
; The assertions below expect a floating-point expansion rather than an
; integer one: both bytes are converted to f32, the denominator's reciprocal
; is multiplied by the numerator, the product is truncated, a multiply-add
; forms the remainder, a compare of |remainder| against the denominator
; conditionally adds 1, and the result is masked with 0xff.
; On the fiji and gfx1030 runs the two byte loads are expected to be merged
; into one ushort load that is split with the ubyte0/ubyte1 conversions.
1339define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1340; SI-LABEL: v_udiv_i8:
1341; SI:       ; %bb.0:
1342; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1343; SI-NEXT:    s_mov_b32 s7, 0xf000
1344; SI-NEXT:    s_mov_b32 s6, -1
1345; SI-NEXT:    s_mov_b32 s10, s6
1346; SI-NEXT:    s_mov_b32 s11, s7
1347; SI-NEXT:    s_waitcnt lgkmcnt(0)
1348; SI-NEXT:    s_mov_b32 s8, s2
1349; SI-NEXT:    s_mov_b32 s9, s3
1350; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1351; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
1352; SI-NEXT:    s_mov_b32 s4, s0
1353; SI-NEXT:    s_mov_b32 s5, s1
1354; SI-NEXT:    s_waitcnt vmcnt(1)
1355; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1356; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1357; SI-NEXT:    s_waitcnt vmcnt(0)
1358; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1359; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1360; SI-NEXT:    v_trunc_f32_e32 v2, v2
1361; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1362; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1363; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1364; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1365; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
1366; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1367; SI-NEXT:    s_endpgm
1368;
1369; VI-LABEL: v_udiv_i8:
1370; VI:       ; %bb.0:
1371; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1372; VI-NEXT:    s_mov_b32 s7, 0xf000
1373; VI-NEXT:    s_mov_b32 s6, -1
1374; VI-NEXT:    s_mov_b32 s10, s6
1375; VI-NEXT:    s_mov_b32 s11, s7
1376; VI-NEXT:    s_waitcnt lgkmcnt(0)
1377; VI-NEXT:    s_mov_b32 s8, s2
1378; VI-NEXT:    s_mov_b32 s9, s3
1379; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1380; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
1381; VI-NEXT:    s_mov_b32 s4, s0
1382; VI-NEXT:    s_mov_b32 s5, s1
1383; VI-NEXT:    s_waitcnt vmcnt(1)
1384; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1385; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1386; VI-NEXT:    s_waitcnt vmcnt(0)
1387; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1388; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1389; VI-NEXT:    v_trunc_f32_e32 v2, v2
1390; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1391; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1392; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1393; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1394; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
1395; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1396; VI-NEXT:    s_endpgm
1397;
1398; GCN-LABEL: v_udiv_i8:
1399; GCN:       ; %bb.0:
1400; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1401; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1402; GCN-NEXT:    v_mov_b32_e32 v0, s2
1403; GCN-NEXT:    v_mov_b32_e32 v1, s3
1404; GCN-NEXT:    flat_load_ushort v2, v[0:1]
1405; GCN-NEXT:    v_mov_b32_e32 v0, s0
1406; GCN-NEXT:    v_mov_b32_e32 v1, s1
1407; GCN-NEXT:    s_waitcnt vmcnt(0)
1408; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
1409; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
1410; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1411; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
1412; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1413; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1414; GCN-NEXT:    v_mad_f32 v2, -v4, v3, v2
1415; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
1416; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1417; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
1418; GCN-NEXT:    flat_store_dword v[0:1], v2
1419; GCN-NEXT:    s_endpgm
1420;
1421; GFX1030-LABEL: v_udiv_i8:
1422; GFX1030:       ; %bb.0:
1423; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1424; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1425; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3]
1427; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1428; GFX1030-NEXT:    v_cvt_f32_ubyte1_e32 v2, v1
1429; GFX1030-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1430; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
1431; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
1432; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1433; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
1434; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1435; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
1436; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1437; GFX1030-NEXT:    v_and_b32_e32 v1, 0xff, v1
1438; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1439; GFX1030-NEXT:    s_endpgm
1440;
1441; EG-LABEL: v_udiv_i8:
1442; EG:       ; %bb.0:
1443; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1444; EG-NEXT:    TEX 1 @6
1445; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1446; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1447; EG-NEXT:    CF_END
1448; EG-NEXT:    PAD
1449; EG-NEXT:    Fetch clause starting at 6:
1450; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
1451; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1452; EG-NEXT:    ALU clause starting at 10:
1453; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1454; EG-NEXT:    ALU clause starting at 11:
1455; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1456; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1457; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1458; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1459; EG-NEXT:     TRUNC * T0.W, PV.W,
1460; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1461; EG-NEXT:     TRUNC * T0.W, PV.W,
1462; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1463; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1464; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1465; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1466; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1467; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1468; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1469; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
; The denominator is the byte immediately after the numerator.
1470  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
1471  %num = load i8, i8 addrspace(1) * %in
1472  %den = load i8, i8 addrspace(1) * %den_ptr
1473  %result = udiv i8 %num, %den
; Widen the 8-bit quotient so a full dword is stored to %out.
1474  %result.ext = zext i8 %result to i32
1475  store i32 %result.ext, i32 addrspace(1)* %out
1476  ret void
1477}
1478
; Unsigned division of two i16 values read from consecutive elements of %in
; (numerator at element 0, denominator at element 1). The quotient is
; zero-extended to i32 and stored to %out.
;
; In the SI, VI, GCN and GFX1030 check blocks below, the expected code converts
; both operands to f32, multiplies the numerator by a reciprocal of the
; denominator, truncates, and adds one to the integer quotient when the
; remainder compare succeeds. The result is then masked with 0xffff. The EG
; check block has the same shape and masks with the literal 65535.
1479define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
1480; SI-LABEL: v_udiv_i16:
1481; SI:       ; %bb.0:
1482; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1483; SI-NEXT:    s_mov_b32 s7, 0xf000
1484; SI-NEXT:    s_mov_b32 s6, -1
1485; SI-NEXT:    s_mov_b32 s10, s6
1486; SI-NEXT:    s_mov_b32 s11, s7
1487; SI-NEXT:    s_waitcnt lgkmcnt(0)
1488; SI-NEXT:    s_mov_b32 s8, s2
1489; SI-NEXT:    s_mov_b32 s9, s3
1490; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1491; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1492; SI-NEXT:    s_mov_b32 s4, s0
1493; SI-NEXT:    s_mov_b32 s5, s1
1494; SI-NEXT:    s_waitcnt vmcnt(1)
1495; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1496; SI-NEXT:    s_waitcnt vmcnt(0)
1497; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1498; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1499; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1500; SI-NEXT:    v_trunc_f32_e32 v2, v2
1501; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1502; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1503; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1504; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1505; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1506; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1507; SI-NEXT:    s_endpgm
1508;
1509; VI-LABEL: v_udiv_i16:
1510; VI:       ; %bb.0:
1511; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1512; VI-NEXT:    s_mov_b32 s7, 0xf000
1513; VI-NEXT:    s_mov_b32 s6, -1
1514; VI-NEXT:    s_mov_b32 s10, s6
1515; VI-NEXT:    s_mov_b32 s11, s7
1516; VI-NEXT:    s_waitcnt lgkmcnt(0)
1517; VI-NEXT:    s_mov_b32 s8, s2
1518; VI-NEXT:    s_mov_b32 s9, s3
1519; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1520; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1521; VI-NEXT:    s_mov_b32 s4, s0
1522; VI-NEXT:    s_mov_b32 s5, s1
1523; VI-NEXT:    s_waitcnt vmcnt(1)
1524; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1525; VI-NEXT:    s_waitcnt vmcnt(0)
1526; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1527; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1528; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1529; VI-NEXT:    v_trunc_f32_e32 v2, v2
1530; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1531; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1532; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1533; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1534; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1535; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1536; VI-NEXT:    s_endpgm
1537;
1538; GCN-LABEL: v_udiv_i16:
1539; GCN:       ; %bb.0:
1540; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1541; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1542; GCN-NEXT:    s_add_u32 s4, s2, 2
1543; GCN-NEXT:    s_addc_u32 s5, s3, 0
1544; GCN-NEXT:    v_mov_b32_e32 v0, s4
1545; GCN-NEXT:    v_mov_b32_e32 v1, s5
1546; GCN-NEXT:    flat_load_ushort v2, v[0:1]
1547; GCN-NEXT:    v_mov_b32_e32 v0, s2
1548; GCN-NEXT:    v_mov_b32_e32 v1, s3
1549; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1550; GCN-NEXT:    v_mov_b32_e32 v1, s1
1551; GCN-NEXT:    s_waitcnt vmcnt(1)
1552; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1553; GCN-NEXT:    s_waitcnt vmcnt(0)
1554; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1555; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1556; GCN-NEXT:    v_mov_b32_e32 v0, s0
1557; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1558; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1559; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1560; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1561; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1562; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1563; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1564; GCN-NEXT:    flat_store_dword v[0:1], v2
1565; GCN-NEXT:    s_endpgm
1566;
1567; GFX1030-LABEL: v_udiv_i16:
1568; GFX1030:       ; %bb.0:
1569; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1570; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1571; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX1030-NEXT:    s_clause 0x1
1573; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3] offset:2
1574; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3]
1575; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1576; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1577; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1578; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1579; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1580; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1581; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1582; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1583; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1584; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1585; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1586; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1587; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1588; GFX1030-NEXT:    s_endpgm
1589;
1590; EG-LABEL: v_udiv_i16:
1591; EG:       ; %bb.0:
1592; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1593; EG-NEXT:    TEX 1 @6
1594; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1595; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1596; EG-NEXT:    CF_END
1597; EG-NEXT:    PAD
1598; EG-NEXT:    Fetch clause starting at 6:
1599; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1600; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1601; EG-NEXT:    ALU clause starting at 10:
1602; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1603; EG-NEXT:    ALU clause starting at 11:
1604; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1605; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1606; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1607; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1608; EG-NEXT:     TRUNC * T0.W, PV.W,
1609; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1610; EG-NEXT:     TRUNC * T0.W, PV.W,
1611; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1612; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1613; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1614; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1615; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1616; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1617; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1618; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
  ; %den_ptr addresses the i16 element immediately after the numerator.
1619  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
1620  %num = load i16, i16 addrspace(1) * %in
1621  %den = load i16, i16 addrspace(1) * %den_ptr
1622  %result = udiv i16 %num, %den
  ; Widen the i16 quotient to i32 before storing it to the i32 output pointer.
1623  %result.ext = zext i16 %result to i32
1624  store i32 %result.ext, i32 addrspace(1)* %out
1625  ret void
1626}
1627
; Unsigned division of two i23 values read from consecutive elements of %in
; (numerator at element 0, denominator at element 1). The quotient is
; zero-extended to i32 and stored to %out.
;
; In the check blocks below each i23 operand is assembled from a 16-bit load
; plus an 8-bit load at byte offset +2, combined with a shift left by 16 and an
; OR. The denominator pieces are read from byte offsets 4 and 6.
; The division itself is expected to use the f32-based sequence (convert,
; reciprocal, multiply, truncate, compare, add-with-carry), and the result is
; masked with 0x7fffff (8388607 in the EG literals).
1628define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1629; SI-LABEL: v_udiv_i23:
1630; SI:       ; %bb.0:
1631; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1632; SI-NEXT:    s_mov_b32 s7, 0xf000
1633; SI-NEXT:    s_mov_b32 s6, -1
1634; SI-NEXT:    s_mov_b32 s10, s6
1635; SI-NEXT:    s_mov_b32 s11, s7
1636; SI-NEXT:    s_waitcnt lgkmcnt(0)
1637; SI-NEXT:    s_mov_b32 s8, s2
1638; SI-NEXT:    s_mov_b32 s9, s3
1639; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1640; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1641; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1642; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1643; SI-NEXT:    s_mov_b32 s4, s0
1644; SI-NEXT:    s_mov_b32 s5, s1
1645; SI-NEXT:    s_waitcnt vmcnt(3)
1646; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1647; SI-NEXT:    s_waitcnt vmcnt(2)
1648; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1649; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1650; SI-NEXT:    s_waitcnt vmcnt(1)
1651; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1652; SI-NEXT:    s_waitcnt vmcnt(0)
1653; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1654; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1655; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1656; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1657; SI-NEXT:    v_trunc_f32_e32 v2, v2
1658; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1659; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1660; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1661; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1662; SI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1663; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1664; SI-NEXT:    s_endpgm
1665;
1666; VI-LABEL: v_udiv_i23:
1667; VI:       ; %bb.0:
1668; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1669; VI-NEXT:    s_mov_b32 s7, 0xf000
1670; VI-NEXT:    s_mov_b32 s6, -1
1671; VI-NEXT:    s_mov_b32 s10, s6
1672; VI-NEXT:    s_mov_b32 s11, s7
1673; VI-NEXT:    s_waitcnt lgkmcnt(0)
1674; VI-NEXT:    s_mov_b32 s8, s2
1675; VI-NEXT:    s_mov_b32 s9, s3
1676; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1677; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1678; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1679; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1680; VI-NEXT:    s_mov_b32 s4, s0
1681; VI-NEXT:    s_mov_b32 s5, s1
1682; VI-NEXT:    s_waitcnt vmcnt(3)
1683; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1684; VI-NEXT:    s_waitcnt vmcnt(2)
1685; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1686; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1687; VI-NEXT:    s_waitcnt vmcnt(1)
1688; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1689; VI-NEXT:    s_waitcnt vmcnt(0)
1690; VI-NEXT:    v_or_b32_e32 v1, v3, v1
1691; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1692; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1693; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1694; VI-NEXT:    v_trunc_f32_e32 v2, v2
1695; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1696; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1697; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1698; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1699; VI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1700; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1701; VI-NEXT:    s_endpgm
1702;
1703; GCN-LABEL: v_udiv_i23:
1704; GCN:       ; %bb.0:
1705; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1706; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1707; GCN-NEXT:    s_add_u32 s4, s2, 4
1708; GCN-NEXT:    s_addc_u32 s5, s3, 0
1709; GCN-NEXT:    s_add_u32 s6, s2, 2
1710; GCN-NEXT:    s_addc_u32 s7, s3, 0
1711; GCN-NEXT:    v_mov_b32_e32 v0, s6
1712; GCN-NEXT:    v_mov_b32_e32 v1, s7
1713; GCN-NEXT:    s_add_u32 s6, s2, 6
1714; GCN-NEXT:    s_addc_u32 s7, s3, 0
1715; GCN-NEXT:    v_mov_b32_e32 v2, s6
1716; GCN-NEXT:    v_mov_b32_e32 v3, s7
1717; GCN-NEXT:    v_mov_b32_e32 v4, s4
1718; GCN-NEXT:    v_mov_b32_e32 v5, s5
1719; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
1720; GCN-NEXT:    flat_load_ushort v4, v[4:5]
1721; GCN-NEXT:    v_mov_b32_e32 v2, s2
1722; GCN-NEXT:    v_mov_b32_e32 v3, s3
1723; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
1724; GCN-NEXT:    flat_load_ushort v1, v[2:3]
1725; GCN-NEXT:    s_waitcnt vmcnt(3)
1726; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
1727; GCN-NEXT:    s_waitcnt vmcnt(2)
1728; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
1729; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1730; GCN-NEXT:    s_waitcnt vmcnt(1)
1731; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1732; GCN-NEXT:    s_waitcnt vmcnt(0)
1733; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
1734; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1735; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1736; GCN-NEXT:    v_mov_b32_e32 v0, s0
1737; GCN-NEXT:    v_mov_b32_e32 v1, s1
1738; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1739; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1740; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1741; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1742; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1743; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1744; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v2
1745; GCN-NEXT:    flat_store_dword v[0:1], v2
1746; GCN-NEXT:    s_endpgm
1747;
1748; GFX1030-LABEL: v_udiv_i23:
1749; GFX1030:       ; %bb.0:
1750; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1751; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1752; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX1030-NEXT:    s_clause 0x3
1754; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1755; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1756; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1757; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1758; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1759; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1760; GFX1030-NEXT:    s_waitcnt vmcnt(2)
1761; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1762; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1763; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1764; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1765; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1766; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
1767; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1768; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1769; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1770; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1771; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1772; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1773; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1774; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1775; GFX1030-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
1776; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1777; GFX1030-NEXT:    s_endpgm
1778;
1779; EG-LABEL: v_udiv_i23:
1780; EG:       ; %bb.0:
1781; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1782; EG-NEXT:    TEX 3 @6
1783; EG-NEXT:    ALU 20, @15, KC0[CB0:0-32], KC1[]
1784; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1785; EG-NEXT:    CF_END
1786; EG-NEXT:    PAD
1787; EG-NEXT:    Fetch clause starting at 6:
1788; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1789; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1790; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1791; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1792; EG-NEXT:    ALU clause starting at 14:
1793; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1794; EG-NEXT:    ALU clause starting at 15:
1795; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1796; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1797; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1798; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1799; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1800; EG-NEXT:     UINT_TO_FLT * T0.X, PV.W,
1801; EG-NEXT:     OR_INT T0.W, T2.X, T1.W,
1802; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1803; EG-NEXT:     UINT_TO_FLT * T0.Z, PV.W,
1804; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Y,
1805; EG-NEXT:     TRUNC * T0.W, PV.W,
1806; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
1807; EG-NEXT:     TRUNC * T0.W, PV.W,
1808; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
1809; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1810; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1811; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1812; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1813; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1814; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1815; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
  ; %den_ptr addresses the i23 element immediately after the numerator.
1816  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1817  %num = load i23, i23 addrspace(1) * %in
1818  %den = load i23, i23 addrspace(1) * %den_ptr
1819  %result = udiv i23 %num, %den
  ; Widen the i23 quotient to i32 before storing it to the i32 output pointer.
1820  %result.ext = zext i23 %result to i32
1821  store i32 %result.ext, i32 addrspace(1)* %out
1822  ret void
1823}
1824
; Unsigned division of two i24 values read from consecutive elements of %in
; (numerator at element 0, denominator at element 1). The quotient is
; zero-extended to i32 and stored to %out.
;
; Operands are assembled exactly as in v_udiv_i23 (16-bit load plus 8-bit load
; at byte offset +2, shift by 16, OR). Unlike v_udiv_i23, the check blocks here
; expect the integer expansion: a reciprocal estimate scaled by 0x4f7ffffe
; (RECIP_UINT on EG), a mul_lo / mul_hi refinement, and two compare-and-select
; correction steps on the quotient.
; NOTE(review): presumably the extra bit of width is what keeps i24 off the
; f32-based path used for i23 - confirm against the AMDGPU division lowering.
1825define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1826; SI-LABEL: v_udiv_i24:
1827; SI:       ; %bb.0:
1828; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1829; SI-NEXT:    s_mov_b32 s7, 0xf000
1830; SI-NEXT:    s_mov_b32 s6, -1
1831; SI-NEXT:    s_mov_b32 s10, s6
1832; SI-NEXT:    s_mov_b32 s11, s7
1833; SI-NEXT:    s_waitcnt lgkmcnt(0)
1834; SI-NEXT:    s_mov_b32 s8, s2
1835; SI-NEXT:    s_mov_b32 s9, s3
1836; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1837; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1838; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1839; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1840; SI-NEXT:    s_mov_b32 s4, s0
1841; SI-NEXT:    s_mov_b32 s5, s1
1842; SI-NEXT:    s_waitcnt vmcnt(3)
1843; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1844; SI-NEXT:    s_waitcnt vmcnt(2)
1845; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1846; SI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1847; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1848; SI-NEXT:    s_waitcnt vmcnt(1)
1849; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1850; SI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1851; SI-NEXT:    s_waitcnt vmcnt(0)
1852; SI-NEXT:    v_or_b32_e32 v2, v3, v2
1853; SI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1854; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1855; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
1856; SI-NEXT:    v_mul_hi_u32 v4, v1, v4
1857; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1858; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
1859; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
1860; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1861; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1862; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1863; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1864; SI-NEXT:    v_sub_i32_e32 v3, vcc, v2, v0
1865; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1866; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
1867; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1868; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1869; SI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1870; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1871; SI-NEXT:    s_endpgm
1872;
1873; VI-LABEL: v_udiv_i24:
1874; VI:       ; %bb.0:
1875; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1876; VI-NEXT:    s_mov_b32 s7, 0xf000
1877; VI-NEXT:    s_mov_b32 s6, -1
1878; VI-NEXT:    s_mov_b32 s10, s6
1879; VI-NEXT:    s_mov_b32 s11, s7
1880; VI-NEXT:    s_waitcnt lgkmcnt(0)
1881; VI-NEXT:    s_mov_b32 s8, s2
1882; VI-NEXT:    s_mov_b32 s9, s3
1883; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1884; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1885; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1886; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1887; VI-NEXT:    s_mov_b32 s4, s0
1888; VI-NEXT:    s_mov_b32 s5, s1
1889; VI-NEXT:    s_waitcnt vmcnt(3)
1890; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1891; VI-NEXT:    s_waitcnt vmcnt(2)
1892; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1893; VI-NEXT:    v_cvt_f32_u32_e32 v1, v0
1894; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
1895; VI-NEXT:    s_waitcnt vmcnt(1)
1896; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1897; VI-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1898; VI-NEXT:    s_waitcnt vmcnt(0)
1899; VI-NEXT:    v_or_b32_e32 v2, v3, v2
1900; VI-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1901; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1902; VI-NEXT:    v_mul_lo_u32 v4, v4, v1
1903; VI-NEXT:    v_mul_hi_u32 v4, v1, v4
1904; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
1905; VI-NEXT:    v_mul_hi_u32 v1, v2, v1
1906; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
1907; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
1908; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1909; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
1910; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1911; VI-NEXT:    v_sub_u32_e32 v3, vcc, v2, v0
1912; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1913; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
1914; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1915; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1916; VI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1917; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1918; VI-NEXT:    s_endpgm
1919;
1920; GCN-LABEL: v_udiv_i24:
1921; GCN:       ; %bb.0:
1922; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1923; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1924; GCN-NEXT:    s_add_u32 s4, s2, 4
1925; GCN-NEXT:    s_addc_u32 s5, s3, 0
1926; GCN-NEXT:    s_add_u32 s6, s2, 2
1927; GCN-NEXT:    v_mov_b32_e32 v0, s4
1928; GCN-NEXT:    s_addc_u32 s7, s3, 0
1929; GCN-NEXT:    v_mov_b32_e32 v1, s5
1930; GCN-NEXT:    s_add_u32 s4, s2, 6
1931; GCN-NEXT:    s_addc_u32 s5, s3, 0
1932; GCN-NEXT:    v_mov_b32_e32 v2, s4
1933; GCN-NEXT:    v_mov_b32_e32 v3, s5
1934; GCN-NEXT:    flat_load_ubyte v4, v[2:3]
1935; GCN-NEXT:    flat_load_ushort v5, v[0:1]
1936; GCN-NEXT:    v_mov_b32_e32 v2, s6
1937; GCN-NEXT:    v_mov_b32_e32 v0, s2
1938; GCN-NEXT:    v_mov_b32_e32 v3, s7
1939; GCN-NEXT:    v_mov_b32_e32 v1, s3
1940; GCN-NEXT:    flat_load_ubyte v2, v[2:3]
1941; GCN-NEXT:    flat_load_ushort v0, v[0:1]
1942; GCN-NEXT:    s_waitcnt vmcnt(3)
1943; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
1944; GCN-NEXT:    s_waitcnt vmcnt(2)
1945; GCN-NEXT:    v_or_b32_e32 v3, v5, v1
1946; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v3
1947; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
1948; GCN-NEXT:    s_waitcnt vmcnt(1)
1949; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1950; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1951; GCN-NEXT:    s_waitcnt vmcnt(0)
1952; GCN-NEXT:    v_or_b32_e32 v2, v0, v2
1953; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1954; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1955; GCN-NEXT:    v_mul_lo_u32 v4, v4, v1
1956; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1957; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v4
1958; GCN-NEXT:    v_mul_hi_u32 v4, v2, v0
1959; GCN-NEXT:    v_mov_b32_e32 v0, s0
1960; GCN-NEXT:    v_mov_b32_e32 v1, s1
1961; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
1962; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
1963; GCN-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
1964; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
1965; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
1966; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v2, v3
1967; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1968; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
1969; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
1970; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
1971; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
1972; GCN-NEXT:    flat_store_dword v[0:1], v2
1973; GCN-NEXT:    s_endpgm
1974;
1975; GFX1030-LABEL: v_udiv_i24:
1976; GFX1030:       ; %bb.0:
1977; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1978; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1979; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX1030-NEXT:    s_clause 0x3
1981; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1982; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1983; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1984; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1985; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1986; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1987; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1988; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1989; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1990; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1991; GFX1030-NEXT:    v_or_b32_e32 v3, v4, v3
1992; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v1
1993; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
1994; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1995; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1996; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
1997; GFX1030-NEXT:    v_mul_lo_u32 v5, v5, v2
1998; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v5
1999; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v2, v5
2000; GFX1030-NEXT:    v_mul_hi_u32 v2, v3, v2
2001; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, v1
2002; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
2003; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2004; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v3, v1
2005; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2006; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
2007; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
2008; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
2009; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
2010; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
2011; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
2012; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
2013; GFX1030-NEXT:    s_endpgm
2014;
2015; EG-LABEL: v_udiv_i24:
2016; EG:       ; %bb.0:
2017; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
2018; EG-NEXT:    TEX 3 @6
2019; EG-NEXT:    ALU 23, @15, KC0[CB0:0-32], KC1[]
2020; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2021; EG-NEXT:    CF_END
2022; EG-NEXT:    PAD
2023; EG-NEXT:    Fetch clause starting at 6:
2024; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
2025; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
2026; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
2027; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
2028; EG-NEXT:    ALU clause starting at 14:
2029; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2030; EG-NEXT:    ALU clause starting at 15:
2031; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
2032; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2033; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
2034; EG-NEXT:     SUB_INT T1.W, 0.0, PV.W,
2035; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2036; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
2037; EG-NEXT:     LSHL T1.W, T3.X, literal.x,
2038; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2039; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2040; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2041; EG-NEXT:     OR_INT * T1.W, T2.X, PV.W,
2042; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2043; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2044; EG-NEXT:     SUB_INT * T1.W, T1.W, PS,
2045; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2046; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
2047; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
2048; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.W, PS,
2049; EG-NEXT:     CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
2050; EG-NEXT:     ADD_INT T3.W, PS, 1,
2051; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2052; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
2053; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2054; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %den_ptr addresses the i24 element immediately after the numerator.
2055  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
2056  %num = load i24, i24 addrspace(1) * %in
2057  %den = load i24, i24 addrspace(1) * %den_ptr
2058  %result = udiv i24 %num, %den
  ; Widen the i24 quotient to i32 before storing it to the i32 output pointer.
2059  %result.ext = zext i24 %result to i32
2060  store i32 %result.ext, i32 addrspace(1)* %out
2061  ret void
2062}
2063
; Divides every lane of a <4 x i32> loaded from %in by the constant 53668 and
; stores the resulting vector to %out.
;
; The check blocks below expect no runtime division sequence. Each of the four
; lanes is instead shifted right by 2, multiplied-high by 0x1389c755 (shown as
; 327796565 in the EG literals), and shifted right by 10. One such sequence is
; expected per lane, with a single 128-bit load and a single 128-bit store.
2064define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2065; SI-LABEL: scalarize_mulhu_4xi32:
2066; SI:       ; %bb.0:
2067; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2068; SI-NEXT:    s_mov_b32 s7, 0xf000
2069; SI-NEXT:    s_mov_b32 s6, -1
2070; SI-NEXT:    s_waitcnt lgkmcnt(0)
2071; SI-NEXT:    s_mov_b32 s4, s0
2072; SI-NEXT:    s_mov_b32 s5, s1
2073; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2074; SI-NEXT:    s_mov_b32 s0, 0x1389c755
2075; SI-NEXT:    s_mov_b32 s4, s2
2076; SI-NEXT:    s_mov_b32 s5, s3
2077; SI-NEXT:    s_waitcnt vmcnt(0)
2078; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2079; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2080; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2081; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2082; SI-NEXT:    v_mul_hi_u32 v0, v0, s0
2083; SI-NEXT:    v_mul_hi_u32 v1, v1, s0
2084; SI-NEXT:    v_mul_hi_u32 v2, v2, s0
2085; SI-NEXT:    v_mul_hi_u32 v3, v3, s0
2086; SI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2087; SI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2088; SI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2089; SI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2090; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2091; SI-NEXT:    s_endpgm
2092;
2093; VI-LABEL: scalarize_mulhu_4xi32:
2094; VI:       ; %bb.0:
2095; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2096; VI-NEXT:    s_mov_b32 s7, 0xf000
2097; VI-NEXT:    s_mov_b32 s6, -1
2098; VI-NEXT:    s_waitcnt lgkmcnt(0)
2099; VI-NEXT:    s_mov_b32 s4, s0
2100; VI-NEXT:    s_mov_b32 s5, s1
2101; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2102; VI-NEXT:    s_mov_b32 s0, 0x1389c755
2103; VI-NEXT:    s_mov_b32 s4, s2
2104; VI-NEXT:    s_mov_b32 s5, s3
2105; VI-NEXT:    s_waitcnt vmcnt(0)
2106; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2107; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2108; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2109; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2110; VI-NEXT:    v_mul_hi_u32 v0, v0, s0
2111; VI-NEXT:    v_mul_hi_u32 v1, v1, s0
2112; VI-NEXT:    v_mul_hi_u32 v2, v2, s0
2113; VI-NEXT:    v_mul_hi_u32 v3, v3, s0
2114; VI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2115; VI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2116; VI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2117; VI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2118; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2119; VI-NEXT:    s_endpgm
2120;
2121; GCN-LABEL: scalarize_mulhu_4xi32:
2122; GCN:       ; %bb.0:
2123; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2124; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2125; GCN-NEXT:    v_mov_b32_e32 v0, s0
2126; GCN-NEXT:    v_mov_b32_e32 v1, s1
2127; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2128; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2129; GCN-NEXT:    v_mov_b32_e32 v4, s2
2130; GCN-NEXT:    v_mov_b32_e32 v5, s3
2131; GCN-NEXT:    s_waitcnt vmcnt(0)
2132; GCN-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2133; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2134; GCN-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2135; GCN-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2136; GCN-NEXT:    v_mul_hi_u32 v0, v0, s0
2137; GCN-NEXT:    v_mul_hi_u32 v1, v1, s0
2138; GCN-NEXT:    v_mul_hi_u32 v2, v2, s0
2139; GCN-NEXT:    v_mul_hi_u32 v3, v3, s0
2140; GCN-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2141; GCN-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2142; GCN-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2143; GCN-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2144; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2145; GCN-NEXT:    s_endpgm
2146;
2147; GFX1030-LABEL: scalarize_mulhu_4xi32:
2148; GFX1030:       ; %bb.0:
2149; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2150; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
2151; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2152; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
2153; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2154; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2155; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2156; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2157; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2158; GFX1030-NEXT:    v_mul_hi_u32 v0, 0x1389c755, v0
2159; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x1389c755, v1
2160; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x1389c755, v2
2161; GFX1030-NEXT:    v_mul_hi_u32 v3, 0x1389c755, v3
2162; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2163; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2164; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2165; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2166; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
2167; GFX1030-NEXT:    s_endpgm
2168;
2169; EG-LABEL: scalarize_mulhu_4xi32:
2170; EG:       ; %bb.0:
2171; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2172; EG-NEXT:    TEX 0 @6
2173; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
2174; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2175; EG-NEXT:    CF_END
2176; EG-NEXT:    PAD
2177; EG-NEXT:    Fetch clause starting at 6:
2178; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2179; EG-NEXT:    ALU clause starting at 8:
2180; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2181; EG-NEXT:    ALU clause starting at 9:
2182; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
2183; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
2184; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2185; EG-NEXT:     MULHI * T0.Z, PV.W, literal.x,
2186; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2187; EG-NEXT:     LSHR T1.Z, T0.Y, literal.x,
2188; EG-NEXT:     LSHR T0.W, PS, literal.y,
2189; EG-NEXT:     MULHI * T0.Y, T1.W, literal.z,
2190; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
2191; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2192; EG-NEXT:     LSHR T0.Z, PS, literal.x,
2193; EG-NEXT:     LSHR T1.W, T0.X, literal.y,
2194; EG-NEXT:     MULHI * T0.X, PV.Z, literal.z,
2195; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2196; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2197; EG-NEXT:     LSHR T0.Y, PS, literal.x,
2198; EG-NEXT:     MULHI * T0.X, PV.W, literal.y,
2199; EG-NEXT:    10(1.401298e-44), 327796565(3.478022e-27)
2200; EG-NEXT:     LSHR T0.X, PS, literal.x,
2201; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
2202; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2203  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  ; The divisor is the same constant in all four lanes.
2204  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2205  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
2206  ret void
2207}
2208
; test_udiv2 - unsigned 32-bit division of a kernel argument by the constant 2.
; Every check block below expects exactly one logical shift right by 1
; (s_lshr_b32 on the amdgcn targets, LSHR on r600) and no division expansion.
2209define amdgpu_kernel void @test_udiv2(i32 %p) {
2210; SI-LABEL: test_udiv2:
2211; SI:       ; %bb.0:
2212; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2213; SI-NEXT:    s_mov_b32 s3, 0xf000
2214; SI-NEXT:    s_mov_b32 s2, -1
2215; SI-NEXT:    s_waitcnt lgkmcnt(0)
2216; SI-NEXT:    s_lshr_b32 s0, s0, 1
2217; SI-NEXT:    v_mov_b32_e32 v0, s0
2218; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2219; SI-NEXT:    s_waitcnt vmcnt(0)
2220; SI-NEXT:    s_endpgm
2221;
2222; VI-LABEL: test_udiv2:
2223; VI:       ; %bb.0:
2224; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2225; VI-NEXT:    s_mov_b32 s3, 0xf000
2226; VI-NEXT:    s_mov_b32 s2, -1
2227; VI-NEXT:    s_waitcnt lgkmcnt(0)
2228; VI-NEXT:    s_lshr_b32 s0, s0, 1
2229; VI-NEXT:    v_mov_b32_e32 v0, s0
2230; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2231; VI-NEXT:    s_waitcnt vmcnt(0)
2232; VI-NEXT:    s_endpgm
2233;
2234; GCN-LABEL: test_udiv2:
2235; GCN:       ; %bb.0:
2236; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2237; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2238; GCN-NEXT:    s_lshr_b32 s0, s0, 1
2239; GCN-NEXT:    v_mov_b32_e32 v0, s0
2240; GCN-NEXT:    flat_store_dword v[0:1], v0
2241; GCN-NEXT:    s_waitcnt vmcnt(0)
2242; GCN-NEXT:    s_endpgm
2243;
2244; GFX1030-LABEL: test_udiv2:
2245; GFX1030:       ; %bb.0:
2246; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2247; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2248; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2249; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2250; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2251; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2252; GFX1030-NEXT:    s_endpgm
2253;
2254; EG-LABEL: test_udiv2:
2255; EG:       ; %bb.0:
2256; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2257; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2258; EG-NEXT:    CF_END
2259; EG-NEXT:    PAD
2260; EG-NEXT:    ALU clause starting at 4:
2261; EG-NEXT:     MOV T0.X, literal.x,
2262; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
2263; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; The quotient is written with a volatile store to an undef address.
; Presumably the store exists only to keep %i live - TODO confirm; the checked
; store operands (for example v[0:1] in the flat/global stores) are never
; initialised by the checked code, which matches the undef pointer.
2264  %i = udiv i32 %p, 2
2265  store volatile i32 %i, i32 addrspace(1)* undef
2266  ret void
2267}
2268
; test_udiv_3_mulhu - unsigned 32-bit division of a kernel argument by the
; constant 3. The check blocks expect a multiply-high by 0xaaaaaaab followed by
; a logical shift right by 1, i.e. (p * 0xaaaaaaab) >> 33, with
; 0xaaaaaaab == ceil(2^33 / 3).
; In the checks below the SI, VI and GCN blocks use the vector v_mul_hi_u32
; (constant first moved into v0), the GFX1030 block uses the scalar
; s_mul_hi_u32 with the constant as an immediate, and the r600 block uses MULHI
; with the same bit pattern printed as the signed literal -1431655765.
2269define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2270; SI-LABEL: test_udiv_3_mulhu:
2271; SI:       ; %bb.0:
2272; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2273; SI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2274; SI-NEXT:    s_mov_b32 s3, 0xf000
2275; SI-NEXT:    s_mov_b32 s2, -1
2276; SI-NEXT:    s_waitcnt lgkmcnt(0)
2277; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
2278; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2279; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2280; SI-NEXT:    s_waitcnt vmcnt(0)
2281; SI-NEXT:    s_endpgm
2282;
2283; VI-LABEL: test_udiv_3_mulhu:
2284; VI:       ; %bb.0:
2285; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2286; VI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2287; VI-NEXT:    s_mov_b32 s3, 0xf000
2288; VI-NEXT:    s_mov_b32 s2, -1
2289; VI-NEXT:    s_waitcnt lgkmcnt(0)
2290; VI-NEXT:    v_mul_hi_u32 v0, s0, v0
2291; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2292; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2293; VI-NEXT:    s_waitcnt vmcnt(0)
2294; VI-NEXT:    s_endpgm
2295;
2296; GCN-LABEL: test_udiv_3_mulhu:
2297; GCN:       ; %bb.0:
2298; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
2299; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2300; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2301; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
2302; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2303; GCN-NEXT:    flat_store_dword v[0:1], v0
2304; GCN-NEXT:    s_waitcnt vmcnt(0)
2305; GCN-NEXT:    s_endpgm
2306;
2307; GFX1030-LABEL: test_udiv_3_mulhu:
2308; GFX1030:       ; %bb.0:
2309; GFX1030-NEXT:    s_load_dword s0, s[4:5], 0x0
2310; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX1030-NEXT:    s_mul_hi_u32 s0, s0, 0xaaaaaaab
2312; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2313; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2314; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2315; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2316; GFX1030-NEXT:    s_endpgm
2317;
2318; EG-LABEL: test_udiv_3_mulhu:
2319; EG:       ; %bb.0:
2320; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2321; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2322; EG-NEXT:    CF_END
2323; EG-NEXT:    PAD
2324; EG-NEXT:    ALU clause starting at 4:
2325; EG-NEXT:     MULHI * T0.X, KC0[2].Y, literal.x,
2326; EG-NEXT:    -1431655765(-3.031649e-13), 0(0.000000e+00)
2327; EG-NEXT:     LSHR T0.X, PS, 1,
2328; EG-NEXT:     MOV * T1.X, literal.x,
2329; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; The quotient is written with a volatile store to an undef address.
; Presumably the store exists only to keep %i live - TODO confirm.
2330   %i = udiv i32 %p, 3
2331   store volatile i32 %i, i32 addrspace(1)* undef
2332   ret void
2333}
2334
; fdiv_test_denormals - signed division of two i8 values that are sign-extended
; to i32 before the sdiv; the quotient is truncated back to i8 and stored.
; All check blocks expect a floating-point based expansion rather than an
; integer division sequence - convert both operands to f32, multiply the
; dividend by the reciprocal of the divisor, truncate, then apply a +/-1
; correction that is selected by comparing |remainder| against |divisor|.
; In the checks below the remainder step is v_mad_f32 for the SI, VI and GCN
; blocks, v_fma_f32 for the GFX1030 block and MULADD_IEEE for the r600 block.
; Presumably the test guards how that multiply-add is chosen under the
; different f32 denormal modes passed on the llc command lines - TODO confirm.
2335define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
2336; SI-LABEL: fdiv_test_denormals:
2337; SI:       ; %bb.0: ; %bb
2338; SI-NEXT:    s_mov_b32 s0, 0
2339; SI-NEXT:    s_mov_b32 s3, 0xf000
2340; SI-NEXT:    s_mov_b32 s2, -1
2341; SI-NEXT:    s_mov_b32 s1, s0
2342; SI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2343; SI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2344; SI-NEXT:    s_waitcnt vmcnt(1)
2345; SI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2346; SI-NEXT:    s_waitcnt vmcnt(0)
2347; SI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2348; SI-NEXT:    v_xor_b32_e32 v0, v1, v0
2349; SI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2350; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2351; SI-NEXT:    v_or_b32_e32 v0, 1, v0
2352; SI-NEXT:    v_mul_f32_e32 v1, v3, v4
2353; SI-NEXT:    v_trunc_f32_e32 v1, v1
2354; SI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2355; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2356; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2357; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2358; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
2359; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2360; SI-NEXT:    s_endpgm
2361;
2362; VI-LABEL: fdiv_test_denormals:
2363; VI:       ; %bb.0: ; %bb
2364; VI-NEXT:    s_mov_b32 s0, 0
2365; VI-NEXT:    s_mov_b32 s3, 0xf000
2366; VI-NEXT:    s_mov_b32 s2, -1
2367; VI-NEXT:    s_mov_b32 s1, s0
2368; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2369; VI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2370; VI-NEXT:    s_waitcnt vmcnt(1)
2371; VI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2372; VI-NEXT:    s_waitcnt vmcnt(0)
2373; VI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2374; VI-NEXT:    v_xor_b32_e32 v0, v1, v0
2375; VI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2376; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2377; VI-NEXT:    v_or_b32_e32 v0, 1, v0
2378; VI-NEXT:    v_mul_f32_e32 v1, v3, v4
2379; VI-NEXT:    v_trunc_f32_e32 v1, v1
2380; VI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2381; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2382; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2383; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2384; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2385; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2386; VI-NEXT:    s_endpgm
2387;
2388; GCN-LABEL: fdiv_test_denormals:
2389; GCN:       ; %bb.0: ; %bb
2390; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
2391; GCN-NEXT:    v_mov_b32_e32 v0, 0
2392; GCN-NEXT:    v_mov_b32_e32 v1, 0
2393; GCN-NEXT:    flat_load_sbyte v3, v[0:1]
2394; GCN-NEXT:    s_waitcnt vmcnt(1)
2395; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v2
2396; GCN-NEXT:    s_waitcnt vmcnt(0)
2397; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v3
2398; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2399; GCN-NEXT:    v_xor_b32_e32 v2, v3, v2
2400; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2401; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
2402; GCN-NEXT:    v_mul_f32_e32 v3, v5, v6
2403; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2404; GCN-NEXT:    v_mad_f32 v5, -v3, v4, v5
2405; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2406; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2407; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
2408; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
2409; GCN-NEXT:    flat_store_byte v[0:1], v2
2410; GCN-NEXT:    s_endpgm
2411;
2412; GFX1030-LABEL: fdiv_test_denormals:
2413; GFX1030:       ; %bb.0: ; %bb
2414; GFX1030-NEXT:    global_load_sbyte v2, v[0:1], off
2415; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
2416; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
2417; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
2418; GFX1030-NEXT:    s_waitcnt vmcnt(1)
2419; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
2420; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v4
2421; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2422; GFX1030-NEXT:    v_cvt_f32_i32_e32 v6, v3
2423; GFX1030-NEXT:    v_xor_b32_e32 v2, v3, v2
2424; GFX1030-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2425; GFX1030-NEXT:    v_mul_f32_e32 v5, v6, v5
2426; GFX1030-NEXT:    v_or_b32_e32 v2, 1, v2
2427; GFX1030-NEXT:    v_trunc_f32_e32 v3, v5
2428; GFX1030-NEXT:    v_fma_f32 v5, -v3, v4, v6
2429; GFX1030-NEXT:    v_cvt_i32_f32_e32 v3, v3
2430; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2431; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2432; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v3, v2
2433; GFX1030-NEXT:    global_store_byte v[0:1], v2, off
2434; GFX1030-NEXT:    s_endpgm
2435;
2436; EG-LABEL: fdiv_test_denormals:
2437; EG:       ; %bb.0: ; %bb
2438; EG-NEXT:    TEX 0 @6
2439; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
2440; EG-NEXT:    TEX 0 @8
2441; EG-NEXT:    ALU 25, @11, KC0[], KC1[]
2442; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2443; EG-NEXT:    CF_END
2444; EG-NEXT:    Fetch clause starting at 6:
2445; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
2446; EG-NEXT:    Fetch clause starting at 8:
2447; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2448; EG-NEXT:    ALU clause starting at 10:
2449; EG-NEXT:     MOV * T1.X, 0.0,
2450; EG-NEXT:    ALU clause starting at 11:
2451; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
2452; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2453; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
2454; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x,
2455; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
2456; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2457; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
2458; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
2459; EG-NEXT:     TRUNC T2.W, PV.W,
2460; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
2461; EG-NEXT:     ASHR T0.W, PS, literal.x,
2462; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2463; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
2464; EG-NEXT:     TRUNC T0.Z, T2.W,
2465; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
2466; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
2467; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
2468; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
2469; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
2470; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
2471; EG-NEXT:     MOV * T0.W, literal.x,
2472; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2473; EG-NEXT:     MOV T0.Y, 0.0,
2474; EG-NEXT:     MOV * T0.Z, 0.0,
2475; EG-NEXT:     MOV * T1.X, literal.x,
2476; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; The dividend is loaded from the null address-space(1) pointer and the divisor
; from %arg offset by an undef index; the truncated quotient is stored back to
; the null pointer. The addresses look like placeholders from a reduced test
; case - presumably only the arithmetic expansion matters here; TODO confirm.
2477bb:
2478  %tmp = load i8, i8 addrspace(1)* null, align 1
2479  %tmp1 = sext i8 %tmp to i32
2480  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
2481  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
2482  %tmp4 = sext i8 %tmp3 to i32
2483  %tmp5 = sdiv i32 %tmp1, %tmp4
2484  %tmp6 = trunc i32 %tmp5 to i8
2485  store i8 %tmp6, i8 addrspace(1)* null, align 1
2486  ret void
2487}
2488
; v_test_udiv64_mulhi_fold - 64-bit unsigned division of a function argument by
; the constant 100000. This is a plain function (no amdgpu_kernel attribute);
; the amdgcn check blocks end with s_setpc_b64 s[30:31] and the quotient is left
; in v0/v1.
; Constants that appear in the checks:
;   0x186a0    = 100000
;   0x1869f    = 99999 (used in the "remainder > 99999" compares)
;   0xfffe7960 = -100000 as a 32-bit two's complement value
;   0x47c35000 = 100000.0 as an f32 bit pattern
; In the checks below the SI, VI and GCN blocks build the reciprocal at run time
; starting from v_rcp_f32 and refine it with 32-bit multiplies and carry chains,
; whereas the GFX1030 block contains no v_rcp_f32 and instead starts from scalar
; immediates (s_mov_b32 / s_add_u32 / s_addc_u32). The r600 block only checks
; CF_END and PAD. Presumably the test guards the mul-hi folding named in the
; function title - TODO confirm against the commit that introduced it.
2489define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2490; SI-LABEL: v_test_udiv64_mulhi_fold:
2491; SI:       ; %bb.0:
2492; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2493; SI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2494; SI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2495; SI-NEXT:    v_rcp_f32_e32 v2, v2
2496; SI-NEXT:    s_mov_b32 s4, 0xfffe7960
2497; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2498; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2499; SI-NEXT:    v_trunc_f32_e32 v3, v3
2500; SI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2501; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
2502; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
2503; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2504; SI-NEXT:    v_mul_lo_u32 v6, v3, s4
2505; SI-NEXT:    v_mul_lo_u32 v5, v2, s4
2506; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2507; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2508; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
2509; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
2510; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2511; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
2512; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2513; SI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
2514; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2515; SI-NEXT:    v_mul_lo_u32 v8, v3, v5
2516; SI-NEXT:    v_mul_hi_u32 v5, v3, v5
2517; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
2518; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
2519; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
2520; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2521; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2522; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2523; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2524; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
2525; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
2526; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2527; SI-NEXT:    s_mov_b32 s4, 0x186a0
2528; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
2529; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
2530; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
2531; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
2532; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
2533; SI-NEXT:    v_mul_hi_u32 v9, v3, v4
2534; SI-NEXT:    v_mul_lo_u32 v4, v3, v4
2535; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
2536; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
2537; SI-NEXT:    v_mul_lo_u32 v8, v3, v6
2538; SI-NEXT:    v_mul_hi_u32 v6, v3, v6
2539; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
2540; SI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
2541; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
2542; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2543; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2544; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
2545; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2546; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
2547; SI-NEXT:    v_mul_hi_u32 v5, v0, v2
2548; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
2549; SI-NEXT:    v_mul_hi_u32 v7, v1, v3
2550; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
2551; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2552; SI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
2553; SI-NEXT:    v_mul_lo_u32 v6, v1, v2
2554; SI-NEXT:    v_mul_hi_u32 v2, v1, v2
2555; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
2556; SI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
2557; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
2558; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2559; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2560; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
2561; SI-NEXT:    v_mul_hi_u32 v5, v2, s4
2562; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
2563; SI-NEXT:    s_mov_b32 s4, 0x1869f
2564; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2565; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
2566; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
2567; SI-NEXT:    v_subrev_i32_e32 v4, vcc, 0x186a0, v0
2568; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
2569; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
2570; SI-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
2571; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2572; SI-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
2573; SI-NEXT:    v_add_i32_e32 v5, vcc, 2, v2
2574; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
2575; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
2576; SI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2577; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
2578; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2579; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2580; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
2581; SI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2582; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
2583; SI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2584; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2585; SI-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
2586; SI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
2587; SI-NEXT:    s_setpc_b64 s[30:31]
2588;
2589; VI-LABEL: v_test_udiv64_mulhi_fold:
2590; VI:       ; %bb.0:
2591; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2592; VI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2593; VI-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2594; VI-NEXT:    v_rcp_f32_e32 v2, v2
2595; VI-NEXT:    s_mov_b32 s6, 0xfffe7960
2596; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2597; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2598; VI-NEXT:    v_trunc_f32_e32 v3, v3
2599; VI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2600; VI-NEXT:    v_cvt_u32_f32_e32 v6, v2
2601; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
2602; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2603; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2604; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
2605; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2606; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
2607; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2608; VI-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
2609; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2610; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2611; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2612; VI-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
2613; VI-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2614; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2615; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2616; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2617; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2618; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2619; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2620; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
2621; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2622; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2623; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2624; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
2625; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2626; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2627; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
2628; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2629; VI-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2630; VI-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
2631; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2632; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2633; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2634; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2635; VI-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2636; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2637; VI-NEXT:    v_mul_hi_u32 v6, v0, v4
2638; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2639; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2640; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2641; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2642; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2643; VI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2644; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2645; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2646; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2647; VI-NEXT:    s_mov_b32 s4, 0x186a0
2648; VI-NEXT:    v_mul_lo_u32 v6, v5, s4
2649; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2650; VI-NEXT:    s_mov_b32 s4, 0x1869f
2651; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2652; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2653; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2654; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2655; VI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2656; VI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2657; VI-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2658; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2659; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2660; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2661; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2662; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2663; VI-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2664; VI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2665; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2666; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2667; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2668; VI-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2669; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2670; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2671; VI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2672; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2673; VI-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2674; VI-NEXT:    s_setpc_b64 s[30:31]
2675;
2676; GCN-LABEL: v_test_udiv64_mulhi_fold:
2677; GCN:       ; %bb.0:
2678; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
2680; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x47c35000
2681; GCN-NEXT:    v_rcp_f32_e32 v2, v2
2682; GCN-NEXT:    s_mov_b32 s6, 0xfffe7960
2683; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
2684; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
2685; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2686; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
2687; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v2
2688; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
2689; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2690; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2691; GCN-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
2692; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
2693; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
2694; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
2695; GCN-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
2696; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2697; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
2698; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0
2699; GCN-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
2700; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v10, v3, vcc
2701; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2702; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2703; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2704; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2705; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
2706; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
2707; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
2708; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
2709; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
2710; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2711; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2
2712; GCN-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
2713; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0
2714; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
2715; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0
2716; GCN-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
2717; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v3, vcc
2718; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2719; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2720; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2721; GCN-NEXT:    v_add_u32_e32 v4, vcc, v6, v2
2722; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v3, vcc
2723; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0
2724; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
2725; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
2726; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
2727; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0
2728; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0
2729; GCN-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
2730; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v3, vcc
2731; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2732; GCN-NEXT:    v_add_u32_e32 v4, vcc, v2, v4
2733; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
2734; GCN-NEXT:    s_mov_b32 s4, 0x186a0
2735; GCN-NEXT:    v_mul_lo_u32 v6, v5, s4
2736; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2737; GCN-NEXT:    s_mov_b32 s4, 0x1869f
2738; GCN-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
2739; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
2740; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2741; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2742; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2743; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
2744; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
2745; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2746; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
2747; GCN-NEXT:    v_add_u32_e32 v3, vcc, 2, v4
2748; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
2749; GCN-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
2750; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
2751; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
2752; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2753; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
2754; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
2755; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2756; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
2757; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
2758; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
2759; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2760; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2761; GCN-NEXT:    s_setpc_b64 s[30:31]
2762;
2763; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2764; GFX1030:       ; %bb.0:
2765; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2766; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2767; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
2768; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
2769; GFX1030-NEXT:    s_addc_u32 s5, 0, 0
2770; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
2771; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
2772; GFX1030-NEXT:    s_addc_u32 s5, s5, 0xa7c5
2773; GFX1030-NEXT:    v_readfirstlane_b32 s4, v2
2774; GFX1030-NEXT:    s_mul_i32 s6, s5, 0xfffe7960
2775; GFX1030-NEXT:    s_mul_hi_u32 s7, s4, 0xfffe7960
2776; GFX1030-NEXT:    s_mul_i32 s8, s4, 0xfffe7960
2777; GFX1030-NEXT:    s_sub_i32 s7, s7, s4
2778; GFX1030-NEXT:    s_mul_hi_u32 s9, s4, s8
2779; GFX1030-NEXT:    s_add_i32 s7, s7, s6
2780; GFX1030-NEXT:    s_mul_hi_u32 s10, s5, s8
2781; GFX1030-NEXT:    s_mul_i32 s6, s5, s8
2782; GFX1030-NEXT:    s_mul_hi_u32 s8, s4, s7
2783; GFX1030-NEXT:    s_mul_i32 s4, s4, s7
2784; GFX1030-NEXT:    s_mul_hi_u32 s11, s5, s7
2785; GFX1030-NEXT:    s_add_u32 s4, s9, s4
2786; GFX1030-NEXT:    s_addc_u32 s8, 0, s8
2787; GFX1030-NEXT:    s_add_u32 s4, s4, s6
2788; GFX1030-NEXT:    s_mul_i32 s7, s5, s7
2789; GFX1030-NEXT:    s_addc_u32 s4, s8, s10
2790; GFX1030-NEXT:    s_addc_u32 s6, s11, 0
2791; GFX1030-NEXT:    s_add_u32 s4, s4, s7
2792; GFX1030-NEXT:    s_addc_u32 s6, 0, s6
2793; GFX1030-NEXT:    v_add_co_u32 v4, s4, v2, s4
2794; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
2795; GFX1030-NEXT:    s_addc_u32 s4, s5, s6
2796; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v4
2797; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, s4, 0
2798; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, v4, 0
2799; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v1, s4, 0
2800; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
2801; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2802; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2803; GFX1030-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
2804; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
2805; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v6
2806; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
2807; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x186a0, v5, 0
2808; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, 0x186a0, v6, v[3:4]
2809; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
2810; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2811; GFX1030-NEXT:    v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
2812; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
2813; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
2814; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
2815; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
2816; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, 2
2817; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
2818; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
2819; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2820; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2821; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4
2822; GFX1030-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
2823; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v5, 1
2824; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
2825; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
2826; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
2827; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc_lo
2828; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2829; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
2830; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
2831; GFX1030-NEXT:    s_setpc_b64 s[30:31]
2832;
2833; EG-LABEL: v_test_udiv64_mulhi_fold:
2834; EG:       ; %bb.0:
2835; EG-NEXT:    CF_END
2836; EG-NEXT:    PAD
; The whole IR body is a single constant-divisor udiv whose result is returned
; directly; everything checked above is the backend's expansion of this line.
2837  %d = udiv i64 %arg, 100000
2838  ret i64 %d
2839}
2840