1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3
4; 64-bit divides and rems should be split into a fast and slow path
5; where the fast path uses a 32-bit operation.
6
; sdiv64: signed i64 division (%a / %b).
; The expected gfx900 code ORs v1 with v3 (the registers the slow path reads as
; the operands' high words), compares the result against zero and splits the
; lanes with s_and_saveexec_b64:
;   * %bb.1 (some high bit set): full 64-bit expansion. Operand signs are
;     stripped with ashr/add/xor, and the quotient sign (v4 ^ v9) is reapplied
;     with xor/sub at the end.
;   * %bb.3 (both high words zero): 32-bit unsigned divide of v0 by v2 built on
;     v_rcp_iflag_f32, with the high result word (v5) set to 0.
; Both paths leave the quotient in v[4:5], which is copied to v[0:1] on return.
7define i64 @sdiv64(i64 %a, i64 %b) {
8; GFX9-LABEL: sdiv64:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
12; GFX9-NEXT:    v_mov_b32_e32 v4, 0
13; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
14; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
15; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
16; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
17; GFX9-NEXT:    s_cbranch_execz .LBB0_2
18; GFX9-NEXT:  ; %bb.1:
19; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
20; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
21; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
22; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
23; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
24; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
25; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
26; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
27; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
28; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
29; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
30; GFX9-NEXT:    v_mov_b32_e32 v14, 0
31; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
32; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
33; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
34; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
35; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
36; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
37; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
38; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
39; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
40; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
41; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
42; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
43; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
44; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
45; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
46; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
47; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
48; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
49; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
50; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
51; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
52; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
53; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
54; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
55; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
56; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
57; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
58; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
59; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
60; GFX9-NEXT:    v_mul_hi_u32 v15, v13, v2
61; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
62; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v5
63; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
64; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
65; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
66; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
67; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
68; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
69; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
70; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
71; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
72; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
73; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v4
74; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v4, vcc
75; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
76; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
77; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v4
78; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
79; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
80; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
81; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
82; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
83; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
84; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
85; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
86; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
87; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v2
88; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
89; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
90; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
91; GFX9-NEXT:    v_sub_u32_e32 v7, v5, v1
92; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
93; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v7, v10, vcc
94; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v0, v11
95; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
96; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
97; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
98; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v11
99; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
100; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v10
101; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[4:5]
102; GFX9-NEXT:    v_add_co_u32_e64 v7, s[4:5], 2, v2
103; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
104; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[4:5], 0, v3, s[4:5]
105; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
106; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 1, v2
107; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
108; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
109; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v3, s[4:5]
110; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
111; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
112; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
113; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
114; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
115; GFX9-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s[4:5]
116; GFX9-NEXT:    v_cndmask_b32_e64 v6, v13, v8, s[4:5]
117; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
118; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v9
119; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
120; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
121; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
122; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v1, v2
123; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
124; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
125; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
126; GFX9-NEXT:  .LBB0_2: ; %Flow
127; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
128; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
129; GFX9-NEXT:    s_cbranch_execz .LBB0_4
130; GFX9-NEXT:  ; %bb.3:
131; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
132; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
133; GFX9-NEXT:    v_mov_b32_e32 v5, 0
134; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
135; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
136; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
137; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
138; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
139; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
140; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
141; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
142; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
143; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
144; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
145; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
146; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
147; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
148; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
149; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
150; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
151; GFX9-NEXT:  .LBB0_4:
152; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
153; GFX9-NEXT:    v_mov_b32_e32 v0, v4
154; GFX9-NEXT:    v_mov_b32_e32 v1, v5
155; GFX9-NEXT:    s_setpc_b64 s[30:31]
156  %d = sdiv i64 %a, %b
157  ret i64 %d
158}
159
; udiv64: unsigned i64 division (%a / %b).
; Same two-path shape as the signed case, without any sign handling:
;   * %bb.0 ORs v1 with v3, compares against zero and splits the lanes with
;     s_and_saveexec_b64.
;   * %bb.1 (some high bit set): full 64-bit expansion operating directly on
;     v[0:1] and v[2:3].
;   * %bb.3 (both high words zero): 32-bit unsigned divide of v0 by v2 built on
;     v_rcp_iflag_f32, with the high result word (v5) set to 0.
; Both paths leave the quotient in v[4:5], which is copied to v[0:1] on return.
160define i64 @udiv64(i64 %a, i64 %b) {
161; GFX9-LABEL: udiv64:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
165; GFX9-NEXT:    v_mov_b32_e32 v4, 0
166; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
167; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
168; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
169; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
170; GFX9-NEXT:    s_cbranch_execz .LBB1_2
171; GFX9-NEXT:  ; %bb.1:
172; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
173; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
174; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
175; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
176; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
177; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
178; GFX9-NEXT:    v_mov_b32_e32 v13, 0
179; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
180; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
181; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
182; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
183; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
184; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
185; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
186; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
187; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
188; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
189; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
190; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
191; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
192; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
193; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
194; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
195; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
196; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
197; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
198; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
199; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
200; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
201; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
202; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
203; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
204; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
205; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
206; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
207; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
208; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
209; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
210; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
211; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
212; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
213; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
214; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
215; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
216; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
217; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
218; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
219; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
220; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
221; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
222; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
223; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
224; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
225; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
226; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
227; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
228; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
229; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
230; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
231; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
232; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
233; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
234; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
235; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
236; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v8, v3, vcc
237; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v2
238; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
239; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
240; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
241; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
242; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
243; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v3
244; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v8, s[4:5]
245; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v6
246; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
247; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5]
248; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
249; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v6
250; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
251; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
252; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5]
253; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
254; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
255; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
256; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
257; GFX9-NEXT:    v_cndmask_b32_e64 v4, v11, v9, s[4:5]
258; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
259; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
260; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
261; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v0, vcc
262; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
263; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
264; GFX9-NEXT:  .LBB1_2: ; %Flow
265; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
266; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
267; GFX9-NEXT:    s_cbranch_execz .LBB1_4
268; GFX9-NEXT:  ; %bb.3:
269; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
270; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
271; GFX9-NEXT:    v_mov_b32_e32 v5, 0
272; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
273; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
274; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
275; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
276; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
277; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
278; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
279; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
280; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
281; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
282; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
283; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
284; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
285; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
286; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
287; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
288; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
289; GFX9-NEXT:  .LBB1_4:
290; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
291; GFX9-NEXT:    v_mov_b32_e32 v0, v4
292; GFX9-NEXT:    v_mov_b32_e32 v1, v5
293; GFX9-NEXT:    s_setpc_b64 s[30:31]
294  %d = udiv i64 %a, %b
295  ret i64 %d
296}
297
; srem64: signed i64 remainder (%a % %b).
; Two-path expansion selected by whether v1 | v3 is non-zero:
;   * %bb.1 (some high bit set): full 64-bit expansion. Both operands are made
;     non-negative with ashr/add/xor; the final xor/sub uses v5, the mask
;     derived from v1 (the dividend's high word), so the result takes the
;     dividend's sign.
;   * %bb.3 (both high words zero): 32-bit unsigned remainder of v0 by v2
;     (reciprocal estimate, multiply-subtract, then two conditional
;     subtractions of v2), with the high result word (v5) set to 0.
; Both paths leave the remainder in v[4:5], copied to v[0:1] on return.
298define i64 @srem64(i64 %a, i64 %b) {
299; GFX9-LABEL: srem64:
300; GFX9:       ; %bb.0:
301; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
303; GFX9-NEXT:    v_mov_b32_e32 v4, 0
304; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
305; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
306; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
307; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
308; GFX9-NEXT:    s_cbranch_execz .LBB2_2
309; GFX9-NEXT:  ; %bb.1:
310; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
311; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
312; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
313; GFX9-NEXT:    v_xor_b32_e32 v9, v3, v4
314; GFX9-NEXT:    v_xor_b32_e32 v10, v2, v4
315; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v10
316; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v9
317; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v10
318; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
319; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
320; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
321; GFX9-NEXT:    v_mov_b32_e32 v13, 0
322; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
323; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
324; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
325; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
326; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
327; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v3
328; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
329; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
330; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v11
331; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v2
332; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
333; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
334; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v3
335; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
336; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
337; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
338; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
339; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
340; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v13, vcc
341; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
342; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
343; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v6, v2
344; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
345; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v11
346; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v12
347; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
348; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
349; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
350; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
351; GFX9-NEXT:    v_mul_hi_u32 v14, v12, v2
352; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
353; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
354; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
355; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
356; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
357; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v13, vcc
358; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
359; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
360; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
361; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
362; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
363; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
364; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v5
365; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
366; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
367; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
368; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v5
369; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
370; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
371; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
372; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
373; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
374; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
375; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v13, vcc
376; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
377; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
378; GFX9-NEXT:    v_mul_lo_u32 v2, v9, v0
379; GFX9-NEXT:    v_mul_lo_u32 v3, v10, v1
380; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v10, v0, 0
381; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v2
382; GFX9-NEXT:    v_sub_u32_e32 v2, v4, v1
383; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
384; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, vcc
385; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v0, v10
386; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[6:7], 0, v2, s[4:5]
387; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v9
388; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
389; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v3, v10
390; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
391; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v9
392; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, s[4:5]
393; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
394; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v3, v10
395; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
396; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
397; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v9
398; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
399; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
400; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
401; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
402; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
403; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v9
404; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
405; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
406; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
407; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v8, s[4:5]
408; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
409; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
410; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
411; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v5
412; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
413; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
414; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
415; GFX9-NEXT:  .LBB2_2: ; %Flow
416; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
417; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
418; GFX9-NEXT:    s_cbranch_execz .LBB2_4
419; GFX9-NEXT:  ; %bb.3:
420; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
421; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
422; GFX9-NEXT:    v_mov_b32_e32 v5, 0
423; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
424; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
425; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
426; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
427; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
428; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
429; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
430; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
431; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
432; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
433; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
434; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
435; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
436; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
437; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
438; GFX9-NEXT:  .LBB2_4:
439; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
440; GFX9-NEXT:    v_mov_b32_e32 v0, v4
441; GFX9-NEXT:    v_mov_b32_e32 v1, v5
442; GFX9-NEXT:    s_setpc_b64 s[30:31]
443  %d = srem i64 %a, %b
444  ret i64 %d
445}
446
; urem64: unsigned i64 remainder (%a % %b).
; Two-path expansion selected by whether v1 | v3 is non-zero:
;   * %bb.1 (some high bit set): full 64-bit expansion operating directly on
;     v[0:1] and v[2:3], with no sign handling.
;   * %bb.3 (both high words zero): 32-bit unsigned remainder of v0 by v2
;     (reciprocal estimate, multiply-subtract, then two conditional
;     subtractions of v2), with the high result word (v5) set to 0.
; Both paths leave the remainder in v[4:5], copied to v[0:1] on return.
447define i64 @urem64(i64 %a, i64 %b) {
448; GFX9-LABEL: urem64:
449; GFX9:       ; %bb.0:
450; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
452; GFX9-NEXT:    v_mov_b32_e32 v4, 0
453; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
454; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
455; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
456; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
457; GFX9-NEXT:    s_cbranch_execz .LBB3_2
458; GFX9-NEXT:  ; %bb.1:
459; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
460; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
461; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
462; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
463; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
464; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
465; GFX9-NEXT:    v_mov_b32_e32 v13, 0
466; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
467; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
468; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
469; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
470; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
471; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
472; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
473; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
474; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
475; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
476; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
477; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
478; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
479; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
480; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
481; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
482; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
483; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
484; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
485; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
486; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
487; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
488; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
489; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
490; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
491; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
492; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
493; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
494; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
495; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
496; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
497; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
498; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
499; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
500; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
501; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
502; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
503; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
504; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
505; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
506; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
507; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
508; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
509; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
510; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
511; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
512; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
513; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
514; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
515; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
516; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
517; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v4
518; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v5
519; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v4, 0
520; GFX9-NEXT:    v_add3_u32 v5, v5, v7, v6
521; GFX9-NEXT:    v_sub_u32_e32 v6, v1, v5
522; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
523; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
524; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v0, v2
525; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
526; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
527; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
528; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
529; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
530; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
531; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
532; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
533; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
534; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
535; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v6, v2
536; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
537; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
538; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
539; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
540; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
541; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
542; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
543; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
544; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
545; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
546; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[4:5]
547; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
548; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
549; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
550; GFX9-NEXT:  .LBB3_2: ; %Flow
551; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
552; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
553; GFX9-NEXT:    s_cbranch_execz .LBB3_4
554; GFX9-NEXT:  ; %bb.3:
555; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
556; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
557; GFX9-NEXT:    v_mov_b32_e32 v5, 0
558; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
559; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
560; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
561; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
562; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
563; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
564; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
565; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
566; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
567; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
568; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
569; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
570; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
571; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
572; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
573; GFX9-NEXT:  .LBB3_4:
574; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
575; GFX9-NEXT:    v_mov_b32_e32 v0, v4
576; GFX9-NEXT:    v_mov_b32_e32 v1, v5
577; GFX9-NEXT:    s_setpc_b64 s[30:31]
578  %d = urem i64 %a, %b
579  ret i64 %d
580}
581
; sdiv32: signed i32 division (%a / %b).
; Baseline for the 64-bit tests: the expected code is a single straight-line
; block with no exec-mask branching. Both operands are made non-negative with
; ashr/add/xor, the unsigned quotient comes from a v_rcp_iflag_f32 reciprocal
; estimate followed by two compare/select correction steps, and the sign mask
; (v5 ^ v2, i.e. sign of v0 xor sign of v1) is reapplied with xor/sub.
582define i32 @sdiv32(i32 %a, i32 %b) {
583; GFX9-LABEL: sdiv32:
584; GFX9:       ; %bb.0:
585; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
587; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
588; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
589; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
590; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v1
591; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
592; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
593; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
594; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
595; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
596; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
597; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
598; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
599; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
600; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
601; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
602; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
603; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
604; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
605; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
606; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
607; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
608; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
609; GFX9-NEXT:    v_add_u32_e32 v4, 1, v3
610; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
611; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
612; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
613; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
614; GFX9-NEXT:    s_setpc_b64 s[30:31]
615  %d = sdiv i32 %a, %b
616  ret i32 %d
617}
618
; udiv32: unsigned i32 division (%a / %b).
; Baseline for the 64-bit tests: a single straight-line block with no
; exec-mask branching. The quotient estimate comes from v_rcp_iflag_f32 and is
; followed by two compare/select correction steps; the result is left in v0.
619define i32 @udiv32(i32 %a, i32 %b) {
620; GFX9-LABEL: udiv32:
621; GFX9:       ; %bb.0:
622; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
623; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
624; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
625; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
626; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
627; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
628; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
629; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
630; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
631; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
632; GFX9-NEXT:    v_mul_lo_u32 v3, v2, v1
633; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
634; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
635; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
636; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v1
637; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
638; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
639; GFX9-NEXT:    v_add_u32_e32 v3, 1, v2
640; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
641; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
642; GFX9-NEXT:    s_setpc_b64 s[30:31]
643  %d = udiv i32 %a, %b
644  ret i32 %d
645}
646
; srem32: signed i32 remainder (%a % %b).
; Baseline for the 64-bit tests: a single straight-line block with no
; exec-mask branching. Both operands are made non-negative with ashr/add/xor,
; the unsigned remainder is computed via a v_rcp_iflag_f32 reciprocal estimate
; plus two conditional subtractions of the divisor, and the final xor/sub uses
; v4 (the mask derived from v0), so the result takes the dividend's sign.
647define i32 @srem32(i32 %a, i32 %b) {
648; GFX9-LABEL: srem32:
649; GFX9:       ; %bb.0:
650; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
652; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
653; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
654; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
655; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
656; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
657; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
658; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
659; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
660; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
661; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
662; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
663; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
664; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
665; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
666; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
667; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
668; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
669; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
670; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
671; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
672; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
673; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
674; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
675; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
676; GFX9-NEXT:    s_setpc_b64 s[30:31]
677  %d = srem i32 %a, %b
678  ret i32 %d
679}
680
; urem32: unsigned 32-bit remainder of %a by %b.
; The GFX9 check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing by hand.
; What the checks pin down: a single straight-line block (%bb.0 only, no
; s_cbranch) built from a v_rcp_iflag_f32 reciprocal estimate followed by two
; v_sub_u32 / v_cmp_ge_u32 / v_cndmask_b32 correction steps on v0.
681define i32 @urem32(i32 %a, i32 %b) {
682; GFX9-LABEL: urem32:
683; GFX9:       ; %bb.0:
684; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
685; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
686; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
687; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
688; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
689; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
690; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
691; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
692; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
693; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
694; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
695; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
696; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
697; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
698; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
699; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
700; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
701; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
702; GFX9-NEXT:    s_setpc_b64 s[30:31]
703  %d = urem i32 %a, %b
704  ret i32 %d
705}
706
; sdivrem64: signed 64-bit quotient and remainder of the same operands,
; returned together as <2 x i64> (element 0 = sdiv result, element 1 = srem
; result). Both elements of the initial undef vector are overwritten by the
; two insertelement instructions.
; The GFX9 check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing by hand.
; What the checks pin down, matching the fast/slow split described in the file
; header: v_or_b32 of v1 and v3 is compared against zero to pick a path;
; %bb.1 is the long sequence built from v_mad_u64_u32 and carry chains, and
; %bb.3 is the short 32-bit sequence (v_rcp_iflag_f32 based) that also writes
; 0 into v5 and v7. Both paths leave their results in v4..v7, which are copied
; to v0..v3 after .LBB8_4.
707define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
708; GFX9-LABEL: sdivrem64:
709; GFX9:       ; %bb.0:
710; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
711; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
712; GFX9-NEXT:    v_mov_b32_e32 v4, 0
713; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
714; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
715; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
716; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
717; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
718; GFX9-NEXT:    s_cbranch_execz .LBB8_2
719; GFX9-NEXT:  ; %bb.1:
720; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
721; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
722; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
723; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
724; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
725; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
726; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
727; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
728; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
729; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
730; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
731; GFX9-NEXT:    v_mov_b32_e32 v14, 0
732; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
733; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
734; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
735; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
736; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
737; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
738; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
739; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
740; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
741; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
742; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
743; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
744; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
745; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
746; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
747; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
748; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
749; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
750; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
751; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
752; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
753; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
754; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
755; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
756; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
757; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
758; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
759; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
760; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
761; GFX9-NEXT:    v_mul_hi_u32 v15, v13, v2
762; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
763; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v5
764; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
765; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
766; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
767; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
768; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
769; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
770; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
771; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
772; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
773; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
774; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v7
775; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
776; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0
777; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v2
778; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v7
779; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
780; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
781; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
782; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
783; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
784; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
785; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
786; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
787; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
788; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v2
789; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
790; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
791; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v6
792; GFX9-NEXT:    v_sub_u32_e32 v6, v4, v1
793; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v5, v0
794; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc
795; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v11
796; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5]
797; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v10
798; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
799; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v11
800; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
801; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v12, v10
802; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
803; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v2
804; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7]
805; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v2
806; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
807; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7]
808; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
809; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
810; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
811; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
812; GFX9-NEXT:    v_cndmask_b32_e64 v5, v16, v14, s[6:7]
813; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
814; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
815; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
816; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
817; GFX9-NEXT:    v_cndmask_b32_e64 v4, v15, v13, s[6:7]
818; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
819; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
820; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v9
821; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v5
822; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
823; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v2, v5
824; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5]
825; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9]
826; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v8, v11
827; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
828; GFX9-NEXT:    v_cndmask_b32_e64 v2, v12, v2, s[6:7]
829; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
830; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v3, s[6:7]
831; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
832; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
833; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
834; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v7
835; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
836; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
837; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
838; GFX9-NEXT:  .LBB8_2: ; %Flow
839; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[10:11]
840; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
841; GFX9-NEXT:    s_cbranch_execz .LBB8_4
842; GFX9-NEXT:  ; %bb.3:
843; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
844; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
845; GFX9-NEXT:    v_mov_b32_e32 v5, 0
846; GFX9-NEXT:    v_mov_b32_e32 v7, v5
847; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
848; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
849; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
850; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
851; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
852; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
853; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
854; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
855; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
856; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
857; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
858; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
859; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
860; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
861; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
862; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
863; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
864; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
865; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
866; GFX9-NEXT:  .LBB8_4:
867; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
868; GFX9-NEXT:    v_mov_b32_e32 v0, v4
869; GFX9-NEXT:    v_mov_b32_e32 v1, v5
870; GFX9-NEXT:    v_mov_b32_e32 v2, v6
871; GFX9-NEXT:    v_mov_b32_e32 v3, v7
872; GFX9-NEXT:    s_setpc_b64 s[30:31]
873  %d = sdiv i64 %a, %b
874  %r = srem i64 %a, %b
875  %ins.0 = insertelement <2 x i64> undef, i64 %d, i32 0
876  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
877  ret <2 x i64> %ins.1
878}
879
; udivrem64: unsigned 64-bit quotient and remainder of the same operands,
; returned together as <2 x i64> (element 0 = udiv result, element 1 = urem
; result). Both elements of the initial undef vector are overwritten by the
; two insertelement instructions.
; The GFX9 check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing by hand.
; What the checks pin down, matching the fast/slow split described in the file
; header: v_or_b32 of v1 and v3 is compared against zero to pick a path;
; %bb.1 is the long sequence built from v_mad_u64_u32 and carry chains, and
; %bb.3 is the short 32-bit sequence (v_rcp_iflag_f32 based) that also writes
; 0 into v5 and v7. Both paths leave their results in v4..v7, which are copied
; to v0..v3 after .LBB9_4.
880define <2 x i64> @udivrem64(i64 %a, i64 %b) {
881; GFX9-LABEL: udivrem64:
882; GFX9:       ; %bb.0:
883; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
884; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
885; GFX9-NEXT:    v_mov_b32_e32 v4, 0
886; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
887; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
888; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
889; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
890; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
891; GFX9-NEXT:    s_cbranch_execz .LBB9_2
892; GFX9-NEXT:  ; %bb.1:
893; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
894; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
895; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
896; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
897; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
898; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
899; GFX9-NEXT:    v_mov_b32_e32 v13, 0
900; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
901; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
902; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
903; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
904; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
905; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
906; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
907; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
908; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
909; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
910; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
911; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
912; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
913; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
914; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
915; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
916; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
917; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
918; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
919; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
920; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
921; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
922; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
923; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v14
924; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
925; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
926; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
927; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
928; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
929; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
930; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
931; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
932; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
933; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
934; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
935; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
936; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
937; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
938; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
939; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
940; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
941; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
942; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
943; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
944; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
945; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
946; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
947; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
948; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
949; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
950; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
951; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
952; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
953; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
954; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
955; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
956; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
957; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
958; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v0, v2
959; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
960; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v3
961; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[6:7]
962; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v2
963; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
964; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v3
965; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s[6:7]
966; GFX9-NEXT:    v_add_co_u32_e64 v11, s[6:7], 2, v6
967; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, v7, s[6:7]
968; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 1, v6
969; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
970; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v7, s[6:7]
971; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
972; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
973; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
974; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
975; GFX9-NEXT:    v_cndmask_b32_e64 v4, v14, v12, s[6:7]
976; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
977; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
978; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v8, v3, s[4:5]
979; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v9, v2
980; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
981; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
982; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
983; GFX9-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[6:7]
984; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
985; GFX9-NEXT:    v_cndmask_b32_e64 v4, v13, v11, s[6:7]
986; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
987; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[6:7]
988; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
989; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
990; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
991; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
992; GFX9-NEXT:  .LBB9_2: ; %Flow
993; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
994; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
995; GFX9-NEXT:    s_cbranch_execz .LBB9_4
996; GFX9-NEXT:  ; %bb.3:
997; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
998; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
999; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1000; GFX9-NEXT:    v_mov_b32_e32 v7, v5
1001; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1002; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1003; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1004; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
1005; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1006; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1007; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
1008; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
1009; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
1010; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
1011; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1012; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1013; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1014; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1015; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1016; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
1017; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1018; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
1019; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
1020; GFX9-NEXT:  .LBB9_4:
1021; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1022; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1023; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1024; GFX9-NEXT:    v_mov_b32_e32 v2, v6
1025; GFX9-NEXT:    v_mov_b32_e32 v3, v7
1026; GFX9-NEXT:    s_setpc_b64 s[30:31]
1027  %d = udiv i64 %a, %b
1028  %r = urem i64 %a, %b
1029  %ins.0 = insertelement <2 x i64> undef, i64 %d, i32 0
1030  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
1031  ret <2 x i64> %ins.1
1032}
1033
; sdiv64_known32: 64-bit divide whose operands are both produced by
; `ashr i64 ..., 32`, so only the upper 32 bits of each argument feed the
; division.
; NOTE(review): the function name says sdiv, but the IR below performs `udiv`
; on the ashr-extended values. Confirm whether `sdiv` was intended. Changing
; the opcode requires regenerating the check lines with
; utils/update_llc_test_checks.py, since they are autogenerated for the IR as
; currently written.
; What the checks pin down: a single straight-line block (%bb.0 only, no
; s_cbranch and no 64-bit slow path) that converts v3 and v1 to float, uses
; v_rcp_iflag_f32 / v_trunc_f32 / v_mad_f32, writes 0 to v1, and masks v0 with
; 0x7fffffff before returning.
1034define i64 @sdiv64_known32(i64 %a, i64 %b) {
1035; GFX9-LABEL: sdiv64_known32:
1036; GFX9:       ; %bb.0:
1037; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v3
1039; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
1040; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1041; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1042; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1043; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
1044; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1045; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1046; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1047; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
1048; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1049; GFX9-NEXT:    s_setpc_b64 s[30:31]
1050  %a.ext = ashr i64 %a, 32
1051  %b.ext = ashr i64 %b, 32
1052  %d = udiv i64 %a.ext, %b.ext
1053  ret i64 %d
1054}
1055
; udiv64_known32: unsigned 64-bit divide whose operands are both masked with
; 4294967295 (0xffffffff), so only the low 32 bits of each argument feed the
; division.
; The GFX9 check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing by hand.
; What the checks pin down: a single straight-line block (%bb.0 only, no
; s_cbranch and no 64-bit slow path) that converts v2 and v0 to float, uses
; v_rcp_iflag_f32 / v_trunc_f32 / v_mad_f32, and writes 0 to v1 before
; returning.
1056define i64 @udiv64_known32(i64 %a, i64 %b) {
1057; GFX9-LABEL: udiv64_known32:
1058; GFX9:       ; %bb.0:
1059; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
1061; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
1062; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v1
1063; GFX9-NEXT:    v_mul_f32_e32 v2, v0, v2
1064; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1065; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
1066; GFX9-NEXT:    v_mad_f32 v0, -v2, v1, v0
1067; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
1068; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1069; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
1070; GFX9-NEXT:    s_setpc_b64 s[30:31]
1071  %a.mask = and i64 %a, 4294967295
1072  %b.mask = and i64 %b, 4294967295
1073  %d = udiv i64 %a.mask, %b.mask
1074  ret i64 %d
1075}
1076