1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3
4; 64-bit divides and rems should be split into a fast path and a slow path,
5; where the fast path uses a 32-bit operation on the low halves of the operands when both high halves are zero.
6
; sdiv64: signed 64-bit division (sdiv i64 %a, %b).
; The expected code ORs v1 with v3 and compares the result (as the high dword
; of v[4:5], whose low dword is 0) against zero to pick one of two paths:
;   %bb.1 - long expansion: both inputs are made non-negative with
;           ashr/add/xor, an unsigned 64-bit quotient is formed, and the
;           combined sign (v4 xor v9) is reapplied with xor/sub.
;   %bb.3 - short path: 32-bit unsigned reciprocal-based divide of v0 by v2,
;           with the high result dword (v5) set to 0.
; Both paths leave the result in v[4:5], which is copied to v[0:1] at the end.
; NOTE(review): v[0:1] / v[2:3] presumably carry %a / %b per the AMDGPU
; calling convention - that mapping is not shown in this file.
7define i64 @sdiv64(i64 %a, i64 %b) {
8; GFX9-LABEL: sdiv64:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
12; GFX9-NEXT:    v_mov_b32_e32 v4, 0
13; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
14; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
15; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
16; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
17; GFX9-NEXT:    s_cbranch_execz .LBB0_2
18; GFX9-NEXT:  ; %bb.1:
19; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
20; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
21; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
22; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
23; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
24; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
25; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
26; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
27; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
28; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
29; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
30; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
31; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
32; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
33; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
34; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
35; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
36; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
37; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
38; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
39; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
40; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
41; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
42; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
43; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
44; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
45; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
46; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
47; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
48; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
49; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
50; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
51; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
52; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
53; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
54; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
55; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
56; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
57; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
58; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
59; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
60; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
61; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
62; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
63; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
64; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
65; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
66; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
67; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
68; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
69; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
70; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
71; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
72; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v4
73; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v4, vcc
74; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
75; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
76; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v4
77; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
78; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
79; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
80; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
81; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
82; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
83; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
84; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
85; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
86; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v2
87; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
88; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
89; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
90; GFX9-NEXT:    v_sub_u32_e32 v7, v5, v1
91; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
92; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v7, v10, vcc
93; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v0, v11
94; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
95; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
96; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
97; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v11
98; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
99; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v10
100; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[4:5]
101; GFX9-NEXT:    v_add_co_u32_e64 v7, s[4:5], 2, v2
102; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
103; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[4:5], 0, v3, s[4:5]
104; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
105; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 1, v2
106; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
107; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
108; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v3, s[4:5]
109; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
110; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
111; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
112; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
113; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
114; GFX9-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s[4:5]
115; GFX9-NEXT:    v_cndmask_b32_e64 v6, v13, v8, s[4:5]
116; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
117; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v9
118; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
119; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
120; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
121; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v1, v2
122; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
123; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
124; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
125; GFX9-NEXT:  .LBB0_2: ; %Flow
126; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
127; GFX9-NEXT:    s_cbranch_execz .LBB0_4
128; GFX9-NEXT:  ; %bb.3:
129; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
130; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
131; GFX9-NEXT:    v_mov_b32_e32 v5, 0
132; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
133; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
134; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
135; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
136; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
137; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
138; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
139; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
140; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
141; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
142; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
143; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
144; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
145; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
146; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
147; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
148; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
149; GFX9-NEXT:  .LBB0_4:
150; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
151; GFX9-NEXT:    v_mov_b32_e32 v0, v4
152; GFX9-NEXT:    v_mov_b32_e32 v1, v5
153; GFX9-NEXT:    s_setpc_b64 s[30:31]
154  %d = sdiv i64 %a, %b
155  ret i64 %d
156}
157
; udiv64: unsigned 64-bit division (udiv i64 %a, %b).
; The expected code ORs v1 with v3 and compares the result (as the high dword
; of v[4:5], whose low dword is 0) against zero to pick one of two paths:
;   %bb.1 - long expansion producing a 64-bit unsigned quotient; unlike the
;           signed variant there is no ashr/xor sign handling.
;   %bb.3 - short path: 32-bit unsigned reciprocal-based divide of v0 by v2,
;           with the high result dword (v5) set to 0.
; Both paths leave the result in v[4:5], which is copied to v[0:1] at the end.
; NOTE(review): v[0:1] / v[2:3] presumably carry %a / %b per the AMDGPU
; calling convention - that mapping is not shown in this file.
158define i64 @udiv64(i64 %a, i64 %b) {
159; GFX9-LABEL: udiv64:
160; GFX9:       ; %bb.0:
161; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
163; GFX9-NEXT:    v_mov_b32_e32 v4, 0
164; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
165; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
166; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
167; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
168; GFX9-NEXT:    s_cbranch_execz .LBB1_2
169; GFX9-NEXT:  ; %bb.1:
170; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
171; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
172; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
173; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
174; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
175; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
176; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
177; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
178; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
179; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
180; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
181; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
182; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
183; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
184; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
185; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
186; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
187; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
188; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
189; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
190; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
191; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
192; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
193; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
194; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
195; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
196; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
197; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
198; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
199; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
200; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
201; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
202; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
203; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
204; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
205; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
206; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
207; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
208; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
209; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
210; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
211; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
212; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
213; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
214; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
215; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
216; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
217; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
218; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
219; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
220; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
221; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
222; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
223; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
224; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
225; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
226; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
227; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
228; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
229; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
230; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
231; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
232; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
233; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v8, v3, vcc
234; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v2
235; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
236; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
237; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
238; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
239; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
240; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v3
241; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v8, s[4:5]
242; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v6
243; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
244; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5]
245; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
246; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v6
247; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
248; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
249; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5]
250; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
252; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
253; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
254; GFX9-NEXT:    v_cndmask_b32_e64 v4, v11, v9, s[4:5]
255; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
256; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
257; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
258; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v0, vcc
259; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
260; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
261; GFX9-NEXT:  .LBB1_2: ; %Flow
262; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
263; GFX9-NEXT:    s_cbranch_execz .LBB1_4
264; GFX9-NEXT:  ; %bb.3:
265; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
266; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
267; GFX9-NEXT:    v_mov_b32_e32 v5, 0
268; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
269; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
270; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
271; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
272; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
273; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
274; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
275; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
276; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
277; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
278; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
279; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
280; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
281; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
282; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
283; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
284; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
285; GFX9-NEXT:  .LBB1_4:
286; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
287; GFX9-NEXT:    v_mov_b32_e32 v0, v4
288; GFX9-NEXT:    v_mov_b32_e32 v1, v5
289; GFX9-NEXT:    s_setpc_b64 s[30:31]
290  %d = udiv i64 %a, %b
291  ret i64 %d
292}
293
; srem64: signed 64-bit remainder (srem i64 %a, %b).
; The expected code ORs v1 with v3 and compares the result (as the high dword
; of v[4:5], whose low dword is 0) against zero to pick one of two paths:
;   %bb.1 - long expansion: both inputs are made non-negative with
;           ashr/add/xor, an unsigned 64-bit remainder is formed, and the
;           sign mask taken from v1 (held in v5) is reapplied with xor/sub.
;   %bb.3 - short path: 32-bit unsigned reciprocal-based remainder of v0 by
;           v2, with the high result dword (v5) set to 0.
; Both paths leave the result in v[4:5], which is copied to v[0:1] at the end.
; NOTE(review): v[0:1] / v[2:3] presumably carry %a / %b per the AMDGPU
; calling convention - that mapping is not shown in this file.
294define i64 @srem64(i64 %a, i64 %b) {
295; GFX9-LABEL: srem64:
296; GFX9:       ; %bb.0:
297; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
299; GFX9-NEXT:    v_mov_b32_e32 v4, 0
300; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
301; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
302; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
303; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
304; GFX9-NEXT:    s_cbranch_execz .LBB2_2
305; GFX9-NEXT:  ; %bb.1:
306; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
307; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
308; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
309; GFX9-NEXT:    v_xor_b32_e32 v9, v3, v4
310; GFX9-NEXT:    v_xor_b32_e32 v10, v2, v4
311; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v10
312; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v9
313; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v10
314; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
315; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
316; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
317; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
318; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
319; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
320; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
321; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
322; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v3
323; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
324; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
325; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v11
326; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v2
327; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
328; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
329; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v3
330; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
331; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v4, vcc
332; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
333; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
334; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
335; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
336; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
337; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
338; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v6, v2
339; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
340; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v11
341; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v12
342; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
343; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
344; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
345; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
346; GFX9-NEXT:    v_mul_hi_u32 v13, v12, v2
347; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
348; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v5
349; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
350; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
351; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
352; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
353; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
354; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
355; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
356; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
357; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
358; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
359; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v5
360; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
361; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
362; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
363; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v5
364; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
365; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
366; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
367; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
368; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
369; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
370; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
371; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
372; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
373; GFX9-NEXT:    v_mul_lo_u32 v2, v9, v0
374; GFX9-NEXT:    v_mul_lo_u32 v3, v10, v1
375; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v10, v0, 0
376; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v2
377; GFX9-NEXT:    v_sub_u32_e32 v2, v4, v1
378; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
379; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, vcc
380; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v0, v10
381; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[6:7], 0, v2, s[4:5]
382; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v9
383; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
384; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v3, v10
385; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
386; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v9
387; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, s[4:5]
388; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
389; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v3, v10
390; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
391; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
392; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v9
393; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
394; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
395; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
396; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
397; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
398; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v9
399; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
400; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
401; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
402; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v8, s[4:5]
403; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
404; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
405; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
406; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v5
407; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
408; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
409; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
410; GFX9-NEXT:  .LBB2_2: ; %Flow
411; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
412; GFX9-NEXT:    s_cbranch_execz .LBB2_4
413; GFX9-NEXT:  ; %bb.3:
414; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
415; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
416; GFX9-NEXT:    v_mov_b32_e32 v5, 0
417; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
418; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
419; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
420; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
421; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
422; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
423; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
424; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
425; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
426; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
427; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
428; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
429; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
430; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
431; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
432; GFX9-NEXT:  .LBB2_4:
433; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
434; GFX9-NEXT:    v_mov_b32_e32 v0, v4
435; GFX9-NEXT:    v_mov_b32_e32 v1, v5
436; GFX9-NEXT:    s_setpc_b64 s[30:31]
437  %d = srem i64 %a, %b
438  ret i64 %d
439}
440
; urem64: unsigned 64-bit remainder (urem i64 %a, %b).
; The expected code ORs v1 with v3 and compares the result (as the high dword
; of v[4:5], whose low dword is 0) against zero to pick one of two paths:
;   %bb.1 - long expansion producing a 64-bit unsigned remainder; unlike the
;           signed variant there is no ashr/xor sign handling.
;   %bb.3 - short path: 32-bit unsigned reciprocal-based remainder of v0 by
;           v2, with the high result dword (v5) set to 0.
; Both paths leave the result in v[4:5], which is copied to v[0:1] at the end.
; NOTE(review): v[0:1] / v[2:3] presumably carry %a / %b per the AMDGPU
; calling convention - that mapping is not shown in this file.
441define i64 @urem64(i64 %a, i64 %b) {
442; GFX9-LABEL: urem64:
443; GFX9:       ; %bb.0:
444; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
446; GFX9-NEXT:    v_mov_b32_e32 v4, 0
447; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
448; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
449; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
450; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
451; GFX9-NEXT:    s_cbranch_execz .LBB3_2
452; GFX9-NEXT:  ; %bb.1:
453; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
454; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
455; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
456; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
457; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
458; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
459; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
460; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
461; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
462; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
463; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
464; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
465; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
466; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
467; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
468; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
469; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
470; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
471; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
472; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
473; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
474; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
475; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
476; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
477; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
478; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
479; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
480; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
481; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
482; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
483; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
484; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
485; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
486; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
487; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
488; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
489; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
490; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
491; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
492; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
493; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
494; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
495; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
496; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
497; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
498; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
499; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
500; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
501; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
502; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
503; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
504; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
505; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
506; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
507; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
508; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
509; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
510; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v4
511; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v5
512; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v4, 0
513; GFX9-NEXT:    v_add3_u32 v5, v5, v7, v6
514; GFX9-NEXT:    v_sub_u32_e32 v6, v1, v5
515; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
516; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
517; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v0, v2
518; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
519; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
520; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
521; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
522; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
523; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
524; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
525; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
526; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
527; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
528; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v6, v2
529; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
530; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
531; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
532; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
533; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
534; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
535; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
536; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
537; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
538; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
539; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[4:5]
540; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
541; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
542; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
543; GFX9-NEXT:  .LBB3_2: ; %Flow
544; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
545; GFX9-NEXT:    s_cbranch_execz .LBB3_4
546; GFX9-NEXT:  ; %bb.3:
547; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
548; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
549; GFX9-NEXT:    v_mov_b32_e32 v5, 0
550; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
551; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
552; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
553; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
554; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
555; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
556; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
557; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
558; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
559; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
560; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
561; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
562; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
563; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
564; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
565; GFX9-NEXT:  .LBB3_4:
566; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
567; GFX9-NEXT:    v_mov_b32_e32 v0, v4
568; GFX9-NEXT:    v_mov_b32_e32 v1, v5
569; GFX9-NEXT:    s_setpc_b64 s[30:31]
570  %d = urem i64 %a, %b
571  ret i64 %d
572}
573
; sdiv32: signed 32-bit division (sdiv i32 %a, %b).
; The expected code is a single straight-line block with no exec-mask
; branching: both inputs are made non-negative with ashr/add/xor, an unsigned
; quotient is formed from a float reciprocal (v_rcp_iflag_f32) followed by
; two compare/select correction steps, and the combined sign (v5 xor v2) is
; reapplied with xor/sub before returning in v0.
574define i32 @sdiv32(i32 %a, i32 %b) {
575; GFX9-LABEL: sdiv32:
576; GFX9:       ; %bb.0:
577; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
578; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
579; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
580; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
581; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
582; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v1
583; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
584; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
585; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
586; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
587; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
588; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
589; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
590; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
591; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
592; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
593; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
594; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
595; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
596; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
597; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
598; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
599; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
600; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
601; GFX9-NEXT:    v_add_u32_e32 v4, 1, v3
602; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
603; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
604; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
605; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
606; GFX9-NEXT:    s_setpc_b64 s[30:31]
607  %d = sdiv i32 %a, %b
608  ret i32 %d
609}
610
; udiv32: unsigned 32-bit division (udiv i32 %a, %b).
; The expected code is a single straight-line block with no exec-mask
; branching: an unsigned quotient is formed from a float reciprocal
; (v_rcp_iflag_f32) followed by two compare/select correction steps, and is
; returned in v0. There is no sign handling.
611define i32 @udiv32(i32 %a, i32 %b) {
612; GFX9-LABEL: udiv32:
613; GFX9:       ; %bb.0:
614; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
616; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
617; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
618; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
619; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
620; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
621; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
622; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
623; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
624; GFX9-NEXT:    v_mul_lo_u32 v3, v2, v1
625; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
626; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
627; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
628; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v1
629; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
630; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
631; GFX9-NEXT:    v_add_u32_e32 v3, 1, v2
632; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
633; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
634; GFX9-NEXT:    s_setpc_b64 s[30:31]
635  %d = udiv i32 %a, %b
636  ret i32 %d
637}
638
; srem32: signed 32-bit remainder (srem i32 %a, %b).
; The expected code is a single straight-line block with no exec-mask
; branching: both inputs are made non-negative with ashr/add/xor, an unsigned
; remainder is formed from a float reciprocal (v_rcp_iflag_f32) followed by
; two compare/select correction steps, and the sign mask taken from v0 (held
; in v4) is reapplied with xor/sub before returning in v0.
639define i32 @srem32(i32 %a, i32 %b) {
640; GFX9-LABEL: srem32:
641; GFX9:       ; %bb.0:
642; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
644; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
645; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
646; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
647; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
648; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
649; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
650; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
651; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
652; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
653; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
654; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
655; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
656; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
657; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
658; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
659; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
660; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
661; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
662; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
663; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
664; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
665; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
666; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
667; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
668; GFX9-NEXT:    s_setpc_b64 s[30:31]
669  %d = srem i32 %a, %b
670  ret i32 %d
671}
672
; urem32: unsigned 32-bit remainder (urem i32 %a, %b).
; The expected code is a single straight-line block with no exec-mask
; branching: an unsigned remainder is formed from a float reciprocal
; (v_rcp_iflag_f32) followed by two compare/select correction steps, and is
; returned in v0. There is no sign handling.
673define i32 @urem32(i32 %a, i32 %b) {
674; GFX9-LABEL: urem32:
675; GFX9:       ; %bb.0:
676; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
678; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
679; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
680; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
681; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
682; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
683; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
684; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
685; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
686; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
687; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
688; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
689; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
690; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
691; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
692; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
693; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
694; GFX9-NEXT:    s_setpc_b64 s[30:31]
695  %d = urem i32 %a, %b
696  ret i32 %d
697}
698
; sdivrem64: signed 64-bit quotient and remainder of the same operands, returned
; together as <2 x i64> (element 0 = quotient, element 1 = remainder).
; The checked gfx900 output ORs v1 with v3 and compares the result against zero,
; then uses exec masking to run one of two paths:
;   %bb.1 - the long sequence built from v_mad_u64_u32 multiplies;
;   %bb.3 - the short sequence built around v_rcp_iflag_f32, which also sets
;           v5 and v7 to 0.
; Both paths leave their results in v[4:7], which are copied to v[0:3] at the end.
; NOTE(review): v1/v3 are presumably the high halves of %a/%b, which would make
; %bb.3 the 32-bit fast path described at the top of this file - confirm.
699define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
700; GFX9-LABEL: sdivrem64:
701; GFX9:       ; %bb.0:
702; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
704; GFX9-NEXT:    v_mov_b32_e32 v4, 0
705; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
706; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
707; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
708; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
709; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
710; GFX9-NEXT:    s_cbranch_execz .LBB8_2
711; GFX9-NEXT:  ; %bb.1:
712; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
713; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
714; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
715; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
716; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
717; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
718; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
719; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
720; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
721; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
722; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
723; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
724; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
725; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
726; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
727; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
728; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
729; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
730; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
731; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
732; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
733; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
734; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
735; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
736; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
737; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
738; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
739; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
740; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
741; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
742; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
743; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
744; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
745; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
746; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
747; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
748; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
749; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
750; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
751; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
752; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
753; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
754; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
755; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
756; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
757; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
758; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
759; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
760; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
761; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
762; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
763; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
764; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
765; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v7
766; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
767; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0
768; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v2
769; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v7
770; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
771; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
772; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
773; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
774; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
775; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
776; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
777; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
778; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
779; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v2
780; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
781; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
782; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v6
783; GFX9-NEXT:    v_sub_u32_e32 v6, v4, v1
784; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v5, v0
785; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc
786; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v11
787; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5]
788; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v10
789; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
790; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v11
791; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
792; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v12, v10
793; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
794; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v2
795; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7]
796; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v2
797; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
798; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7]
799; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
800; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
801; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
802; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
803; GFX9-NEXT:    v_cndmask_b32_e64 v5, v16, v14, s[6:7]
804; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
805; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
806; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
807; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
808; GFX9-NEXT:    v_cndmask_b32_e64 v4, v15, v13, s[6:7]
809; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
810; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
811; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v9
812; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v5
813; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
814; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v2, v5
815; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5]
816; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9]
817; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v8, v11
818; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
819; GFX9-NEXT:    v_cndmask_b32_e64 v2, v12, v2, s[6:7]
820; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
821; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v3, s[6:7]
822; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
823; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
824; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
825; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v7
826; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
827; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
828; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
829; GFX9-NEXT:  .LBB8_2: ; %Flow
830; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
831; GFX9-NEXT:    s_cbranch_execz .LBB8_4
832; GFX9-NEXT:  ; %bb.3:
833; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
834; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
835; GFX9-NEXT:    v_mov_b32_e32 v5, 0
836; GFX9-NEXT:    v_mov_b32_e32 v7, v5
837; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
838; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
839; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
840; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
841; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
842; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
843; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
844; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
845; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
846; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
847; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
848; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
849; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
850; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
851; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
852; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
853; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
854; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
855; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
856; GFX9-NEXT:  .LBB8_4:
857; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
858; GFX9-NEXT:    v_mov_b32_e32 v0, v4
859; GFX9-NEXT:    v_mov_b32_e32 v1, v5
860; GFX9-NEXT:    v_mov_b32_e32 v2, v6
861; GFX9-NEXT:    v_mov_b32_e32 v3, v7
862; GFX9-NEXT:    s_setpc_b64 s[30:31]
863  %d = sdiv i64 %a, %b
864  %r = srem i64 %a, %b
865  %ins.0 = insertelement <2 x i64> undef, i64 %d, i32 0
866  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
867  ret <2 x i64> %ins.1
868}
869
; udivrem64: unsigned 64-bit quotient and remainder of the same operands,
; returned together as <2 x i64> (element 0 = quotient, element 1 = remainder).
; The checked gfx900 output ORs v1 with v3 and compares the result against zero,
; then uses exec masking to run one of two paths:
;   %bb.1 - the long sequence built from v_mad_u64_u32 multiplies;
;   %bb.3 - the short sequence built around v_rcp_iflag_f32, which also sets
;           v5 and v7 to 0.
; Both paths leave their results in v[4:7], which are copied to v[0:3] at the end.
; NOTE(review): v1/v3 are presumably the high halves of %a/%b, which would make
; %bb.3 the 32-bit fast path described at the top of this file - confirm.
870define <2 x i64> @udivrem64(i64 %a, i64 %b) {
871; GFX9-LABEL: udivrem64:
872; GFX9:       ; %bb.0:
873; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
874; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
875; GFX9-NEXT:    v_mov_b32_e32 v4, 0
876; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
877; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
878; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
879; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
880; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
881; GFX9-NEXT:    s_cbranch_execz .LBB9_2
882; GFX9-NEXT:  ; %bb.1:
883; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
884; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
885; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
886; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
887; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
888; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
889; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
890; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
891; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
892; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
893; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
894; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
895; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
896; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
897; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
898; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
899; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
900; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
901; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
902; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
903; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
904; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
905; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
906; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
907; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
908; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
909; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
910; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
911; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
912; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
913; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
914; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
915; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
916; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
917; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
918; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
919; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
920; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
921; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
922; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
923; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
924; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
925; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
926; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
927; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
928; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
929; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
930; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
931; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
932; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
933; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
934; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
935; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
936; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
937; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
938; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
939; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
940; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
941; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
942; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
943; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
944; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
945; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
946; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
947; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v0, v2
948; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
949; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v3
950; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[6:7]
951; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v2
952; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
953; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v3
954; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s[6:7]
955; GFX9-NEXT:    v_add_co_u32_e64 v11, s[6:7], 2, v6
956; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, v7, s[6:7]
957; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 1, v6
958; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
959; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v7, s[6:7]
960; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
961; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
962; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
963; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
964; GFX9-NEXT:    v_cndmask_b32_e64 v4, v14, v12, s[6:7]
965; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
966; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
967; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v8, v3, s[4:5]
968; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v9, v2
969; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
970; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
971; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
972; GFX9-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[6:7]
973; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
974; GFX9-NEXT:    v_cndmask_b32_e64 v4, v13, v11, s[6:7]
975; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
976; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[6:7]
977; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
978; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
979; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
980; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
981; GFX9-NEXT:  .LBB9_2: ; %Flow
982; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
983; GFX9-NEXT:    s_cbranch_execz .LBB9_4
984; GFX9-NEXT:  ; %bb.3:
985; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
986; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
987; GFX9-NEXT:    v_mov_b32_e32 v5, 0
988; GFX9-NEXT:    v_mov_b32_e32 v7, v5
989; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
990; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
991; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
992; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
993; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
994; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
995; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
996; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
997; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
998; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
999; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1000; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1001; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1002; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1003; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1004; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
1005; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1006; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
1007; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
1008; GFX9-NEXT:  .LBB9_4:
1009; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1010; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1011; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1012; GFX9-NEXT:    v_mov_b32_e32 v2, v6
1013; GFX9-NEXT:    v_mov_b32_e32 v3, v7
1014; GFX9-NEXT:    s_setpc_b64 s[30:31]
1015  %d = udiv i64 %a, %b
1016  %r = urem i64 %a, %b
1017  %ins.0 = insertelement <2 x i64> undef, i64 %d, i32 0
1018  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
1019  ret <2 x i64> %ins.1
1020}
1021
; sdiv64_known32: both operands are arithmetic-shifted right by 32 before the
; divide, so only their upper halves contribute. The checked gfx900 output is a
; short branch-free sequence (v_rcp_iflag_f32 / v_mad_f32) rather than the
; two-path expansion checked for the plain 64-bit divides in this file; the
; result in v0 is masked with 0x7fffffff and v1 is set to 0.
; NOTE(review): despite "sdiv" in the name, the IR below performs udiv on the
; ashr'd values - confirm this is intended. Switching the opcode would require
; regenerating the assertions with utils/update_llc_test_checks.py.
1022define i64 @sdiv64_known32(i64 %a, i64 %b) {
1023; GFX9-LABEL: sdiv64_known32:
1024; GFX9:       ; %bb.0:
1025; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1026; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v3
1027; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
1028; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1029; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1030; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1031; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
1032; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1033; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1034; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1035; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
1036; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1037; GFX9-NEXT:    s_setpc_b64 s[30:31]
1038  %a.ext = ashr i64 %a, 32
1039  %b.ext = ashr i64 %b, 32
1040  %d = udiv i64 %a.ext, %b.ext
1041  ret i64 %d
1042}
1043
; udiv64_known32: both operands are masked with 4294967295 (their low 32 bits)
; before the unsigned divide. The checked gfx900 output is a short branch-free
; sequence (v_rcp_iflag_f32 / v_mad_f32) rather than the two-path expansion
; checked for the plain 64-bit divides in this file; v1 is set to 0 before
; returning.
1044define i64 @udiv64_known32(i64 %a, i64 %b) {
1045; GFX9-LABEL: udiv64_known32:
1046; GFX9:       ; %bb.0:
1047; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1048; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
1049; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
1050; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v1
1051; GFX9-NEXT:    v_mul_f32_e32 v2, v0, v2
1052; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1053; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
1054; GFX9-NEXT:    v_mad_f32 v0, -v2, v1, v0
1055; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
1056; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1057; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
1058; GFX9-NEXT:    s_setpc_b64 s[30:31]
1059  %a.mask = and i64 %a, 4294967295
1060  %b.mask = and i64 %b, 4294967295
1061  %d = udiv i64 %a.mask, %b.mask
1062  ret i64 %d
1063}
1064