1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
4
; s_test_udiv_i64: full-width unsigned 64-bit division inside a kernel.
; The IR divides %x by %y with a single udiv and stores the quotient to %out.
; Expected code for the first RUN line is one straight-line block that builds
; a reciprocal estimate (v_rcp_f32) and refines it with integer multiplies and
; carry chains. Expected code for the second RUN line, which adds
; -amdgpu-codegenprepare-expand-div64, is a multi-block expansion with a
; udiv-do-while loop and scalar branches.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6; GCN-LABEL: s_test_udiv_i64:
7; GCN:       ; %bb.0:
8; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
9; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
10; GCN-NEXT:    s_mov_b32 s7, 0xf000
11; GCN-NEXT:    s_mov_b32 s6, -1
12; GCN-NEXT:    s_waitcnt lgkmcnt(0)
13; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
14; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
15; GCN-NEXT:    s_sub_u32 s4, 0, s8
16; GCN-NEXT:    s_subb_u32 s5, 0, s9
17; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
18; GCN-NEXT:    v_rcp_f32_e32 v0, v0
19; GCN-NEXT:    v_mov_b32_e32 v1, 0
20; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
21; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
22; GCN-NEXT:    v_trunc_f32_e32 v2, v2
23; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
24; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
25; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
26; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
27; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
28; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
29; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
30; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
31; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
32; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
33; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
34; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
35; GCN-NEXT:    v_mul_lo_u32 v7, v2, v5
36; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
37; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
38; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
39; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
40; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
41; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
42; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
43; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
44; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
45; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
46; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
47; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
48; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
49; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
50; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
51; GCN-NEXT:    s_mov_b32 s5, s1
52; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
53; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
54; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
55; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
56; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
57; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
58; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
59; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
60; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
61; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
62; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
63; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
64; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
65; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
66; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
67; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
68; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
69; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
70; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
71; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
72; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
73; GCN-NEXT:    v_mul_hi_u32 v5, s2, v2
74; GCN-NEXT:    v_mul_hi_u32 v6, s3, v2
75; GCN-NEXT:    v_mul_lo_u32 v2, s3, v2
76; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
77; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
78; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
79; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
80; GCN-NEXT:    s_mov_b32 s4, s0
81; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
82; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
83; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
84; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
85; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
86; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
87; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
88; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
89; GCN-NEXT:    v_mov_b32_e32 v5, s9
90; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
91; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
92; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
93; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
94; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
95; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
96; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v3
97; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
98; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
99; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
100; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v5
101; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
102; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
103; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
104; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
105; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
106; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
107; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
108; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
109; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
110; GCN-NEXT:    v_mov_b32_e32 v6, s3
111; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
112; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
113; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
114; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
115; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
116; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
117; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
118; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
119; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
120; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
121; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
122; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
123; GCN-NEXT:    s_endpgm
124;
125; GCN-IR-LABEL: s_test_udiv_i64:
126; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
127; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
128; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
129; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
130; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
131; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
132; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
133; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s4
134; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
135; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
136; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s5
137; GCN-IR-NEXT:    s_min_u32 s10, s12, s8
138; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
139; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
140; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
141; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
142; GCN-IR-NEXT:    s_sub_u32 s8, s10, s12
143; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
144; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
145; GCN-IR-NEXT:    s_mov_b32 s11, 0
146; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
147; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
148; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
149; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
150; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
151; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
152; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
153; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
154; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
155; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
156; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
157; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
158; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
159; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
160; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
161; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
162; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
163; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s14
164; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
165; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
166; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
167; GCN-IR-NEXT:    s_add_u32 s2, s2, s12
168; GCN-IR-NEXT:    s_mov_b32 s13, s11
169; GCN-IR-NEXT:    s_addc_u32 s3, s3, s11
170; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
171; GCN-IR-NEXT:    s_mov_b32 s7, 0
172; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
173; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
174; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
175; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
176; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
177; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
178; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
179; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
180; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
181; GCN-IR-NEXT:    s_ashr_i32 s10, s6, 31
182; GCN-IR-NEXT:    s_mov_b32 s11, s10
183; GCN-IR-NEXT:    s_and_b32 s6, s10, 1
184; GCN-IR-NEXT:    s_and_b64 s[14:15], s[10:11], s[4:5]
185; GCN-IR-NEXT:    s_sub_u32 s14, s12, s14
186; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
187; GCN-IR-NEXT:    s_subb_u32 s15, s13, s15
188; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
189; GCN-IR-NEXT:    s_add_u32 s2, s2, 1
190; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
191; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
192; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]
193; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
194; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
195; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
196; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
197; GCN-IR-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
198; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
199; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
200; GCN-IR-NEXT:    s_branch .LBB0_6
201; GCN-IR-NEXT:  .LBB0_5:
202; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
203; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
204; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
205; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
206; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
207; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
208; GCN-IR-NEXT:    s_mov_b32 s2, -1
209; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
210; GCN-IR-NEXT:    s_endpgm
; IR under test: one unrestricted udiv on the two i64 kernel arguments.
211  %result = udiv i64 %x, %y
212  store i64 %result, i64 addrspace(1)* %out
213  ret void
214}
215
; v_test_udiv_i64: full-width unsigned 64-bit division in a non-kernel
; function that returns the quotient of %x and %y directly.
; The expected code operates on the arguments in v[0:1] and v[2:3] and returns
; through s_setpc_b64. For the first RUN line it is a single straight-line
; block built around v_rcp_f32. For the second RUN line (with
; -amdgpu-codegenprepare-expand-div64) it is the udiv-do-while expansion, with
; control flow expressed through exec-mask updates (s_and_saveexec_b64 /
; s_cbranch_execz).
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
216define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
217; GCN-LABEL: v_test_udiv_i64:
218; GCN:       ; %bb.0:
219; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
221; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
222; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
223; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
224; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
225; GCN-NEXT:    v_rcp_f32_e32 v4, v4
226; GCN-NEXT:    v_mov_b32_e32 v13, 0
227; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
228; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
229; GCN-NEXT:    v_trunc_f32_e32 v5, v5
230; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
231; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
232; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
233; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
234; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
235; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
236; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
237; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
238; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
239; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
240; GCN-NEXT:    v_mul_hi_u32 v11, v4, v9
241; GCN-NEXT:    v_mul_hi_u32 v12, v4, v8
242; GCN-NEXT:    v_mul_hi_u32 v14, v5, v8
243; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
244; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
245; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
246; GCN-NEXT:    v_mul_lo_u32 v12, v5, v9
247; GCN-NEXT:    v_mul_hi_u32 v9, v5, v9
248; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
249; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
250; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v14, v13, vcc
251; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
252; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
253; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
254; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
255; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
256; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
257; GCN-NEXT:    v_mul_lo_u32 v7, v7, v4
258; GCN-NEXT:    v_mul_lo_u32 v6, v6, v4
259; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
260; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
261; GCN-NEXT:    v_mul_lo_u32 v10, v4, v7
262; GCN-NEXT:    v_mul_hi_u32 v11, v4, v6
263; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
264; GCN-NEXT:    v_mul_hi_u32 v9, v5, v6
265; GCN-NEXT:    v_mul_lo_u32 v6, v5, v6
266; GCN-NEXT:    v_mul_hi_u32 v8, v5, v7
267; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
268; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
269; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
270; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
271; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
272; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
273; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
274; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
275; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
276; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
277; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
278; GCN-NEXT:    v_mul_hi_u32 v7, v0, v4
279; GCN-NEXT:    v_mul_hi_u32 v8, v0, v5
280; GCN-NEXT:    v_mul_hi_u32 v9, v1, v5
281; GCN-NEXT:    v_mul_lo_u32 v5, v1, v5
282; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
283; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
284; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
285; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
286; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
287; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
288; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v13, vcc
289; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
290; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
291; GCN-NEXT:    v_mul_lo_u32 v6, v2, v5
292; GCN-NEXT:    v_mul_hi_u32 v7, v2, v4
293; GCN-NEXT:    v_mul_lo_u32 v8, v3, v4
294; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
295; GCN-NEXT:    v_mul_lo_u32 v7, v2, v4
296; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
297; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v1, v6
298; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
299; GCN-NEXT:    v_subb_u32_e64 v7, s[4:5], v8, v3, vcc
300; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v2
301; GCN-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
302; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
303; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
304; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
305; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
306; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
307; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[4:5]
308; GCN-NEXT:    v_add_i32_e64 v8, s[4:5], 2, v4
309; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
310; GCN-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v5, s[4:5]
311; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
312; GCN-NEXT:    v_add_i32_e64 v10, s[4:5], 1, v4
313; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
314; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
315; GCN-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, v5, s[4:5]
316; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
317; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
318; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
319; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
320; GCN-NEXT:    v_cndmask_b32_e64 v7, v10, v8, s[4:5]
321; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
322; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v9, s[4:5]
323; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
324; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
325; GCN-NEXT:    s_setpc_b64 s[30:31]
326;
327; GCN-IR-LABEL: v_test_udiv_i64:
328; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
329; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
331; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
332; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
333; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
334; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
335; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
336; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
337; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
338; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
339; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
340; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
341; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
342; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[6:7], 0, 0, vcc
343; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
344; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
345; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
346; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
347; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
348; GCN-IR-NEXT:    v_mov_b32_e32 v11, v9
349; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v1, 0, s[4:5]
350; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
351; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
352; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
353; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
354; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
355; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v6
356; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
357; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v6
358; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
359; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
360; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
361; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
362; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
363; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
364; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
365; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
366; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
367; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
368; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
369; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
370; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
371; GCN-IR-NEXT:    v_not_b32_e32 v1, v9
372; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
373; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
374; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
375; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
376; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
377; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
378; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
379; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[12:13], 1
380; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
381; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
382; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
383; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
384; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
385; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
386; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
387; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v8
388; GCN-IR-NEXT:    v_and_b32_e32 v13, v8, v3
389; GCN-IR-NEXT:    v_and_b32_e32 v12, v8, v2
390; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
391; GCN-IR-NEXT:    v_or_b32_e32 v5, v9, v5
392; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
393; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
394; GCN-IR-NEXT:    v_mov_b32_e32 v0, v8
395; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
396; GCN-IR-NEXT:    v_mov_b32_e32 v1, v9
397; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
398; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
399; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
400; GCN-IR-NEXT:    v_mov_b32_e32 v8, v6
401; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
402; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
403; GCN-IR-NEXT:  ; %bb.4: ; %Flow
404; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
405; GCN-IR-NEXT:  .LBB1_5: ; %Flow3
406; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
407; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
408; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
409; GCN-IR-NEXT:    v_or_b32_e32 v5, v6, v0
410; GCN-IR-NEXT:  .LBB1_6: ; %Flow4
411; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
412; GCN-IR-NEXT:    v_mov_b32_e32 v0, v5
413; GCN-IR-NEXT:    v_mov_b32_e32 v1, v4
414; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one unrestricted udiv on the two i64 function arguments.
415  %result = udiv i64 %x, %y
416  ret i64 %result
417}
418
; s_test_udiv24_64: unsigned 64-bit division in a kernel where both operands
; are first shifted right by 40, leaving at most 24 significant bits each.
; Both RUN configurations expect the same short sequence: the high dword of
; each operand is shifted right by 8, converted to f32, and divided through
; v_rcp_iflag_f32 with a one-step correction; the low result dword is masked
; with 0xffffff and the high result dword is zero.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
419define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
420; GCN-LABEL: s_test_udiv24_64:
421; GCN:       ; %bb.0:
422; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
423; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
424; GCN-NEXT:    s_mov_b32 s7, 0xf000
425; GCN-NEXT:    s_mov_b32 s6, -1
426; GCN-NEXT:    s_waitcnt lgkmcnt(0)
427; GCN-NEXT:    s_lshr_b32 s2, s4, 8
428; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
429; GCN-NEXT:    s_lshr_b32 s2, s3, 8
430; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
431; GCN-NEXT:    s_mov_b32 s4, s0
432; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
433; GCN-NEXT:    s_mov_b32 s5, s1
434; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
435; GCN-NEXT:    v_trunc_f32_e32 v2, v2
436; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
437; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
438; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
439; GCN-NEXT:    v_mov_b32_e32 v1, 0
440; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
441; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
442; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
443; GCN-NEXT:    s_endpgm
444;
445; GCN-IR-LABEL: s_test_udiv24_64:
446; GCN-IR:       ; %bb.0:
447; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
448; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
449; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
450; GCN-IR-NEXT:    s_mov_b32 s6, -1
451; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
452; GCN-IR-NEXT:    s_lshr_b32 s2, s4, 8
453; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
454; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 8
455; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
456; GCN-IR-NEXT:    s_mov_b32 s4, s0
457; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
458; GCN-IR-NEXT:    s_mov_b32 s5, s1
459; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
460; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
461; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
462; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
463; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
464; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
465; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
466; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
467; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
468; GCN-IR-NEXT:    s_endpgm
; IR under test: lshr by 40 bounds both udiv operands to 24 bits.
469  %1 = lshr i64 %x, 40
470  %2 = lshr i64 %y, 40
471  %result = udiv i64 %1, %2
472  store i64 %result, i64 addrspace(1)* %out
473  ret void
474}
475
; v_test_udiv24_i64: non-kernel counterpart of the 24-bit case. Both operands
; are shifted right by 40 before the udiv, leaving at most 24 significant bits.
; Both RUN configurations expect the same sequence: the high dwords (v1, v3)
; are shifted right by 8, converted to f32, and divided through
; v_rcp_iflag_f32; the result is masked with 0xffffff in v0 and v1 is zero.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
476define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
477; GCN-LABEL: v_test_udiv24_i64:
478; GCN:       ; %bb.0:
479; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
480; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
481; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
482; GCN-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
483; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
484; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
485; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
486; GCN-NEXT:    v_trunc_f32_e32 v2, v2
487; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
488; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
489; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
490; GCN-NEXT:    v_mov_b32_e32 v1, 0
491; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
492; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
493; GCN-NEXT:    s_setpc_b64 s[30:31]
494;
495; GCN-IR-LABEL: v_test_udiv24_i64:
496; GCN-IR:       ; %bb.0:
497; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
499; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
500; GCN-IR-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
501; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v1
502; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
503; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
504; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
505; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
506; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
507; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
508; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
509; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
510; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
511; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
; IR under test: lshr by 40 bounds both udiv operands to 24 bits.
512  %1 = lshr i64 %x, 40
513  %2 = lshr i64 %y, 40
514  %result = udiv i64 %1, %2
515  ret i64 %result
516}
517
; s_test_udiv32_i64: unsigned 64-bit division in a kernel where both operands
; are first shifted right by 32, so only their high dwords take part.
; Both RUN configurations expect the same sequence: the two high dwords are
; converted to f32 without any extra shift, divided through v_rcp_iflag_f32
; with a one-step correction, and stored with a zero high dword and no final
; mask.
; NOTE(review): this is the same f32-reciprocal sequence the 24-bit tests
; expect, although each operand here may carry up to 32 significant bits.
; Confirm against freshly regenerated assertions that this is intended.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
518define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
519; GCN-LABEL: s_test_udiv32_i64:
520; GCN:       ; %bb.0:
521; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
522; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
523; GCN-NEXT:    s_mov_b32 s7, 0xf000
524; GCN-NEXT:    s_mov_b32 s6, -1
525; GCN-NEXT:    s_waitcnt lgkmcnt(0)
526; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
527; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
528; GCN-NEXT:    s_mov_b32 s4, s0
529; GCN-NEXT:    s_mov_b32 s5, s1
530; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
531; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
532; GCN-NEXT:    v_trunc_f32_e32 v2, v2
533; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
534; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
535; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
536; GCN-NEXT:    v_mov_b32_e32 v1, 0
537; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
538; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
539; GCN-NEXT:    s_endpgm
540;
541; GCN-IR-LABEL: s_test_udiv32_i64:
542; GCN-IR:       ; %bb.0:
543; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
544; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
545; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
546; GCN-IR-NEXT:    s_mov_b32 s6, -1
547; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
548; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
549; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s3
550; GCN-IR-NEXT:    s_mov_b32 s4, s0
551; GCN-IR-NEXT:    s_mov_b32 s5, s1
552; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
553; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
554; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
555; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
556; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
557; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
558; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
559; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
560; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
561; GCN-IR-NEXT:    s_endpgm
; IR under test: lshr by 32 bounds both udiv operands to 32 bits.
562  %1 = lshr i64 %x, 32
563  %2 = lshr i64 %y, 32
564  %result = udiv i64 %1, %2
565  store i64 %result, i64 addrspace(1)* %out
566  ret void
567}
568
; s_test_udiv31_i64: unsigned 64-bit division in a kernel where both operands
; are first shifted right by 33, leaving at most 31 significant bits each.
; Both RUN configurations expect the same sequence: the high dword of each
; operand is shifted right by 1, converted to f32, and divided through
; v_rcp_iflag_f32 with a one-step correction; the low result dword is masked
; with 0x7fffffff and the high result dword is zero.
; NOTE(review): this is the same f32-reciprocal sequence the 24-bit tests
; expect, although each operand here may carry up to 31 significant bits.
; Confirm against freshly regenerated assertions that this is intended.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
569define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
570; GCN-LABEL: s_test_udiv31_i64:
571; GCN:       ; %bb.0:
572; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
573; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
574; GCN-NEXT:    s_mov_b32 s7, 0xf000
575; GCN-NEXT:    s_mov_b32 s6, -1
576; GCN-NEXT:    s_waitcnt lgkmcnt(0)
577; GCN-NEXT:    s_lshr_b32 s2, s4, 1
578; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
579; GCN-NEXT:    s_lshr_b32 s2, s3, 1
580; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
581; GCN-NEXT:    s_mov_b32 s4, s0
582; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
583; GCN-NEXT:    s_mov_b32 s5, s1
584; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
585; GCN-NEXT:    v_trunc_f32_e32 v2, v2
586; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
587; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
588; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
589; GCN-NEXT:    v_mov_b32_e32 v1, 0
590; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
591; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
592; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
593; GCN-NEXT:    s_endpgm
594;
595; GCN-IR-LABEL: s_test_udiv31_i64:
596; GCN-IR:       ; %bb.0:
597; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
598; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
599; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
600; GCN-IR-NEXT:    s_mov_b32 s6, -1
601; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
602; GCN-IR-NEXT:    s_lshr_b32 s2, s4, 1
603; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
604; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
605; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
606; GCN-IR-NEXT:    s_mov_b32 s4, s0
607; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
608; GCN-IR-NEXT:    s_mov_b32 s5, s1
609; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
610; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
611; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
612; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
613; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
614; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
615; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
616; GCN-IR-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
617; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
618; GCN-IR-NEXT:    s_endpgm
; IR under test: lshr by 33 bounds both udiv operands to 31 bits.
619  %1 = lshr i64 %x, 33
620  %2 = lshr i64 %y, 33
621  %result = udiv i64 %1, %2
622  store i64 %result, i64 addrspace(1)* %out
623  ret void
624}
625
; s_test_udiv23_i64: unsigned 64-bit division in a kernel where both operands
; are first shifted right by 41, leaving at most 23 significant bits each.
; Both RUN configurations expect the same sequence: the high dword of each
; operand is shifted right by 9, converted to f32, and divided through
; v_rcp_iflag_f32 with a one-step correction; the low result dword is masked
; with 0x7fffff and the high result dword is zero.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
626define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
627; GCN-LABEL: s_test_udiv23_i64:
628; GCN:       ; %bb.0:
629; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
630; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
631; GCN-NEXT:    s_mov_b32 s7, 0xf000
632; GCN-NEXT:    s_mov_b32 s6, -1
633; GCN-NEXT:    s_waitcnt lgkmcnt(0)
634; GCN-NEXT:    s_lshr_b32 s2, s4, 9
635; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
636; GCN-NEXT:    s_lshr_b32 s2, s3, 9
637; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
638; GCN-NEXT:    s_mov_b32 s4, s0
639; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
640; GCN-NEXT:    s_mov_b32 s5, s1
641; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
642; GCN-NEXT:    v_trunc_f32_e32 v2, v2
643; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
644; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
645; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
646; GCN-NEXT:    v_mov_b32_e32 v1, 0
647; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
648; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
649; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
650; GCN-NEXT:    s_endpgm
651;
652; GCN-IR-LABEL: s_test_udiv23_i64:
653; GCN-IR:       ; %bb.0:
654; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
655; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
656; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
657; GCN-IR-NEXT:    s_mov_b32 s6, -1
658; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
659; GCN-IR-NEXT:    s_lshr_b32 s2, s4, 9
660; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
661; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 9
662; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
663; GCN-IR-NEXT:    s_mov_b32 s4, s0
664; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
665; GCN-IR-NEXT:    s_mov_b32 s5, s1
666; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
667; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
668; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
669; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
670; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
671; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
672; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
673; GCN-IR-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
674; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
675; GCN-IR-NEXT:    s_endpgm
; IR under test: lshr by 41 bounds both udiv operands to 23 bits.
676  %1 = lshr i64 %x, 41
677  %2 = lshr i64 %y, 41
678  %result = udiv i64 %1, %2
679  store i64 %result, i64 addrspace(1)* %out
680  ret void
681}
682
; Kernel test: unsigned i48 division where both operands are first shifted
; right by 24 bits, so each has at most 24 significant bits. The i48 quotient
; is stored to %out; the expected output does this with one dword store plus
; one short store at offset 4.
; NOTE(review): despite the narrow operands, the expected output for the
; default run still contains the reciprocal / mul_hi sequence also seen in
; s_test_udiv_i64 - confirm whether a cheaper 24-bit path is intended here.
683define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
684; GCN-LABEL: s_test_udiv24_i48:
685; GCN:       ; %bb.0:
686; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
687; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
688; GCN-NEXT:    s_mov_b32 s6, 0xffff
689; GCN-NEXT:    s_mov_b32 s7, 0xff000000
690; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s6
691; GCN-NEXT:    s_waitcnt lgkmcnt(0)
692; GCN-NEXT:    s_and_b32 s3, s2, s6
693; GCN-NEXT:    s_and_b32 s2, s4, s7
694; GCN-NEXT:    v_mov_b32_e32 v0, s2
695; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 24
696; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
697; GCN-NEXT:    s_load_dword s8, s[0:1], 0xb
698; GCN-NEXT:    s_load_dword s9, s[0:1], 0xc
699; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
700; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 24
701; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
702; GCN-NEXT:    v_rcp_f32_e32 v1, v1
703; GCN-NEXT:    s_waitcnt lgkmcnt(0)
704; GCN-NEXT:    s_and_b32 s6, s9, s6
705; GCN-NEXT:    s_and_b32 s8, s8, s7
706; GCN-NEXT:    s_sub_u32 s0, 0, s0
707; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
708; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
709; GCN-NEXT:    v_trunc_f32_e32 v2, v2
710; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
711; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
712; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
713; GCN-NEXT:    s_subb_u32 s1, 0, s1
714; GCN-NEXT:    v_mov_b32_e32 v8, 0
715; GCN-NEXT:    v_mul_hi_u32 v3, s0, v1
716; GCN-NEXT:    v_mul_lo_u32 v4, s0, v2
717; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
718; GCN-NEXT:    s_mov_b32 s7, 0xf000
719; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
720; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
721; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
722; GCN-NEXT:    v_mul_lo_u32 v5, v1, v3
723; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
724; GCN-NEXT:    v_mul_hi_u32 v7, v1, v3
725; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
726; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
727; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
728; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
729; GCN-NEXT:    v_mul_lo_u32 v7, v2, v4
730; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
731; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
732; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
733; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v8, vcc
734; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
735; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
736; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
737; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
738; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
739; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
740; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
741; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
742; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
743; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
744; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
745; GCN-NEXT:    v_mul_hi_u32 v9, v1, v4
746; GCN-NEXT:    v_mul_hi_u32 v10, v1, v3
747; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
748; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
749; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
750; GCN-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
751; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
752; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
753; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
754; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v6, vcc
755; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
756; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
757; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
758; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
759; GCN-NEXT:    v_mov_b32_e32 v3, s8
760; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
761; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
762; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
763; GCN-NEXT:    v_mul_hi_u32 v1, v3, v1
764; GCN-NEXT:    v_mul_hi_u32 v2, v3, v2
765; GCN-NEXT:    s_mov_b32 s6, -1
766; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
767; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
768; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
769; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
770; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v8, vcc
771; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
772; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
773; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
774; GCN-NEXT:    v_mul_hi_u32 v7, v0, v1
775; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v1
776; GCN-NEXT:    v_mul_lo_u32 v10, v0, v1
777; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v2, vcc
778; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
779; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
780; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
781; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
782; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v6, vcc
783; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v3, v0
784; GCN-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v6, vcc
785; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v0
786; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
787; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
788; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
789; GCN-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
790; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
791; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
792; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
793; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
794; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
795; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
796; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[0:1]
797; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
798; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
799; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
800; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
801; GCN-NEXT:    s_endpgm
802;
803; GCN-IR-LABEL: s_test_udiv24_i48:
804; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
805; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xc
806; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
807; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xb
808; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
809; GCN-IR-NEXT:    s_load_dword s7, s[0:1], 0xe
810; GCN-IR-NEXT:    s_mov_b32 s8, 0xffff
811; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
812; GCN-IR-NEXT:    s_and_b32 s1, s2, s8
813; GCN-IR-NEXT:    s_mov_b32 s2, 0xff000000
814; GCN-IR-NEXT:    s_and_b32 s0, s3, s2
815; GCN-IR-NEXT:    s_and_b32 s3, s7, s8
816; GCN-IR-NEXT:    s_and_b32 s2, s6, s2
817; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
818; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
819; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
820; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
821; GCN-IR-NEXT:    s_mov_b64 s[0:1], 0
822; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
823; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
824; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
825; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
826; GCN-IR-NEXT:    s_min_u32 s10, s8, s9
827; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
828; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
829; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
830; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
831; GCN-IR-NEXT:    s_sub_u32 s8, s10, s12
832; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
833; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
834; GCN-IR-NEXT:    s_mov_b32 s11, 0
835; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
836; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
837; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
838; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
839; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
840; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
841; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
842; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
843; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
844; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
845; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
846; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
847; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
848; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
849; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
850; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
851; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
852; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
853; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
854; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
855; GCN-IR-NEXT:    s_not_b64 s[0:1], s[10:11]
856; GCN-IR-NEXT:    s_add_u32 s6, s0, s12
857; GCN-IR-NEXT:    s_mov_b32 s13, s11
858; GCN-IR-NEXT:    s_addc_u32 s7, s1, s11
859; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
860; GCN-IR-NEXT:    s_mov_b32 s1, 0
861; GCN-IR-NEXT:  .LBB7_3: ; %udiv-do-while
862; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
863; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
864; GCN-IR-NEXT:    s_lshr_b32 s0, s9, 31
865; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
866; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[0:1]
867; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
868; GCN-IR-NEXT:    s_sub_u32 s0, s16, s12
869; GCN-IR-NEXT:    s_subb_u32 s0, s17, s13
870; GCN-IR-NEXT:    s_ashr_i32 s10, s0, 31
871; GCN-IR-NEXT:    s_mov_b32 s11, s10
872; GCN-IR-NEXT:    s_and_b32 s0, s10, 1
873; GCN-IR-NEXT:    s_and_b64 s[14:15], s[10:11], s[2:3]
874; GCN-IR-NEXT:    s_sub_u32 s14, s12, s14
875; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
876; GCN-IR-NEXT:    s_subb_u32 s15, s13, s15
877; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
878; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
879; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
880; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
881; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[0:1]
882; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
883; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
884; GCN-IR-NEXT:  .LBB7_4: ; %Flow3
885; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
886; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
887; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
888; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
889; GCN-IR-NEXT:    s_branch .LBB7_6
890; GCN-IR-NEXT:  .LBB7_5:
891; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
892; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
893; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
894; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
895; GCN-IR-NEXT:  .LBB7_6: ; %udiv-end
896; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
897; GCN-IR-NEXT:    s_mov_b32 s6, -1
898; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
899; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
900; GCN-IR-NEXT:    s_endpgm
; Test input: keep only the upper 24 bits of each i48 operand, then divide.
901  %1 = lshr i48 %x, 24
902  %2 = lshr i48 %y, 24
903  %result = udiv i48 %1, %2
904  store i48 %result, i48 addrspace(1)* %out
905  ret void
906}
907
; Kernel test: unsigned i64 division with a constant numerator (24) and a
; variable denominator taken from the kernel argument %x. The i64 quotient is
; stored to %out. Expected output is recorded for both RUN lines of this file
; (default lowering, and lowering with -amdgpu-codegenprepare-expand-div64).
908define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
909; GCN-LABEL: s_test_udiv_k_num_i64:
910; GCN:       ; %bb.0:
911; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
912; GCN-NEXT:    s_mov_b32 s7, 0xf000
913; GCN-NEXT:    s_mov_b32 s6, -1
914; GCN-NEXT:    s_waitcnt lgkmcnt(0)
915; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
916; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
917; GCN-NEXT:    s_sub_u32 s4, 0, s2
918; GCN-NEXT:    s_subb_u32 s5, 0, s3
919; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
920; GCN-NEXT:    v_rcp_f32_e32 v0, v0
921; GCN-NEXT:    v_mov_b32_e32 v1, 0
922; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
923; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
924; GCN-NEXT:    v_trunc_f32_e32 v2, v2
925; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
926; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
927; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
928; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
929; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
930; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
931; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
932; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
933; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
934; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
935; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
936; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
937; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
938; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
939; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
940; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
941; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
942; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
943; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
944; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
945; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
946; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
947; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
948; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
949; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
950; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
951; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
952; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
953; GCN-NEXT:    s_mov_b32 s5, s1
954; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
955; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
956; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
957; GCN-NEXT:    v_mul_lo_u32 v7, v0, v3
958; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
959; GCN-NEXT:    v_mul_hi_u32 v9, v0, v3
960; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
961; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
962; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
963; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
964; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
965; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
966; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
967; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v6, vcc
968; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
969; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
970; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
971; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
972; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
973; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
974; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
975; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
976; GCN-NEXT:    v_mov_b32_e32 v4, s3
977; GCN-NEXT:    s_mov_b32 s4, s0
978; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
979; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
980; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
981; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
982; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
983; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
984; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
985; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
986; GCN-NEXT:    v_subb_u32_e64 v3, s[0:1], v3, v4, vcc
987; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s2, v2
988; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
989; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
990; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
991; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v4
992; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
993; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v3
994; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
995; GCN-NEXT:    v_add_i32_e64 v4, s[0:1], 2, v0
996; GCN-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1]
997; GCN-NEXT:    v_add_i32_e64 v6, s[0:1], 1, v0
998; GCN-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1]
999; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
1000; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
1001; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
1002; GCN-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[0:1]
1003; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
1004; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
1005; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
1006; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
1007; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
1008; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1009; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v4, s[0:1]
1010; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
1011; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1012; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1013; GCN-NEXT:    s_endpgm
1014;
1015; GCN-IR-LABEL: s_test_udiv_k_num_i64:
1016; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1017; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1018; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1019; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
1020; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
1021; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
1022; GCN-IR-NEXT:    s_min_u32 s6, s4, s5
1023; GCN-IR-NEXT:    s_add_u32 s8, s6, 0xffffffc5
1024; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
1025; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
1026; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
1027; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
1028; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
1029; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[8:9], 63
1030; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
1031; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
1032; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
1033; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
1034; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1035; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
1036; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
1037; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
1038; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
1039; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
1040; GCN-IR-NEXT:    s_sub_i32 s7, 63, s8
1041; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
1042; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s7
1043; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
1044; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1045; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s10
1046; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
1047; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
1048; GCN-IR-NEXT:    s_sub_u32 s6, 58, s6
1049; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
1050; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1051; GCN-IR-NEXT:    s_mov_b32 s5, 0
1052; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
1053; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1054; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
1055; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
1056; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
1057; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
1058; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
1059; GCN-IR-NEXT:    s_sub_u32 s4, s14, s12
1060; GCN-IR-NEXT:    s_subb_u32 s4, s15, s13
1061; GCN-IR-NEXT:    s_ashr_i32 s10, s4, 31
1062; GCN-IR-NEXT:    s_mov_b32 s11, s10
1063; GCN-IR-NEXT:    s_and_b32 s4, s10, 1
1064; GCN-IR-NEXT:    s_and_b64 s[16:17], s[10:11], s[2:3]
1065; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
1066; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
1067; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
1068; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
1069; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
1070; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
1071; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
1072; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
1073; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
1074; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
1075; GCN-IR-NEXT:  .LBB8_4: ; %Flow5
1076; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
1077; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
1078; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1079; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
1080; GCN-IR-NEXT:    s_branch .LBB8_6
1081; GCN-IR-NEXT:  .LBB8_5:
1082; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1083; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
1084; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
1085; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1086; GCN-IR-NEXT:    s_mov_b32 s2, -1
1087; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1088; GCN-IR-NEXT:    s_endpgm
; Test input: the constant 24 is the dividend, %x is the divisor.
1089  %result = udiv i64 24, %x
1090  store i64 %result, i64 addrspace(1)* %out
1091  ret void
1092}
1093
1094; define i64 @v_test_udiv_k_num_i64(i64 %x) {
1095;   %result = udiv i64 24, %x
1096;   ret i64 %result
1097; }
1098
; Non-kernel function test: unsigned i64 division with a power-of-two constant
; numerator (32768 = 0x8000) and a variable denominator %x. The quotient is
; returned directly rather than stored. Expected output is recorded for both
; RUN lines of this file.
1099define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
1100; GCN-LABEL: v_test_udiv_pow2_k_num_i64:
1101; GCN:       ; %bb.0:
1102; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1103; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
1104; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
1105; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1106; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
1107; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
1108; GCN-NEXT:    v_rcp_f32_e32 v2, v2
1109; GCN-NEXT:    v_mov_b32_e32 v11, 0
1110; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
1111; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
1112; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1113; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
1114; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1115; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1116; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
1117; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
1118; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
1119; GCN-NEXT:    v_mul_lo_u32 v9, v4, v2
1120; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1121; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
1122; GCN-NEXT:    v_mul_lo_u32 v7, v2, v6
1123; GCN-NEXT:    v_mul_hi_u32 v8, v2, v9
1124; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
1125; GCN-NEXT:    v_mul_hi_u32 v12, v3, v6
1126; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
1127; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
1128; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
1129; GCN-NEXT:    v_mul_lo_u32 v10, v3, v9
1130; GCN-NEXT:    v_mul_hi_u32 v9, v3, v9
1131; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
1132; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v8, v9, vcc
1133; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v12, v11, vcc
1134; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1135; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1136; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
1137; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
1138; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
1139; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
1140; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
1141; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
1142; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1143; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
1144; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
1145; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
1146; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
1147; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
1148; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
1149; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
1150; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
1151; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
1152; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
1153; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
1154; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
1155; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
1156; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1157; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1158; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1159; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v5, vcc
1160; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
1161; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
1162; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
1163; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1164; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
1165; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
1166; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0x8000, v4
1167; GCN-NEXT:    v_subb_u32_e64 v5, s[4:5], v5, v1, vcc
1168; GCN-NEXT:    v_sub_i32_e64 v6, s[4:5], v4, v0
1169; GCN-NEXT:    v_subbrev_u32_e64 v5, s[4:5], 0, v5, s[4:5]
1170; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
1171; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
1172; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v0
1173; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
1174; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v1
1175; GCN-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[4:5]
1176; GCN-NEXT:    v_add_i32_e64 v6, s[4:5], 2, v2
1177; GCN-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5]
1178; GCN-NEXT:    v_add_i32_e64 v8, s[4:5], 1, v2
1179; GCN-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
1180; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
1181; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
1182; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
1183; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v6, s[4:5]
1184; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
1185; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
1186; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1187; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
1188; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
1189; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1190; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v7, s[4:5]
1191; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
1192; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1193; GCN-NEXT:    s_setpc_b64 s[30:31]
1194;
1195; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64:
1196; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1197; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1198; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
1199; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
1200; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
1201; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
1202; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v6
1203; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
1204; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
1205; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
1206; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
1207; GCN-IR-NEXT:    v_mov_b32_e32 v2, s8
1208; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
1209; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
1210; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
1211; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
1212; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
1213; GCN-IR-NEXT:    v_mov_b32_e32 v3, v7
1214; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
1215; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1216; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
1217; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1218; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
1219; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
1220; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
1221; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
1222; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
1223; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
1224; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1225; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1226; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1227; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1228; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
1229; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1230; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
1231; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
1232; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
1233; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
1234; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
1235; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
1236; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
1237; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
1238; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1239; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
1240; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1241; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
1242; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
1243; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
1244; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1245; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
1246; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
1247; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
1248; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
1249; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
1250; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
1251; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
1252; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
1253; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
1254; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
1255; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
1256; GCN-IR-NEXT:    v_mov_b32_e32 v6, v10
1257; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
1258; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
1259; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
1260; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
1261; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1262; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
1263; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1264; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_3
1265; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1266; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1267; GCN-IR-NEXT:  .LBB9_5: ; %Flow3
1268; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1269; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
1270; GCN-IR-NEXT:    v_or_b32_e32 v3, v5, v1
1271; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v0
1272; GCN-IR-NEXT:  .LBB9_6: ; %Flow4
1273; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1274; GCN-IR-NEXT:    v_mov_b32_e32 v0, v2
1275; GCN-IR-NEXT:    v_mov_b32_e32 v1, v3
1276; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
; Test input: the constant 32768 is the dividend, %x is the divisor.
1277  %result = udiv i64 32768, %x
1278  ret i64 %result
1279}
1280
; Non-kernel function test: unsigned i64 division of %x by a power-of-two
; constant denominator (32768 = 2^15). The quotient is returned directly.
; In the default run the expected output is just a 64-bit logical shift right
; by 15 (v_alignbit_b32 for the low half, v_lshrrev_b32 for the high half).
; NOTE(review): in the -amdgpu-codegenprepare-expand-div64 run the expected
; output is still the full shift-subtract loop (udiv-do-while), i.e. the
; power-of-two divisor is not simplified to a shift there - confirm that this
; is the intended behaviour being pinned by the test.
1281define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
1282; GCN-LABEL: v_test_udiv_pow2_k_den_i64:
1283; GCN:       ; %bb.0:
1284; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1285; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 15
1286; GCN-NEXT:    v_lshrrev_b32_e32 v1, 15, v1
1287; GCN-NEXT:    s_setpc_b64 s[30:31]
1288;
1289; GCN-IR-LABEL: v_test_udiv_pow2_k_den_i64:
1290; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1291; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1292; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
1293; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
1294; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
1295; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
1296; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 48, v6
1297; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
1298; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
1299; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
1300; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1301; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
1302; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
1303; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
1304; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1305; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
1306; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1307; GCN-IR-NEXT:    s_cbranch_execz .LBB10_6
1308; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1309; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
1310; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
1311; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
1312; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
1313; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
1314; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
1315; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1316; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1317; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1318; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1319; GCN-IR-NEXT:    s_cbranch_execz .LBB10_5
1320; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1321; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
1322; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffcf, v6
1323; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
1324; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
1325; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
1326; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1327; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
1328; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
1329; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1330; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
1331; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
1332; GCN-IR-NEXT:    v_or_b32_e32 v6, v7, v4
1333; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1334; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v6
1335; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
1336; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
1337; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
1338; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
1339; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
1340; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
1341; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
1342; GCN-IR-NEXT:    v_and_b32_e32 v7, 0x8000, v7
1343; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
1344; GCN-IR-NEXT:    v_mov_b32_e32 v0, v9
1345; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
1346; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
1347; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
1348; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
1349; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1350; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
1351; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1352; GCN-IR-NEXT:    s_cbranch_execnz .LBB10_3
1353; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1354; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1355; GCN-IR-NEXT:  .LBB10_5: ; %Flow3
1356; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1357; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
1358; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
1359; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
1360; GCN-IR-NEXT:  .LBB10_6: ; %Flow4
1361; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1362; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
1363; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
1364; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
; Test input: %x is the dividend, the constant 32768 is the divisor.
1365  %result = udiv i64 %x, 32768
1366  ret i64 %result
1367}
1368
; Kernel variant: unsigned 64-bit division of the kernel argument %x by the
; constant 24 (not a power of two); the quotient is stored through %out.
; Without -amdgpu-codegenprepare-expand-div64 the expected output is
; straight-line code; with that flag the expected output contains branches and
; a %udiv-do-while loop.
; The assertion lines inside this function are autogenerated (see the first
; line of this file); regenerate them with the update script rather than
; editing them by hand.
1369define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
1370; GCN-LABEL: s_test_udiv_k_den_i64:
1371; GCN:       ; %bb.0:
1372; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
1373; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
1374; GCN-NEXT:    v_rcp_f32_e32 v0, v0
1375; GCN-NEXT:    s_movk_i32 s4, 0xffe8
1376; GCN-NEXT:    v_mov_b32_e32 v7, 0
1377; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1378; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1379; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1380; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1381; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
1382; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1383; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1384; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1385; GCN-NEXT:    s_mov_b32 s5, s1
1386; GCN-NEXT:    s_mov_b32 s7, 0xf000
1387; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
1388; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
1389; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
1390; GCN-NEXT:    s_mov_b32 s6, -1
1391; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
1392; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1393; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
1394; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
1395; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
1396; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
1397; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
1398; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
1399; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1400; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
1401; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1402; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
1403; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
1404; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
1405; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1406; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1407; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1408; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1409; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
1410; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
1411; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
1412; GCN-NEXT:    s_mov_b32 s4, s0
1413; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
1414; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1415; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
1416; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
1417; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
1418; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
1419; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
1420; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
1421; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1422; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
1423; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1424; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
1425; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
1426; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v7, vcc
1427; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1428; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1429; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1430; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1431; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
1432; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
1433; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
1434; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
1435; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
1436; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1437; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1438; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
1439; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
1440; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1441; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
1442; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
1443; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1444; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
1445; GCN-NEXT:    v_mul_lo_u32 v4, v1, 24
1446; GCN-NEXT:    v_mul_hi_u32 v5, v0, 24
1447; GCN-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
1448; GCN-NEXT:    v_mul_lo_u32 v8, v0, 24
1449; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1450; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
1451; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
1452; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1453; GCN-NEXT:    v_mov_b32_e32 v5, s3
1454; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
1455; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
1456; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v8
1457; GCN-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
1458; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v5
1459; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
1460; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
1461; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
1462; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v8
1463; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
1464; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1465; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
1466; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
1467; GCN-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
1468; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
1469; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
1470; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
1471; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1472; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1473; GCN-NEXT:    s_endpgm
1474;
1475; GCN-IR-LABEL: s_test_udiv_k_den_i64:
1476; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1477; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1478; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1479; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
1480; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
1481; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
1482; GCN-IR-NEXT:    s_min_u32 s8, s4, s5
1483; GCN-IR-NEXT:    s_sub_u32 s6, 59, s8
1484; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
1485; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
1486; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
1487; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
1488; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
1489; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
1490; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
1491; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
1492; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
1493; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
1494; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1495; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
1496; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
1497; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
1498; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
1499; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
1500; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
1501; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
1502; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
1503; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_4
1504; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1505; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s10
1506; GCN-IR-NEXT:    s_add_u32 s2, s8, 0xffffffc4
1507; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
1508; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
1509; GCN-IR-NEXT:    s_mov_b32 s5, 0
1510; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
1511; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1512; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1513; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
1514; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
1515; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
1516; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
1517; GCN-IR-NEXT:    s_sub_u32 s4, 23, s10
1518; GCN-IR-NEXT:    s_subb_u32 s4, 0, s11
1519; GCN-IR-NEXT:    s_ashr_i32 s8, s4, 31
1520; GCN-IR-NEXT:    s_and_b32 s4, s8, 1
1521; GCN-IR-NEXT:    s_and_b32 s8, s8, 24
1522; GCN-IR-NEXT:    s_sub_u32 s10, s10, s8
1523; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1524; GCN-IR-NEXT:    s_subb_u32 s11, s11, 0
1525; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
1526; GCN-IR-NEXT:    s_add_u32 s2, s2, 1
1527; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
1528; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
1529; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
1530; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
1531; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_3
1532; GCN-IR-NEXT:  .LBB11_4: ; %Flow5
1533; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
1534; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
1535; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1536; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
1537; GCN-IR-NEXT:    s_branch .LBB11_6
1538; GCN-IR-NEXT:  .LBB11_5:
1539; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
1540; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[10:11]
1541; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1542; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[10:11]
1543; GCN-IR-NEXT:  .LBB11_6: ; %udiv-end
1544; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1545; GCN-IR-NEXT:    s_mov_b32 s2, -1
1546; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1547; GCN-IR-NEXT:    s_endpgm
1548  %result = udiv i64 %x, 24
1549  store i64 %result, i64 addrspace(1)* %out
1550  ret void
1551}
1552
; Non-kernel function variant: returns the unsigned 64-bit quotient of %x
; divided by the constant 24 (not a power of two).
; Without -amdgpu-codegenprepare-expand-div64 the expected output is
; straight-line code; with that flag the expected output contains a
; %udiv-do-while loop whose exit is driven through the exec mask.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
1553define i64 @v_test_udiv_k_den_i64(i64 %x) {
1554; GCN-LABEL: v_test_udiv_k_den_i64:
1555; GCN:       ; %bb.0:
1556; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1557; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
1558; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x41c00000
1559; GCN-NEXT:    v_rcp_f32_e32 v2, v2
1560; GCN-NEXT:    s_movk_i32 s4, 0xffe8
1561; GCN-NEXT:    v_mov_b32_e32 v9, 0
1562; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
1563; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
1564; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1565; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
1566; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1567; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1568; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
1569; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
1570; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
1571; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
1572; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1573; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
1574; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
1575; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
1576; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
1577; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
1578; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
1579; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1580; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
1581; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
1582; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
1583; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
1584; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
1585; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1586; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1587; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1588; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
1589; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
1590; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
1591; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
1592; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
1593; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1594; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
1595; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
1596; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
1597; GCN-NEXT:    v_mul_hi_u32 v10, v3, v4
1598; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
1599; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
1600; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1601; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
1602; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
1603; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
1604; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
1605; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v9, vcc
1606; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1607; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1608; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1609; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
1610; GCN-NEXT:    v_mul_lo_u32 v4, v0, v3
1611; GCN-NEXT:    v_mul_hi_u32 v5, v0, v2
1612; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
1613; GCN-NEXT:    v_mul_hi_u32 v7, v1, v3
1614; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
1615; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1616; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1617; GCN-NEXT:    v_mul_lo_u32 v6, v1, v2
1618; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1619; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
1620; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
1621; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v9, vcc
1622; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1623; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1624; GCN-NEXT:    v_mul_lo_u32 v4, v3, 24
1625; GCN-NEXT:    v_mul_hi_u32 v5, v2, 24
1626; GCN-NEXT:    v_mul_lo_u32 v6, v2, 24
1627; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1628; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
1629; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
1630; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v0
1631; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
1632; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v4
1633; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
1634; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
1635; GCN-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
1636; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v2
1637; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
1638; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
1639; GCN-NEXT:    v_cmp_lt_u32_e64 s[4:5], 23, v0
1640; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
1641; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
1642; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
1643; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
1644; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[4:5]
1645; GCN-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
1646; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
1647; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
1648; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
1649; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
1650; GCN-NEXT:    s_setpc_b64 s[30:31]
1651;
1652; GCN-IR-LABEL: v_test_udiv_k_den_i64:
1653; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1654; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1655; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
1656; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
1657; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
1658; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
1659; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 59, v6
1660; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
1661; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
1662; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
1663; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1664; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
1665; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
1666; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
1667; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1668; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
1669; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1670; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
1671; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1672; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
1673; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
1674; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
1675; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
1676; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
1677; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
1678; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1679; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1680; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1681; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1682; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
1683; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1684; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
1685; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc4, v6
1686; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
1687; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
1688; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
1689; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1690; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
1691; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1692; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
1693; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
1694; GCN-IR-NEXT:    v_or_b32_e32 v6, v7, v4
1695; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1696; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 23, v6
1697; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
1698; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
1699; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
1700; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
1701; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
1702; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
1703; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
1704; GCN-IR-NEXT:    v_and_b32_e32 v7, 24, v7
1705; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
1706; GCN-IR-NEXT:    v_mov_b32_e32 v0, v9
1707; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
1708; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
1709; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
1710; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
1711; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1712; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
1713; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1714; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
1715; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1716; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1717; GCN-IR-NEXT:  .LBB12_5: ; %Flow3
1718; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1719; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
1720; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
1721; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
1722; GCN-IR-NEXT:  .LBB12_6: ; %Flow4
1723; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1724; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
1725; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
1726; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1727  %result = udiv i64 %x, 24
1728  ret i64 %result
1729}
1730
; Kernel variant with a constant numerator: %x is logically shifted right by
; 40, which leaves at most 24 significant bits, and the constant 24 is divided
; (unsigned) by that value; the quotient is stored through %out.
; Both check prefixes expect the same f32-based sequence (convert, reciprocal,
; multiply, truncate, one correction step). 0x41c00000 is the f32 encoding of
; 24.0.
1731define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
1732; GCN-LABEL: s_test_udiv24_k_num_i64:
1733; GCN:       ; %bb.0:
1734; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1735; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
1736; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1737; GCN-NEXT:    s_lshr_b32 s2, s3, 8
1738; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
1739; GCN-NEXT:    s_mov_b32 s3, 0xf000
1740; GCN-NEXT:    s_mov_b32 s2, -1
1741; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1742; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
1743; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1744; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1745; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
1746; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1747; GCN-NEXT:    v_mov_b32_e32 v1, 0
1748; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1749; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1750; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1751; GCN-NEXT:    s_endpgm
1752;
1753; GCN-IR-LABEL: s_test_udiv24_k_num_i64:
1754; GCN-IR:       ; %bb.0:
1755; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1756; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
1757; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1758; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 8
1759; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
1760; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1761; GCN-IR-NEXT:    s_mov_b32 s2, -1
1762; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1763; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
1764; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1765; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
1766; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4
1767; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1768; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1769; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1770; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1771; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1772; GCN-IR-NEXT:    s_endpgm
1773  %x.shr = lshr i64 %x, 40
1774  %result = udiv i64 24, %x.shr
1775  store i64 %result, i64 addrspace(1)* %out
1776  ret void
1777}
1778
; Kernel variant with a constant denominator: %x is logically shifted right by
; 40 (at most 24 significant bits remain) and divided (unsigned) by the
; constant 23423; the quotient is stored through %out.
; Both check prefixes expect the same sequence: no reciprocal instruction
; appears, the converted value is multiplied by the literal 0x38331158
; instead. 0x46b6fe00 is the f32 encoding of 23423.0.
1779define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
1780; GCN-LABEL: s_test_udiv24_k_den_i64:
1781; GCN:       ; %bb.0:
1782; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1783; GCN-NEXT:    s_mov_b32 s7, 0xf000
1784; GCN-NEXT:    s_mov_b32 s6, -1
1785; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1786; GCN-NEXT:    s_lshr_b32 s2, s3, 8
1787; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
1788; GCN-NEXT:    s_mov_b32 s2, 0x46b6fe00
1789; GCN-NEXT:    s_mov_b32 s4, s0
1790; GCN-NEXT:    s_mov_b32 s5, s1
1791; GCN-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
1792; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1793; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1794; GCN-NEXT:    v_mad_f32 v0, -v1, s2, v0
1795; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s2
1796; GCN-NEXT:    v_mov_b32_e32 v1, 0
1797; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1798; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1799; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1800; GCN-NEXT:    s_endpgm
1801;
1802; GCN-IR-LABEL: s_test_udiv24_k_den_i64:
1803; GCN-IR:       ; %bb.0:
1804; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1805; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
1806; GCN-IR-NEXT:    s_mov_b32 s6, -1
1807; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1808; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 8
1809; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
1810; GCN-IR-NEXT:    s_mov_b32 s2, 0x46b6fe00
1811; GCN-IR-NEXT:    s_mov_b32 s4, s0
1812; GCN-IR-NEXT:    s_mov_b32 s5, s1
1813; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
1814; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1815; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
1816; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s2, v0
1817; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s2
1818; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1819; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1820; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1821; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1822; GCN-IR-NEXT:    s_endpgm
1823  %x.shr = lshr i64 %x, 40
1824  %result = udiv i64 %x.shr, 23423
1825  store i64 %result, i64 addrspace(1)* %out
1826  ret void
1827}
1828
; Non-kernel function variant with a constant numerator: returns the unsigned
; quotient 24 / (%x >> 40). The logical shift by 40 leaves at most 24
; significant bits in the divisor.
; Both check prefixes expect the same f32-based sequence; 0x41c00000 is the
; f32 encoding of 24.0.
1829define i64 @v_test_udiv24_k_num_i64(i64 %x) {
1830; GCN-LABEL: v_test_udiv24_k_num_i64:
1831; GCN:       ; %bb.0:
1832; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1833; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
1834; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
1835; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
1836; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1837; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
1838; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1839; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1840; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
1841; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1842; GCN-NEXT:    v_mov_b32_e32 v1, 0
1843; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1844; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1845; GCN-NEXT:    s_setpc_b64 s[30:31]
1846;
1847; GCN-IR-LABEL: v_test_udiv24_k_num_i64:
1848; GCN-IR:       ; %bb.0:
1849; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1850; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
1851; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
1852; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
1853; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1854; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
1855; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1856; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
1857; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4
1858; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1859; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1860; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1861; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1862; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1863  %x.shr = lshr i64 %x, 40
1864  %result = udiv i64 24, %x.shr
1865  ret i64 %result
1866}
1867
; Non-kernel function variant with a power-of-two constant numerator: returns
; the unsigned quotient 32768 / (%x >> 40), where 32768 = 2^15. The logical
; shift by 40 leaves at most 24 significant bits in the divisor.
; Both check prefixes expect the same f32-based sequence; 0x47000000 is the
; f32 encoding of 32768.0.
1868define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
1869; GCN-LABEL: v_test_udiv24_pow2_k_num_i64:
1870; GCN:       ; %bb.0:
1871; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1872; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
1873; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
1874; GCN-NEXT:    s_mov_b32 s4, 0x47000000
1875; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1876; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
1877; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1878; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1879; GCN-NEXT:    v_mad_f32 v1, -v1, v0, s4
1880; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1881; GCN-NEXT:    v_mov_b32_e32 v1, 0
1882; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1883; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1884; GCN-NEXT:    s_setpc_b64 s[30:31]
1885;
1886; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64:
1887; GCN-IR:       ; %bb.0:
1888; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1889; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
1890; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
1891; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
1892; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1893; GCN-IR-NEXT:    v_mul_f32_e32 v1, s4, v1
1894; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1895; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
1896; GCN-IR-NEXT:    v_mad_f32 v1, -v1, v0, s4
1897; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1898; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1899; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1900; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1901; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1902  %x.shr = lshr i64 %x, 40
1903  %result = udiv i64 32768, %x.shr
1904  ret i64 %result
1905}
1906
; Non-kernel function variant with a power-of-two constant denominator:
; returns the unsigned quotient (%x >> 40) / 32768, where 32768 = 2^15.
; The two run configurations expect different code here:
;  - without -amdgpu-codegenprepare-expand-div64, a single right shift of v1
;    by 23 plus a zeroed v1 (40 + 15 = 55 = 32 + 23; presumably v1 holds the
;    upper 32 bits of %x -- confirm against the calling convention);
;  - with that flag, an f32-based sequence that multiplies by 0x38000000
;    (the f32 encoding of 2^-15) and applies one correction step against
;    0x47000000 (32768.0).
1907define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
1908; GCN-LABEL: v_test_udiv24_pow2_k_den_i64:
1909; GCN:       ; %bb.0:
1910; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1911; GCN-NEXT:    v_lshrrev_b32_e32 v0, 23, v1
1912; GCN-NEXT:    v_mov_b32_e32 v1, 0
1913; GCN-NEXT:    s_setpc_b64 s[30:31]
1914;
1915; GCN-IR-LABEL: v_test_udiv24_pow2_k_den_i64:
1916; GCN-IR:       ; %bb.0:
1917; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1918; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
1919; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
1920; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
1921; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38000000, v0
1922; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1923; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
1924; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s4, v0
1925; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
1926; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1927; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1928; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1929; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1930  %x.shr = lshr i64 %x, 40
1931  %result = udiv i64 %x.shr, 32768
1932  ret i64 %result
1933}
1934