1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
4
; Kernel test: takes two i64 kernel arguments %x and %y, computes a 64-bit
; remainder and stores the i64 result through the addrspace(1) pointer %out.
; The expected output under the default-run prefix is a straight-line sequence
; built around v_rcp_f32; under the prefix used by the
; -amdgpu-codegenprepare-expand-div64 run it is a branchy sequence with a
; loop labelled udiv-do-while.
; NOTE(review): the function is named "srem" but the IR at the bottom uses
; `urem`, and the expected output contains no sign handling. Confirm whether
; `srem` was intended; changing it requires regenerating every assertion below.
5define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6; GCN-LABEL: s_test_srem:
7; GCN:       ; %bb.0:
8; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
9; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
10; GCN-NEXT:    s_mov_b32 s7, 0xf000
11; GCN-NEXT:    s_mov_b32 s6, -1
12; GCN-NEXT:    s_waitcnt lgkmcnt(0)
13; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
14; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
15; GCN-NEXT:    s_sub_u32 s0, 0, s12
16; GCN-NEXT:    s_subb_u32 s1, 0, s13
17; GCN-NEXT:    s_mov_b32 s4, s8
18; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
19; GCN-NEXT:    v_rcp_f32_e32 v0, v0
20; GCN-NEXT:    s_mov_b32 s5, s9
21; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
22; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
23; GCN-NEXT:    v_trunc_f32_e32 v1, v1
24; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
25; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
26; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
27; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
28; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
29; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
30; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
31; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
32; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
33; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
34; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
35; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
36; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
37; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
38; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
39; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
40; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
41; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
42; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
43; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
44; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
45; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
46; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
47; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
48; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
49; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
50; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
51; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
52; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
53; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
54; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
55; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
56; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
57; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
58; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
59; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
60; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
61; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
62; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
63; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
64; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
65; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
66; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
67; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
68; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
69; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
70; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
71; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
72; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
73; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
74; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
75; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
76; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
77; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
78; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
79; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
80; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
81; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
82; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
83; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
84; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
85; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
86; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
87; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
88; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
89; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
90; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
91; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
92; GCN-NEXT:    v_mov_b32_e32 v3, s13
93; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
94; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
95; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
96; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
97; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
98; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
99; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
100; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
101; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
102; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
103; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
104; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
105; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
106; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
107; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
108; GCN-NEXT:    v_mov_b32_e32 v5, s11
109; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
110; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
111; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
112; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
113; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
114; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
115; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
116; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
117; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
118; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
119; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
120; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
121; GCN-NEXT:    s_endpgm
122;
123; GCN-IR-LABEL: s_test_srem:
124; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
125; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
126; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
127; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
128; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
129; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
130; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
131; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s4
132; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
133; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
134; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s5
135; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
136; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
137; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
138; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
139; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
140; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
141; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
142; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
143; GCN-IR-NEXT:    s_mov_b32 s11, 0
144; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
145; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
146; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[12:13], -1
147; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
148; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
149; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
150; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
151; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
152; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
153; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 0
154; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
155; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
156; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
157; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
158; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
159; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
160; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
161; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
162; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
163; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
164; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
165; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
166; GCN-IR-NEXT:    s_mov_b32 s7, 0
167; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
168; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
169; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
170; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
171; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
172; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
173; GCN-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
174; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
175; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
176; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
177; GCN-IR-NEXT:    s_mov_b32 s15, s14
178; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
179; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[4:5]
180; GCN-IR-NEXT:    s_sub_u32 s12, s12, s14
181; GCN-IR-NEXT:    s_subb_u32 s13, s13, s15
182; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
183; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
184; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
185; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
186; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
187; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
188; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
189; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
190; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
191; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
192; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
193; GCN-IR-NEXT:    s_branch .LBB0_6
194; GCN-IR-NEXT:  .LBB0_5:
195; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
196; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
197; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
198; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
199; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
200; GCN-IR-NEXT:    v_mul_lo_u32 v1, s4, v1
201; GCN-IR-NEXT:    v_mul_hi_u32 v2, s4, v0
202; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
203; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
204; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
205; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
206; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
207; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
208; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
209; GCN-IR-NEXT:    s_mov_b32 s10, -1
210; GCN-IR-NEXT:    s_mov_b32 s8, s0
211; GCN-IR-NEXT:    s_mov_b32 s9, s1
212; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
213; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
214; GCN-IR-NEXT:    s_endpgm
; NOTE(review): unsigned remainder here, although the test is named "srem".
215  %result = urem i64 %x, %y
216  store i64 %result, i64 addrspace(1)* %out
217  ret void
218}
219
; Non-kernel function test: returns `srem i64 %x, %y` directly.
; Unlike the kernel variants, the expected output ends in s_setpc_b64 and
; has no store. Under the default-run prefix the sequence is straight-line
; and built around v_rcp_f32; under the prefix used by the
; -amdgpu-codegenprepare-expand-div64 run it uses exec-masked control flow
; (s_and_saveexec_b64 / s_cbranch_execz) around a loop labelled udiv-do-while.
220define i64 @v_test_srem(i64 %x, i64 %y) {
221; GCN-LABEL: v_test_srem:
222; GCN:       ; %bb.0:
223; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
225; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
226; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
227; GCN-NEXT:    v_xor_b32_e32 v3, v3, v4
228; GCN-NEXT:    v_xor_b32_e32 v2, v2, v4
229; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
230; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
231; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
232; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
233; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
234; GCN-NEXT:    v_rcp_f32_e32 v4, v4
235; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
236; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
237; GCN-NEXT:    v_trunc_f32_e32 v5, v5
238; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
239; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
240; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
241; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
242; GCN-NEXT:    v_mul_lo_u32 v9, v6, v5
243; GCN-NEXT:    v_mul_lo_u32 v10, v7, v4
244; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
245; GCN-NEXT:    v_mul_lo_u32 v9, v6, v4
246; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
247; GCN-NEXT:    v_mul_lo_u32 v10, v4, v8
248; GCN-NEXT:    v_mul_hi_u32 v11, v4, v9
249; GCN-NEXT:    v_mul_hi_u32 v12, v4, v8
250; GCN-NEXT:    v_mul_hi_u32 v13, v5, v8
251; GCN-NEXT:    v_mul_lo_u32 v8, v5, v8
252; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
253; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
254; GCN-NEXT:    v_mul_lo_u32 v12, v5, v9
255; GCN-NEXT:    v_mul_hi_u32 v9, v5, v9
256; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
257; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
258; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v13, vcc
259; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
260; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
261; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
262; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
263; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
264; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
265; GCN-NEXT:    v_mul_lo_u32 v7, v7, v4
266; GCN-NEXT:    v_mul_lo_u32 v6, v6, v4
267; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
268; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
269; GCN-NEXT:    v_mul_lo_u32 v10, v4, v7
270; GCN-NEXT:    v_mul_hi_u32 v11, v4, v6
271; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
272; GCN-NEXT:    v_mul_hi_u32 v9, v5, v6
273; GCN-NEXT:    v_mul_lo_u32 v6, v5, v6
274; GCN-NEXT:    v_mul_hi_u32 v8, v5, v7
275; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
276; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
277; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
278; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
279; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
280; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
281; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
282; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
283; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
284; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
285; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
286; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
287; GCN-NEXT:    v_xor_b32_e32 v0, v0, v6
288; GCN-NEXT:    v_mul_lo_u32 v7, v0, v5
289; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
290; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
291; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
292; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
293; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
294; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
295; GCN-NEXT:    v_mul_lo_u32 v9, v1, v4
296; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
297; GCN-NEXT:    v_mul_hi_u32 v10, v1, v5
298; GCN-NEXT:    v_mul_lo_u32 v5, v1, v5
299; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
300; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
301; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v10, vcc
302; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
303; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
304; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
305; GCN-NEXT:    v_mul_hi_u32 v7, v2, v4
306; GCN-NEXT:    v_mul_lo_u32 v8, v3, v4
307; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
308; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
309; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
310; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
311; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
312; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v3, vcc
313; GCN-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
314; GCN-NEXT:    v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5]
315; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
316; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
317; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
318; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
319; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
320; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
321; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
322; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
323; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
324; GCN-NEXT:    v_sub_i32_e64 v10, s[4:5], v7, v2
325; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
326; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
327; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
328; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
329; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
330; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
331; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
332; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
333; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v10, s[4:5]
334; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
335; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
336; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
337; GCN-NEXT:    v_xor_b32_e32 v0, v0, v6
338; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
339; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
340; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
341; GCN-NEXT:    s_setpc_b64 s[30:31]
342;
343; GCN-IR-LABEL: v_test_srem:
344; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
345; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
347; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
348; GCN-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
349; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v4
350; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
351; GCN-IR-NEXT:    v_xor_b32_e32 v2, v2, v6
352; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
353; GCN-IR-NEXT:    v_xor_b32_e32 v3, v3, v6
354; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
355; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
356; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v2
357; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
358; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v3
359; GCN-IR-NEXT:    v_min_u32_e32 v10, v6, v7
360; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v0
361; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
362; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v1
363; GCN-IR-NEXT:    v_min_u32_e32 v11, v6, v7
364; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[6:7], v10, v11
365; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
366; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
367; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7]
368; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[7:8]
369; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
370; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
371; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[7:8]
372; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
373; GCN-IR-NEXT:    v_mov_b32_e32 v5, v4
374; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v1, 0, s[4:5]
375; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
376; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
377; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
378; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
379; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
380; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v7
381; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v8, vcc
382; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 63, v7
383; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
384; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[0:1], v6
385; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
386; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
387; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
388; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
389; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
390; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
391; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
392; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
393; GCN-IR-NEXT:    v_not_b32_e32 v9, v10
394; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
395; GCN-IR-NEXT:    v_not_b32_e32 v8, 0
396; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v9, v11
397; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
398; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
399; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
400; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
401; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
402; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
403; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
404; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
405; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
406; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
407; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
408; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v16, v12
409; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v17, v13, vcc
410; GCN-IR-NEXT:    v_or_b32_e32 v6, v14, v6
411; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
412; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v10
413; GCN-IR-NEXT:    v_or_b32_e32 v7, v15, v7
414; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
415; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
416; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
417; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
418; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
419; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v14
420; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5]
421; GCN-IR-NEXT:    v_mov_b32_e32 v15, v9
422; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
423; GCN-IR-NEXT:    v_mov_b32_e32 v14, v8
424; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
425; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
426; GCN-IR-NEXT:  ; %bb.4: ; %Flow
427; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
428; GCN-IR-NEXT:  .LBB1_5: ; %Flow3
429; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
430; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
431; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v7
432; GCN-IR-NEXT:    v_or_b32_e32 v6, v8, v6
433; GCN-IR-NEXT:  .LBB1_6: ; %Flow4
434; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
435; GCN-IR-NEXT:    v_mul_lo_u32 v7, v2, v9
436; GCN-IR-NEXT:    v_mul_hi_u32 v8, v2, v6
437; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v6
438; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, v6
439; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
440; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
441; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
442; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
443; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
444; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v5
445; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
446; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
447; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
448  %result = srem i64 %x, %y
449  ret i64 %result
450}
451
; Kernel test: both i64 operands are arithmetically shifted right by 41
; (64 - 41 = 23 remaining bits) before the `srem`; the result is stored
; through %out. The expected output is the same short float-based sequence
; for both check prefixes and ends with a 23-bit v_bfe_i32 sign-extension.
452define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
453; GCN-LABEL: s_test_srem23_64:
454; GCN:       ; %bb.0:
455; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
456; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
457; GCN-NEXT:    s_mov_b32 s7, 0xf000
458; GCN-NEXT:    s_mov_b32 s6, -1
459; GCN-NEXT:    s_waitcnt lgkmcnt(0)
460; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 41
461; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
462; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 41
463; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
464; GCN-NEXT:    s_xor_b32 s3, s2, s4
465; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
466; GCN-NEXT:    s_ashr_i32 s3, s3, 30
467; GCN-NEXT:    s_or_b32 s3, s3, 1
468; GCN-NEXT:    v_mov_b32_e32 v3, s3
469; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
470; GCN-NEXT:    v_trunc_f32_e32 v2, v2
471; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
472; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
473; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
474; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
475; GCN-NEXT:    s_mov_b32 s5, s1
476; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
477; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
478; GCN-NEXT:    s_mov_b32 s4, s0
479; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
480; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
481; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
482; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
483; GCN-NEXT:    s_endpgm
484;
485; GCN-IR-LABEL: s_test_srem23_64:
486; GCN-IR:       ; %bb.0:
487; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
488; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
489; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
490; GCN-IR-NEXT:    s_mov_b32 s6, -1
491; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
492; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 41
493; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
494; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 41
495; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
496; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
497; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
498; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
499; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
500; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
501; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
502; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
503; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
504; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
505; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
506; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
507; GCN-IR-NEXT:    s_mov_b32 s5, s1
508; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
509; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
510; GCN-IR-NEXT:    s_mov_b32 s4, s0
511; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
512; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
513; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
514; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
515; GCN-IR-NEXT:    s_endpgm
516  %1 = ashr i64 %x, 41
517  %2 = ashr i64 %y, 41
518  %result = srem i64 %1, %2
519  store i64 %result, i64 addrspace(1)* %out
520  ret void
521}
522
; Kernel test: both i64 operands are arithmetically shifted right by 40
; (64 - 40 = 24 remaining bits) before the `srem`; the result is stored
; through %out. The expected output is the same short float-based sequence
; for both check prefixes and ends with a 24-bit v_bfe_i32 sign-extension.
523define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
524; GCN-LABEL: s_test_srem24_64:
525; GCN:       ; %bb.0:
526; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
527; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
528; GCN-NEXT:    s_mov_b32 s7, 0xf000
529; GCN-NEXT:    s_mov_b32 s6, -1
530; GCN-NEXT:    s_waitcnt lgkmcnt(0)
531; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
532; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
533; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
534; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
535; GCN-NEXT:    s_xor_b32 s3, s2, s4
536; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
537; GCN-NEXT:    s_ashr_i32 s3, s3, 30
538; GCN-NEXT:    s_or_b32 s3, s3, 1
539; GCN-NEXT:    v_mov_b32_e32 v3, s3
540; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
541; GCN-NEXT:    v_trunc_f32_e32 v2, v2
542; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
543; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
544; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
545; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
546; GCN-NEXT:    s_mov_b32 s5, s1
547; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
548; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
549; GCN-NEXT:    s_mov_b32 s4, s0
550; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
551; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
552; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
553; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
554; GCN-NEXT:    s_endpgm
555;
556; GCN-IR-LABEL: s_test_srem24_64:
557; GCN-IR:       ; %bb.0:
558; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
559; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
560; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
561; GCN-IR-NEXT:    s_mov_b32 s6, -1
562; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
563; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
564; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
565; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
566; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
567; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
568; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
569; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
570; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
571; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
572; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
573; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
574; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
575; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
576; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
577; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
578; GCN-IR-NEXT:    s_mov_b32 s5, s1
579; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
580; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
581; GCN-IR-NEXT:    s_mov_b32 s4, s0
582; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
583; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
584; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
585; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
586; GCN-IR-NEXT:    s_endpgm
587  %1 = ashr i64 %x, 40
588  %2 = ashr i64 %y, 40
589  %result = srem i64 %1, %2
590  store i64 %result, i64 addrspace(1)* %out
591  ret void
592}
593
; Non-kernel function test: both i64 operands are arithmetically shifted
; right by 40 (64 - 40 = 24 remaining bits) and the `srem` of the shifted
; values is returned. The expected output is the same short float-based
; sequence for both check prefixes, ends with a 24-bit v_bfe_i32
; sign-extension, and returns via s_setpc_b64.
594define i64 @v_test_srem24_64(i64 %x, i64 %y) {
595; GCN-LABEL: v_test_srem24_64:
596; GCN:       ; %bb.0:
597; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 40
599; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
600; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v2
601; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
602; GCN-NEXT:    v_xor_b32_e32 v5, v0, v2
603; GCN-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
604; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
605; GCN-NEXT:    v_or_b32_e32 v5, 1, v5
606; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
607; GCN-NEXT:    v_trunc_f32_e32 v4, v4
608; GCN-NEXT:    v_mad_f32 v1, -v4, v3, v1
609; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
610; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
611; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
612; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
613; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
614; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
615; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
616; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
617; GCN-NEXT:    s_setpc_b64 s[30:31]
618;
619; GCN-IR-LABEL: v_test_srem24_64:
620; GCN-IR:       ; %bb.0:
621; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622; GCN-IR-NEXT:    v_ashr_i64 v[2:3], v[2:3], 40
623; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
624; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
625; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
626; GCN-IR-NEXT:    v_xor_b32_e32 v5, v0, v2
627; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
628; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v3
629; GCN-IR-NEXT:    v_or_b32_e32 v5, 1, v5
630; GCN-IR-NEXT:    v_mul_f32_e32 v4, v1, v4
631; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
632; GCN-IR-NEXT:    v_mad_f32 v1, -v4, v3, v1
633; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
634; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
635; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
636; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
637; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
638; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
639; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
640; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
641; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
642  %1 = ashr i64 %x, 40
643  %2 = ashr i64 %y, 40
644  %result = srem i64 %1, %2
645  ret i64 %result
646}
647
; Kernel test: both i64 operands are arithmetically shifted right by 39
; (64 - 39 = 25 remaining bits) before the `srem`; the result is stored
; through %out. The expected output is the same short float-based sequence
; for both check prefixes and ends with a 25-bit v_bfe_i32 sign-extension.
648define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
649; GCN-LABEL: s_test_srem25_64:
650; GCN:       ; %bb.0:
651; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
652; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
653; GCN-NEXT:    s_mov_b32 s7, 0xf000
654; GCN-NEXT:    s_mov_b32 s6, -1
655; GCN-NEXT:    s_waitcnt lgkmcnt(0)
656; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 39
657; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
658; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
659; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
660; GCN-NEXT:    s_xor_b32 s3, s2, s4
661; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
662; GCN-NEXT:    s_ashr_i32 s3, s3, 30
663; GCN-NEXT:    s_or_b32 s3, s3, 1
664; GCN-NEXT:    v_mov_b32_e32 v3, s3
665; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
666; GCN-NEXT:    v_trunc_f32_e32 v2, v2
667; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
668; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
669; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
670; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
671; GCN-NEXT:    s_mov_b32 s5, s1
672; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
673; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
674; GCN-NEXT:    s_mov_b32 s4, s0
675; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
676; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
677; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
678; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
679; GCN-NEXT:    s_endpgm
680;
681; GCN-IR-LABEL: s_test_srem25_64:
682; GCN-IR:       ; %bb.0:
683; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
684; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
685; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
686; GCN-IR-NEXT:    s_mov_b32 s6, -1
687; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
688; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 39
689; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
690; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
691; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
692; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
693; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
694; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
695; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
696; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
697; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
698; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
699; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
700; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
701; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
702; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
703; GCN-IR-NEXT:    s_mov_b32 s5, s1
704; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
705; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
706; GCN-IR-NEXT:    s_mov_b32 s4, s0
707; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
708; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
709; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
710; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
711; GCN-IR-NEXT:    s_endpgm
712  %1 = ashr i64 %x, 39
713  %2 = ashr i64 %y, 39
714  %result = srem i64 %1, %2
715  store i64 %result, i64 addrspace(1)* %out
716  ret void
717}
718
; srem of two i64 kernel arguments that are each arithmetically shifted right
; by 33 first, so both operands fit in 31 bits.  The GCN and GCN-IR check
; blocks below are identical: both expect the short f32-reciprocal sequence
; (v_rcp_iflag_f32, v_trunc_f32, v_mad_f32, then a conditional +1/-1 fix-up of
; the quotient) followed by v_bfe_i32 ..., 0, 31 and v_ashrrev_i32 to
; sign-extend the 31-bit remainder to 64 bits.
719define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
720; GCN-LABEL: s_test_srem31_64:
721; GCN:       ; %bb.0:
722; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
723; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
724; GCN-NEXT:    s_mov_b32 s7, 0xf000
725; GCN-NEXT:    s_mov_b32 s6, -1
726; GCN-NEXT:    s_waitcnt lgkmcnt(0)
727; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 33
728; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
729; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
730; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
731; GCN-NEXT:    s_xor_b32 s3, s2, s4
732; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
733; GCN-NEXT:    s_ashr_i32 s3, s3, 30
734; GCN-NEXT:    s_or_b32 s3, s3, 1
735; GCN-NEXT:    v_mov_b32_e32 v3, s3
736; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
737; GCN-NEXT:    v_trunc_f32_e32 v2, v2
738; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
739; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
740; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
741; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
742; GCN-NEXT:    s_mov_b32 s5, s1
743; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
744; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
745; GCN-NEXT:    s_mov_b32 s4, s0
746; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
747; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
748; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
749; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
750; GCN-NEXT:    s_endpgm
751;
752; GCN-IR-LABEL: s_test_srem31_64:
753; GCN-IR:       ; %bb.0:
754; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
755; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
756; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
757; GCN-IR-NEXT:    s_mov_b32 s6, -1
758; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
759; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 33
760; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
761; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
762; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
763; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
764; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
765; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
766; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
767; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
768; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
769; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
770; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
771; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
772; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
773; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
774; GCN-IR-NEXT:    s_mov_b32 s5, s1
775; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
776; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
777; GCN-IR-NEXT:    s_mov_b32 s4, s0
778; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
779; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
780; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
781; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
782; GCN-IR-NEXT:    s_endpgm
  ; Shifting right by 33 keeps only the top 31 bits of each input,
  ; sign-extended back to i64.
783  %1 = ashr i64 %x, 33
784  %2 = ashr i64 %y, 33
  ; Signed remainder of the two narrowed values, stored as a full i64.
785  %result = srem i64 %1, %2
786  store i64 %result, i64 addrspace(1)* %out
787  ret void
788}
789
790; Operands fit in 32 bits (ashr by 32 leaves at least 33 known sign bits)
; srem of two i64 kernel arguments that are each arithmetically shifted right
; by 32 first.  The GCN and GCN-IR check blocks below are identical: both
; expect the short f32-reciprocal sequence operating directly on the high
; dword of each input (s3 for %x, the single dword loaded from offset 0xe for
; %y), with no v_bfe_i32 before the v_ashrrev_i32 that produces the high half
; of the stored result.
791define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
792; GCN-LABEL: s_test_srem32_64:
793; GCN:       ; %bb.0:
794; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
795; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
796; GCN-NEXT:    s_mov_b32 s7, 0xf000
797; GCN-NEXT:    s_mov_b32 s6, -1
798; GCN-NEXT:    s_waitcnt lgkmcnt(0)
799; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
800; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
801; GCN-NEXT:    s_xor_b32 s2, s3, s4
802; GCN-NEXT:    s_ashr_i32 s2, s2, 30
803; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
804; GCN-NEXT:    s_or_b32 s2, s2, 1
805; GCN-NEXT:    v_mov_b32_e32 v3, s2
806; GCN-NEXT:    s_mov_b32 s5, s1
807; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
808; GCN-NEXT:    v_trunc_f32_e32 v2, v2
809; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
810; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
811; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
812; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
813; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
814; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
815; GCN-NEXT:    s_mov_b32 s4, s0
816; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
817; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
818; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
819; GCN-NEXT:    s_endpgm
820;
821; GCN-IR-LABEL: s_test_srem32_64:
822; GCN-IR:       ; %bb.0:
823; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
824; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
825; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
826; GCN-IR-NEXT:    s_mov_b32 s6, -1
827; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
828; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
829; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s3
830; GCN-IR-NEXT:    s_xor_b32 s2, s3, s4
831; GCN-IR-NEXT:    s_ashr_i32 s2, s2, 30
832; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
833; GCN-IR-NEXT:    s_or_b32 s2, s2, 1
834; GCN-IR-NEXT:    v_mov_b32_e32 v3, s2
835; GCN-IR-NEXT:    s_mov_b32 s5, s1
836; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
837; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
838; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
839; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
840; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
841; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
842; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
843; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
844; GCN-IR-NEXT:    s_mov_b32 s4, s0
845; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
846; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
847; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
848; GCN-IR-NEXT:    s_endpgm
  ; Shifting right by 32 leaves just the sign-extended high dword of each
  ; input.
849  %1 = ashr i64 %x, 32
850  %2 = ashr i64 %y, 32
  ; Signed remainder of the two narrowed values, stored as a full i64.
851  %result = srem i64 %1, %2
852  store i64 %result, i64 addrspace(1)* %out
853  ret void
854}
855
856; Operands need 33 bits (ashr by 31 leaves only 32 known sign bits)
; srem of two i64 kernel arguments that are each arithmetically shifted right
; by 31 first.  Unlike the 25/31/32-bit variants, neither prefix expects the
; short f32-reciprocal sequence here:
;   * GCN expects the long straight-line 64-bit sequence that starts from
;     v_rcp_f32 and continues with chains of v_mul_lo_u32 / v_mul_hi_u32 and
;     compare/select fix-ups.
;   * GCN-IR (the RUN line with -amdgpu-codegenprepare-expand-div64) expects
;     control flow with the blocks %udiv-bb1, %udiv-preheader, %udiv-do-while
;     and %udiv-end, i.e. a looping expansion.
; In both, the operands are made non-negative with xor/add or xor/sub against
; their sign masks, and the same pattern re-applies the sign to the result.
857define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
858; GCN-LABEL: s_test_srem33_64:
859; GCN:       ; %bb.0:
860; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
861; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
862; GCN-NEXT:    s_mov_b32 s11, 0xf000
863; GCN-NEXT:    s_mov_b32 s10, -1
864; GCN-NEXT:    s_waitcnt lgkmcnt(0)
865; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
866; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 31
867; GCN-NEXT:    s_ashr_i32 s0, s1, 31
868; GCN-NEXT:    s_add_u32 s8, s8, s0
869; GCN-NEXT:    s_mov_b32 s1, s0
870; GCN-NEXT:    s_addc_u32 s9, s9, s0
871; GCN-NEXT:    s_xor_b64 s[12:13], s[8:9], s[0:1]
872; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
873; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
874; GCN-NEXT:    s_sub_u32 s0, 0, s12
875; GCN-NEXT:    s_subb_u32 s1, 0, s13
876; GCN-NEXT:    s_ashr_i32 s6, s7, 31
877; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
878; GCN-NEXT:    v_rcp_f32_e32 v0, v0
879; GCN-NEXT:    s_mov_b32 s7, s6
880; GCN-NEXT:    s_mov_b32 s8, s4
881; GCN-NEXT:    s_mov_b32 s9, s5
882; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
883; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
884; GCN-NEXT:    v_trunc_f32_e32 v1, v1
885; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
886; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
887; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
888; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
889; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
890; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
891; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
892; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
893; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
894; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
895; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
896; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
897; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
898; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
899; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
900; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
901; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
902; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
903; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
904; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
905; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
906; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
907; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
908; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
909; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
910; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
911; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
912; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
913; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
914; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
915; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
916; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
917; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
918; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
919; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
920; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
921; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
922; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
923; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
924; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
925; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
926; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
927; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
928; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
929; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
930; GCN-NEXT:    s_add_u32 s0, s2, s6
931; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
932; GCN-NEXT:    s_addc_u32 s1, s3, s6
933; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
934; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[6:7]
935; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
936; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
937; GCN-NEXT:    v_mul_hi_u32 v4, s14, v1
938; GCN-NEXT:    v_mul_hi_u32 v5, s15, v1
939; GCN-NEXT:    v_mul_lo_u32 v1, s15, v1
940; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
941; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
942; GCN-NEXT:    v_mul_lo_u32 v4, s15, v0
943; GCN-NEXT:    v_mul_hi_u32 v0, s15, v0
944; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
945; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
946; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
947; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
948; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
949; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
950; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
951; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
952; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
953; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
954; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
955; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s15, v1
956; GCN-NEXT:    v_mov_b32_e32 v3, s13
957; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s14, v0
958; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
959; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
960; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
961; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
962; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
963; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
964; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
965; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
966; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
967; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
968; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
969; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
970; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
971; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
972; GCN-NEXT:    v_mov_b32_e32 v5, s15
973; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
974; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
975; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
976; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
977; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
978; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
979; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
980; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
981; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
982; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
983; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
984; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
985; GCN-NEXT:    v_xor_b32_e32 v1, s6, v1
986; GCN-NEXT:    v_mov_b32_e32 v2, s6
987; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
988; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
989; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
990; GCN-NEXT:    s_endpgm
991;
992; GCN-IR-LABEL: s_test_srem33_64:
993; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
994; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
995; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
996; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
997; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
998; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 31
999; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
1000; GCN-IR-NEXT:    s_ashr_i32 s6, s1, 31
1001; GCN-IR-NEXT:    s_mov_b32 s1, s0
1002; GCN-IR-NEXT:    s_mov_b32 s7, s6
1003; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
1004; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
1005; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
1006; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
1007; GCN-IR-NEXT:    s_sub_u32 s8, s8, s6
1008; GCN-IR-NEXT:    s_subb_u32 s9, s9, s6
1009; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
1010; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
1011; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
1012; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
1013; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s8
1014; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
1015; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s9
1016; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
1017; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
1018; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
1019; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
1020; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
1021; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
1022; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
1023; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
1024; GCN-IR-NEXT:    s_mov_b32 s13, 0
1025; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
1026; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
1027; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
1028; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
1029; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
1030; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
1031; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1032; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
1033; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
1034; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
1035; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
1036; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
1037; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
1038; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
1039; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1040; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s14
1041; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
1042; GCN-IR-NEXT:    s_addc_u32 s19, s9, -1
1043; GCN-IR-NEXT:    s_not_b64 s[6:7], s[12:13]
1044; GCN-IR-NEXT:    s_add_u32 s12, s6, s16
1045; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
1046; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
1047; GCN-IR-NEXT:    s_mov_b32 s7, 0
1048; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
1049; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1050; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
1051; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
1052; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1053; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
1054; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
1055; GCN-IR-NEXT:    s_sub_u32 s6, s18, s14
1056; GCN-IR-NEXT:    s_subb_u32 s6, s19, s15
1057; GCN-IR-NEXT:    s_ashr_i32 s16, s6, 31
1058; GCN-IR-NEXT:    s_mov_b32 s17, s16
1059; GCN-IR-NEXT:    s_and_b32 s6, s16, 1
1060; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[8:9]
1061; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
1062; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
1063; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
1064; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
1065; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
1066; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[6:7]
1067; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
1068; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
1069; GCN-IR-NEXT:  .LBB8_4: ; %Flow6
1070; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1071; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
1072; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
1073; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
1074; GCN-IR-NEXT:    s_branch .LBB8_6
1075; GCN-IR-NEXT:  .LBB8_5:
1076; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
1077; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
1078; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1079; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
1080; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
1081; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
1082; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
1083; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
1084; GCN-IR-NEXT:    s_mov_b32 s6, -1
1085; GCN-IR-NEXT:    v_readfirstlane_b32 s10, v2
1086; GCN-IR-NEXT:    v_mul_lo_u32 v2, s9, v0
1087; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
1088; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
1089; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1090; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
1091; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1092; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1093; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
1094; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
1095; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
1096; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
1097; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
1098; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1099; GCN-IR-NEXT:    s_endpgm
  ; Shifting right by 31 keeps the top 33 bits of each input, one bit more
  ; than fits in a 32-bit register.
1100  %1 = ashr i64 %x, 31
1101  %2 = ashr i64 %y, 31
  ; Signed remainder of the two 33-bit values, stored as a full i64.
1102  %result = srem i64 %1, %2
1103  store i64 %result, i64 addrspace(1)* %out
1104  ret void
1105}
1106
; srem of two i48 kernel arguments that are each arithmetically shifted right
; by 24 first, so both operands fit in 24 bits.  The result is stored as an
; i48, which both prefixes check as a dword store plus a short store at
; offset 4.  The two prefixes differ in how the remainder is computed:
;   * GCN expects the short f32-reciprocal sequence (v_rcp_iflag_f32,
;     v_trunc_f32, v_mad_f32, conditional +1/-1 fix-up) followed by
;     v_bfe_i32 ..., 0, 24 to sign-extend the 24-bit remainder.
;   * GCN-IR (the RUN line with -amdgpu-codegenprepare-expand-div64) expects
;     control flow with the blocks %udiv-bb1, %udiv-preheader, %udiv-do-while
;     and %udiv-end, operating on 64-bit register pairs.
1107define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
1108; GCN-LABEL: s_test_srem24_48:
1109; GCN:       ; %bb.0:
1110; GCN-NEXT:    s_load_dword s2, s[0:1], 0xc
1111; GCN-NEXT:    s_load_dword s3, s[0:1], 0xe
1112; GCN-NEXT:    s_load_dword s6, s[0:1], 0xd
1113; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1114; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
1115; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1116; GCN-NEXT:    s_sext_i32_i16 s1, s2
1117; GCN-NEXT:    s_sext_i32_i16 s2, s3
1118; GCN-NEXT:    v_mov_b32_e32 v0, s6
1119; GCN-NEXT:    v_alignbit_b32 v0, s2, v0, 24
1120; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
1121; GCN-NEXT:    v_mov_b32_e32 v2, s0
1122; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 24
1123; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v2
1124; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1125; GCN-NEXT:    v_xor_b32_e32 v5, v2, v0
1126; GCN-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
1127; GCN-NEXT:    v_or_b32_e32 v5, 1, v5
1128; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1129; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1130; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
1131; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
1132; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1133; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
1134; GCN-NEXT:    s_mov_b32 s7, 0xf000
1135; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1136; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
1137; GCN-NEXT:    s_mov_b32 s6, -1
1138; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
1139; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
1140; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1141; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1142; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
1143; GCN-NEXT:    s_endpgm
1144;
1145; GCN-IR-LABEL: s_test_srem24_48:
1146; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1147; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
1148; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
1149; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
1150; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xd
1151; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1152; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1153; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
1154; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
1155; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 24
1156; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
1157; GCN-IR-NEXT:    s_ashr_i32 s10, s5, 31
1158; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 24
1159; GCN-IR-NEXT:    s_mov_b32 s3, s2
1160; GCN-IR-NEXT:    s_mov_b32 s11, s10
1161; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[6:7], s[2:3]
1162; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
1163; GCN-IR-NEXT:    s_sub_u32 s4, s4, s2
1164; GCN-IR-NEXT:    s_subb_u32 s5, s5, s2
1165; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
1166; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
1167; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
1168; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[4:5], 0
1169; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
1170; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
1171; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
1172; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
1173; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
1174; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
1175; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
1176; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
1177; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
1178; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
1179; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
1180; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
1181; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
1182; GCN-IR-NEXT:    s_mov_b32 s13, 0
1183; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
1184; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
1185; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
1186; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
1187; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
1188; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
1189; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1190; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
1191; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
1192; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
1193; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
1194; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
1195; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
1196; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
1197; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1198; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[4:5], s14
1199; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
1200; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
1201; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
1202; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
1203; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
1204; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
1205; GCN-IR-NEXT:    s_mov_b32 s9, 0
1206; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
1207; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1208; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
1209; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
1210; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1211; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
1212; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
1213; GCN-IR-NEXT:    s_sub_u32 s8, s18, s14
1214; GCN-IR-NEXT:    s_subb_u32 s8, s19, s15
1215; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
1216; GCN-IR-NEXT:    s_mov_b32 s17, s16
1217; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
1218; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
1219; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
1220; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
1221; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
1222; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
1223; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
1224; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
1225; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
1226; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
1227; GCN-IR-NEXT:  .LBB9_4: ; %Flow3
1228; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1229; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
1230; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
1231; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
1232; GCN-IR-NEXT:    s_branch .LBB9_6
1233; GCN-IR-NEXT:  .LBB9_5:
1234; GCN-IR-NEXT:    v_mov_b32_e32 v0, s5
1235; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
1236; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
1237; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
1238; GCN-IR-NEXT:  .LBB9_6: ; %udiv-end
1239; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
1240; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
1241; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
1242; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
1243; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1244; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1245; GCN-IR-NEXT:    v_mov_b32_e32 v2, s5
1246; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1247; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1248; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
1249; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
1250; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
1251; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1252; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
1253; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1254; GCN-IR-NEXT:    s_mov_b32 s2, -1
1255; GCN-IR-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
1256; GCN-IR-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1257; GCN-IR-NEXT:    s_endpgm
  ; Shifting right by 24 keeps the top 24 bits of each i48 input,
  ; sign-extended back to i48.
1258  %1 = ashr i48 %x, 24
1259  %2 = ashr i48 %y, 24
  ; Signed remainder of the two narrowed values, stored as a full i48.
1260  %result = srem i48 %1, %2
1261  store i48 %result, i48 addrspace(1)* %out
1262  ret void
1263}
1264
1265define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
1266; GCN-LABEL: s_test_srem_k_num_i64:
1267; GCN:       ; %bb.0:
1268; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1269; GCN-NEXT:    s_mov_b32 s7, 0xf000
1270; GCN-NEXT:    s_mov_b32 s6, -1
1271; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1272; GCN-NEXT:    s_ashr_i32 s4, s3, 31
1273; GCN-NEXT:    s_add_u32 s2, s2, s4
1274; GCN-NEXT:    s_mov_b32 s5, s4
1275; GCN-NEXT:    s_addc_u32 s3, s3, s4
1276; GCN-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
1277; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
1278; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
1279; GCN-NEXT:    s_sub_u32 s2, 0, s8
1280; GCN-NEXT:    s_subb_u32 s3, 0, s9
1281; GCN-NEXT:    s_mov_b32 s4, s0
1282; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
1283; GCN-NEXT:    v_rcp_f32_e32 v0, v0
1284; GCN-NEXT:    s_mov_b32 s5, s1
1285; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1286; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1287; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1288; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
1289; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1290; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1291; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
1292; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
1293; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
1294; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
1295; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1296; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
1297; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
1298; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
1299; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
1300; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
1301; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
1302; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
1303; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1304; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
1305; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
1306; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1307; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
1308; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
1309; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1310; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1311; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1312; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1313; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
1314; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
1315; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
1316; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1317; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
1318; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1319; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
1320; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
1321; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
1322; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
1323; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
1324; GCN-NEXT:    v_mul_hi_u32 v4, v1, v2
1325; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1326; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1327; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
1328; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
1329; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
1330; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1331; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1332; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1333; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1334; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1335; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
1336; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
1337; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
1338; GCN-NEXT:    v_mov_b32_e32 v3, s9
1339; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1340; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
1341; GCN-NEXT:    v_mul_lo_u32 v1, s9, v0
1342; GCN-NEXT:    v_mul_hi_u32 v2, s8, v0
1343; GCN-NEXT:    v_mul_lo_u32 v0, s8, v0
1344; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1345; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
1346; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
1347; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
1348; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
1349; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
1350; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
1351; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
1352; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
1353; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
1354; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
1355; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
1356; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
1357; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
1358; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
1359; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
1360; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
1361; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1362; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
1363; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
1364; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1365; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
1366; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
1367; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1368; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
1369; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1370; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
1371; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1372; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1373; GCN-NEXT:    s_endpgm
1374;
1375; GCN-IR-LABEL: s_test_srem_k_num_i64:
1376; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1377; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1378; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1379; GCN-IR-NEXT:    s_ashr_i32 s6, s3, 31
1380; GCN-IR-NEXT:    s_mov_b32 s7, s6
1381; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
1382; GCN-IR-NEXT:    s_sub_u32 s4, s2, s6
1383; GCN-IR-NEXT:    s_subb_u32 s5, s3, s6
1384; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s4
1385; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
1386; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s5
1387; GCN-IR-NEXT:    s_min_u32 s8, s2, s3
1388; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
1389; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
1390; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
1391; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
1392; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
1393; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
1394; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
1395; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
1396; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
1397; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
1398; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
1399; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1400; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
1401; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
1402; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
1403; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
1404; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
1405; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
1406; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
1407; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1408; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
1409; GCN-IR-NEXT:    s_add_u32 s14, s4, -1
1410; GCN-IR-NEXT:    s_addc_u32 s15, s5, -1
1411; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
1412; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
1413; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
1414; GCN-IR-NEXT:    s_mov_b32 s3, 0
1415; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
1416; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1417; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
1418; GCN-IR-NEXT:    s_lshr_b32 s2, s7, 31
1419; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
1420; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[2:3]
1421; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
1422; GCN-IR-NEXT:    s_sub_u32 s2, s14, s10
1423; GCN-IR-NEXT:    s_subb_u32 s2, s15, s11
1424; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
1425; GCN-IR-NEXT:    s_mov_b32 s13, s12
1426; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
1427; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[4:5]
1428; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
1429; GCN-IR-NEXT:    s_subb_u32 s11, s11, s13
1430; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
1431; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
1432; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
1433; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
1434; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
1435; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_3
1436; GCN-IR-NEXT:  .LBB10_4: ; %Flow5
1437; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
1438; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
1439; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
1440; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
1441; GCN-IR-NEXT:    s_branch .LBB10_6
1442; GCN-IR-NEXT:  .LBB10_5:
1443; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
1444; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
1445; GCN-IR-NEXT:  .LBB10_6: ; %udiv-end
1446; GCN-IR-NEXT:    v_mul_lo_u32 v1, s4, v1
1447; GCN-IR-NEXT:    v_mul_hi_u32 v2, s4, v0
1448; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
1449; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
1450; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1451; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1452; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1453; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
1454; GCN-IR-NEXT:    s_mov_b32 s2, -1
1455; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
1456; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1457; GCN-IR-NEXT:    s_endpgm
1458  %result = srem i64 24, %x
1459  store i64 %result, i64 addrspace(1)* %out
1460  ret void
1461}
1462
; Test: 64-bit signed remainder with a constant numerator (24) and a variable
; denominator %x, in a plain (non-amdgpu_kernel) function that returns the
; result.
; The GCN checks cover the expansion built around v_rcp_f32 and mul_hi/mul_lo
; sequences. The GCN-IR checks (the RUN line that adds
; -amdgpu-codegenprepare-expand-div64) cover the expansion that contains an
; explicit %udiv-do-while loop.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
1463define i64 @v_test_srem_k_num_i64(i64 %x) {
1464; GCN-LABEL: v_test_srem_k_num_i64:
1465; GCN:       ; %bb.0:
1466; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1467; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1468; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1469; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
1470; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
1471; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
1472; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
1473; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
1474; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1475; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
1476; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
1477; GCN-NEXT:    v_rcp_f32_e32 v2, v2
1478; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
1479; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
1480; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1481; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
1482; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1483; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1484; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
1485; GCN-NEXT:    v_mul_lo_u32 v7, v4, v3
1486; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
1487; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
1488; GCN-NEXT:    v_mul_lo_u32 v7, v4, v2
1489; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
1490; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
1491; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
1492; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
1493; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
1494; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
1495; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
1496; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
1497; GCN-NEXT:    v_mul_lo_u32 v10, v3, v7
1498; GCN-NEXT:    v_mul_hi_u32 v7, v3, v7
1499; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
1500; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
1501; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
1502; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1503; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1504; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
1505; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
1506; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
1507; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
1508; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
1509; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
1510; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1511; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
1512; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
1513; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
1514; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
1515; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
1516; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
1517; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
1518; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
1519; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
1520; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
1521; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
1522; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
1523; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
1524; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1525; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1526; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1527; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
1528; GCN-NEXT:    v_mul_lo_u32 v4, v3, 24
1529; GCN-NEXT:    v_mul_hi_u32 v2, v2, 24
1530; GCN-NEXT:    v_mul_hi_u32 v3, v3, 24
1531; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1532; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
1533; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
1534; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
1535; GCN-NEXT:    v_mul_lo_u32 v2, v0, v2
1536; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1537; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
1538; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
1539; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
1540; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
1541; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
1542; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v1
1543; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
1544; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v5, v0
1545; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
1546; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
1547; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
1548; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
1549; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
1550; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
1551; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
1552; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
1553; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
1554; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
1555; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1556; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1557; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
1558; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
1559; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
1560; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1561; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
1562; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
1563; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
1564; GCN-NEXT:    s_setpc_b64 s[30:31]
1565;
1566; GCN-IR-LABEL: v_test_srem_k_num_i64:
1567; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1568; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1569; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1570; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
1571; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
1572; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1573; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
1574; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
1575; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
1576; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
1577; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
1578; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
1579; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v6
1580; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
1581; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
1582; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
1583; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
1584; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
1585; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, 24, 0, s[4:5]
1586; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
1587; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
1588; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1589; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1590; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
1591; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1592; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
1593; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
1594; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
1595; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
1596; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
1597; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
1598; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1599; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1600; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1601; GCN-IR-NEXT:    s_cbranch_execz .LBB11_5
1602; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1603; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
1604; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
1605; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v7
1606; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
1607; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
1608; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
1609; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1610; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
1611; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1612; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
1613; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1614; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
1615; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
1616; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
1617; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1618; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
1619; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
1620; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
1621; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
1622; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
1623; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
1624; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
1625; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
1626; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
1627; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
1628; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
1629; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
1630; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
1631; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
1632; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1633; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
1634; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1635; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_3
1636; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1637; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1638; GCN-IR-NEXT:  .LBB11_5: ; %Flow3
1639; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1640; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
1641; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
1642; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v6
1643; GCN-IR-NEXT:  .LBB11_6: ; %Flow4
1644; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1645; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
1646; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v3
1647; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v3
1648; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v3
1649; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1650; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1651; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
1652; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
1653; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1654  %result = srem i64 24, %x
1655  ret i64 %result
1656}
1657
; Test: 64-bit signed remainder with a power-of-two constant numerator
; (32768, shown as 0x8000 in the checks) and a variable denominator %x, in a
; plain (non-amdgpu_kernel) function that returns the result.
; The GCN checks cover the v_rcp_f32-based expansion, where the constant
; numerator shows up as a single v_lshrrev_b32 by 17. The GCN-IR checks (the
; RUN line that adds -amdgpu-codegenprepare-expand-div64) cover the expansion
; that contains an explicit %udiv-do-while loop.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
1658define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
1659; GCN-LABEL: v_test_srem_pow2_k_num_i64:
1660; GCN:       ; %bb.0:
1661; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1662; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1663; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1664; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
1665; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
1666; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
1667; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
1668; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
1669; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
1670; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
1671; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
1672; GCN-NEXT:    v_rcp_f32_e32 v2, v2
1673; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
1674; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
1675; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1676; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
1677; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1678; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1679; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
1680; GCN-NEXT:    v_mul_lo_u32 v7, v4, v3
1681; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
1682; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
1683; GCN-NEXT:    v_mul_lo_u32 v7, v4, v2
1684; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
1685; GCN-NEXT:    v_mul_lo_u32 v8, v2, v6
1686; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
1687; GCN-NEXT:    v_mul_hi_u32 v10, v2, v6
1688; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
1689; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
1690; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
1691; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
1692; GCN-NEXT:    v_mul_lo_u32 v10, v3, v7
1693; GCN-NEXT:    v_mul_hi_u32 v7, v3, v7
1694; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
1695; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
1696; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v11, vcc
1697; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1698; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1699; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
1700; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
1701; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
1702; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
1703; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
1704; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
1705; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1706; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
1707; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
1708; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
1709; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
1710; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
1711; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
1712; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
1713; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
1714; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
1715; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
1716; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
1717; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
1718; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
1719; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1720; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
1721; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1722; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v5, vcc
1723; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
1724; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
1725; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
1726; GCN-NEXT:    v_mul_lo_u32 v2, v0, v2
1727; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1728; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
1729; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0x8000, v2
1730; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
1731; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
1732; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
1733; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v1
1734; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
1735; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v5, v0
1736; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
1737; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
1738; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
1739; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
1740; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
1741; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
1742; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
1743; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
1744; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
1745; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
1746; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1747; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1748; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
1749; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
1750; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
1751; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1752; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
1753; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
1754; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
1755; GCN-NEXT:    s_setpc_b64 s[30:31]
1756;
1757; GCN-IR-LABEL: v_test_srem_pow2_k_num_i64:
1758; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1759; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1760; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1761; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
1762; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
1763; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1764; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
1765; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
1766; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
1767; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
1768; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
1769; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
1770; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
1771; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
1772; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
1773; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
1774; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
1775; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
1776; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
1777; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
1778; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
1779; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
1780; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
1781; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
1782; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1783; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
1784; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1785; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
1786; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
1787; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
1788; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
1789; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
1790; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
1791; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1792; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1793; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1794; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
1795; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1796; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
1797; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
1798; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
1799; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
1800; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
1801; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
1802; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
1803; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1804; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
1805; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
1806; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
1807; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1808; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
1809; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
1810; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
1811; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1812; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
1813; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
1814; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
1815; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
1816; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
1817; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
1818; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
1819; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
1820; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
1821; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
1822; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
1823; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
1824; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
1825; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
1826; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1827; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
1828; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1829; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
1830; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1831; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1832; GCN-IR-NEXT:  .LBB12_5: ; %Flow3
1833; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1834; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
1835; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
1836; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
1837; GCN-IR-NEXT:  .LBB12_6: ; %Flow4
1838; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1839; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
1840; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
1841; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
1842; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
1843; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1844; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1845; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
1846; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
1847; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1848  %result = srem i64 32768, %x
1849  ret i64 %result
1850}
1851
; Test: 64-bit signed remainder of a variable numerator %x by a power-of-two
; constant denominator (32768), in a plain (non-amdgpu_kernel) function that
; returns the result.
; The GCN checks cover a short straight-line shift/add/mask/subtract sequence
; with no loop. The GCN-IR checks (the RUN line that adds
; -amdgpu-codegenprepare-expand-div64) still contain the %udiv-do-while loop
; for this input.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
1852define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
1853; GCN-LABEL: v_test_srem_pow2_k_den_i64:
1854; GCN:       ; %bb.0:
1855; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1856; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1857; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
1858; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
1859; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1860; GCN-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
1861; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1862; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
1863; GCN-NEXT:    s_setpc_b64 s[30:31]
1864;
1865; GCN-IR-LABEL: v_test_srem_pow2_k_den_i64:
1866; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
1867; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1868; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1869; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
1870; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
1871; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1872; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
1873; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v0
1874; GCN-IR-NEXT:    v_add_i32_e64 v3, s[4:5], 32, v3
1875; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v1
1876; GCN-IR-NEXT:    v_min_u32_e32 v8, v3, v4
1877; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 48, v8
1878; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
1879; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
1880; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
1881; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
1882; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1883; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
1884; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
1885; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
1886; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
1887; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
1888; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1889; GCN-IR-NEXT:    s_cbranch_execz .LBB13_6
1890; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
1891; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
1892; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
1893; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
1894; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
1895; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
1896; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
1897; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
1898; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1899; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
1900; GCN-IR-NEXT:    s_cbranch_execz .LBB13_5
1901; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
1902; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v9
1903; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffcf, v8
1904; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
1905; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, -1, vcc
1906; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
1907; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
1908; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
1909; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
1910; GCN-IR-NEXT:  .LBB13_3: ; %udiv-do-while
1911; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
1912; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
1913; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
1914; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
1915; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, s12, v10
1916; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
1917; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
1918; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
1919; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
1920; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
1921; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
1922; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
1923; GCN-IR-NEXT:    v_and_b32_e32 v12, 0x8000, v12
1924; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1925; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
1926; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
1927; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
1928; GCN-IR-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5]
1929; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
1930; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
1931; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
1932; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_3
1933; GCN-IR-NEXT:  ; %bb.4: ; %Flow
1934; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
1935; GCN-IR-NEXT:  .LBB13_5: ; %Flow3
1936; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
1937; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
1938; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
1939; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
1940; GCN-IR-NEXT:  .LBB13_6: ; %Flow4
1941; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
1942; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], 15
1943; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
1944; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
1945; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
1946; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
1947; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1948; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
1949; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
1950  %result = srem i64 %x, 32768
1951  ret i64 %result
1952}
1953
; Test: amdgpu_kernel computing srem of the constant 24 by (%x ashr 40) and
; storing the i64 result to %out. The arithmetic shift by 40 leaves at most
; 24 significant bits of the 64-bit input.
; Both the GCN and GCN-IR checks cover a loop-free float-based sequence
; (v_cvt_f32_i32, v_rcp_iflag_f32, v_mad_f32) that ends by sign-extending the
; low 24 bits with v_bfe_i32. The two sequences appear to differ only in the
; source-operand order of the v_add_i32.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
1954define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
1955; GCN-LABEL: s_test_srem24_k_num_i64:
1956; GCN:       ; %bb.0:
1957; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1958; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1959; GCN-NEXT:    s_ashr_i64 s[4:5], s[2:3], 40
1960; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
1961; GCN-NEXT:    s_mov_b32 s5, 0x41c00000
1962; GCN-NEXT:    s_ashr_i32 s6, s4, 30
1963; GCN-NEXT:    s_or_b32 s6, s6, 1
1964; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1965; GCN-NEXT:    v_mov_b32_e32 v3, s6
1966; GCN-NEXT:    s_mov_b32 s3, 0xf000
1967; GCN-NEXT:    s_mov_b32 s2, -1
1968; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
1969; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1970; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s5
1971; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
1972; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
1973; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1974; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1975; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
1976; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
1977; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
1978; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1979; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1980; GCN-NEXT:    s_endpgm
1981;
1982; GCN-IR-LABEL: s_test_srem24_k_num_i64:
1983; GCN-IR:       ; %bb.0:
1984; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1985; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
1986; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[2:3], 40
1987; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
1988; GCN-IR-NEXT:    s_mov_b32 s5, 0x41c00000
1989; GCN-IR-NEXT:    s_ashr_i32 s6, s4, 30
1990; GCN-IR-NEXT:    s_or_b32 s6, s6, 1
1991; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1992; GCN-IR-NEXT:    v_mov_b32_e32 v3, s6
1993; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
1994; GCN-IR-NEXT:    s_mov_b32 s2, -1
1995; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
1996; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
1997; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s5
1998; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
1999; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
2000; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2001; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2002; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
2003; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
2004; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
2005; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2006; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2007; GCN-IR-NEXT:    s_endpgm
2008  %x.shr = ashr i64 %x, 40
2009  %result = srem i64 24, %x.shr
2010  store i64 %result, i64 addrspace(1)* %out
2011  ret void
2012}
2013
; Test: amdgpu_kernel computing srem of (%x ashr 40) by the constant 23423
; (0x5b7f in the checks) and storing the i64 result to %out. The arithmetic
; shift by 40 leaves at most 24 significant bits of the 64-bit input.
; Both the GCN and GCN-IR checks cover the same loop-free float-based
; sequence: the numerator is multiplied by a float constant (no v_rcp is
; emitted), corrected via v_mad_f32/v_cmp_ge_f32, and the remainder is
; sign-extended from 24 bits with v_bfe_i32.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
2014define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
2015; GCN-LABEL: s_test_srem24_k_den_i64:
2016; GCN:       ; %bb.0:
2017; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2018; GCN-NEXT:    s_mov_b32 s4, 0x46b6fe00
2019; GCN-NEXT:    s_mov_b32 s7, 0xf000
2020; GCN-NEXT:    s_mov_b32 s6, -1
2021; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2022; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
2023; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
2024; GCN-NEXT:    s_ashr_i32 s3, s2, 30
2025; GCN-NEXT:    s_or_b32 s3, s3, 1
2026; GCN-NEXT:    v_mov_b32_e32 v1, s3
2027; GCN-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
2028; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2029; GCN-NEXT:    v_mad_f32 v0, -v2, s4, v0
2030; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2031; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
2032; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2033; GCN-NEXT:    s_movk_i32 s3, 0x5b7f
2034; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2035; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
2036; GCN-NEXT:    s_mov_b32 s4, s0
2037; GCN-NEXT:    s_mov_b32 s5, s1
2038; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2039; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
2040; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2041; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2042; GCN-NEXT:    s_endpgm
2043;
2044; GCN-IR-LABEL: s_test_srem24_k_den_i64:
2045; GCN-IR:       ; %bb.0:
2046; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2047; GCN-IR-NEXT:    s_mov_b32 s4, 0x46b6fe00
2048; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
2049; GCN-IR-NEXT:    s_mov_b32 s6, -1
2050; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
2051; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
2052; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
2053; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 30
2054; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
2055; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
2056; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
2057; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
2058; GCN-IR-NEXT:    v_mad_f32 v0, -v2, s4, v0
2059; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
2060; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
2061; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2062; GCN-IR-NEXT:    s_movk_i32 s3, 0x5b7f
2063; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2064; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s3
2065; GCN-IR-NEXT:    s_mov_b32 s4, s0
2066; GCN-IR-NEXT:    s_mov_b32 s5, s1
2067; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2068; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
2069; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2070; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2071; GCN-IR-NEXT:    s_endpgm
2072  %x.shr = ashr i64 %x, 40
2073  %result = srem i64 %x.shr, 23423
2074  store i64 %result, i64 addrspace(1)* %out
2075  ret void
2076}
2077
; Test: plain (non-amdgpu_kernel) function returning srem of the constant 24
; by (%x ashr 40). The arithmetic shift by 40 leaves at most 24 significant
; bits of the 64-bit input.
; Both the GCN and GCN-IR checks cover the same loop-free float-based
; sequence (v_cvt_f32_i32, v_rcp_iflag_f32, v_mad_f32) that ends by
; sign-extending the low 24 bits with v_bfe_i32.
; All check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
2078define i64 @v_test_srem24_k_num_i64(i64 %x) {
2079; GCN-LABEL: v_test_srem24_k_num_i64:
2080; GCN:       ; %bb.0:
2081; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2082; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2083; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
2084; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
2085; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
2086; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
2087; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
2088; GCN-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
2089; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2090; GCN-NEXT:    v_mad_f32 v4, -v2, v1, s4
2091; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2092; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v1|
2093; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
2094; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2095; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
2096; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
2097; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
2098; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2099; GCN-NEXT:    s_setpc_b64 s[30:31]
2100;
2101; GCN-IR-LABEL: v_test_srem24_k_num_i64:
2102; GCN-IR:       ; %bb.0:
2103; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2105; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
2106; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
2107; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
2108; GCN-IR-NEXT:    v_or_b32_e32 v3, 1, v3
2109; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
2110; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
2111; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
2112; GCN-IR-NEXT:    v_mad_f32 v4, -v2, v1, s4
2113; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
2114; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v1|
2115; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
2116; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2117; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
2118; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
2119; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
2120; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2121; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
2122  %x.shr = ashr i64 %x, 40
2123  %result = srem i64 24, %x.shr
2124  ret i64 %result
2125}
2126
; srem of a constant power-of-two numerator (32768 = 2^15) by a variable
; denominator. The denominator is %x arithmetically shifted right by 40, so
; it fits in 24 signed bits. Both check prefixes (GCN and GCN-IR) expect the
; same float-reciprocal based sequence as for a non-power-of-two numerator;
; only the constants differ: 0x47000000 is the f32 bit pattern of 32768.0
; and 0x8000 is the integer numerator used in the final subtract.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
2127define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
2128; GCN-LABEL: v_test_srem24_pow2_k_num_i64:
2129; GCN:       ; %bb.0:
2130; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2131; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2132; GCN-NEXT:    s_mov_b32 s4, 0x47000000
2133; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
2134; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
2135; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
2136; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v1
2137; GCN-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
2138; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2139; GCN-NEXT:    v_mad_f32 v4, -v2, v1, s4
2140; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2141; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v1|
2142; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
2143; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2144; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
2145; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
2146; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
2147; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2148; GCN-NEXT:    s_setpc_b64 s[30:31]
2149;
2150; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64:
2151; GCN-IR:       ; %bb.0:
2152; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2153; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2154; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
2155; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
2156; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 30, v0
2157; GCN-IR-NEXT:    v_or_b32_e32 v3, 1, v3
2158; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
2159; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
2160; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
2161; GCN-IR-NEXT:    v_mad_f32 v4, -v2, v1, s4
2162; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
2163; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v1|
2164; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
2165; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2166; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
2167; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
2168; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
2169; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2170; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
2171  %x.shr = ashr i64 %x, 40
2172  %result = srem i64 32768, %x.shr
2173  ret i64 %result
2174}
2175
; srem of a variable numerator by a constant power-of-two denominator
; (32768 = 2^15). The numerator is %x arithmetically shifted right by 40, so
; it fits in 24 signed bits.
; The two check prefixes expect different code here:
;  - GCN expects an integer-only sequence (shift right by 17, add, mask with
;    0xffff8000, subtract) with no float conversion.
;  - GCN-IR (the RUN line that adds -amdgpu-codegenprepare-expand-div64)
;    expects the float-based 24-bit sequence: the numerator is multiplied by
;    0x38000000 (f32 bit pattern of 2^-15) instead of by a computed
;    reciprocal, the quotient is scaled back with a left shift by 15, and the
;    result is sign-extended from 24 bits.
; 0x47000000 is the f32 bit pattern of 32768.0.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
2176define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
2177; GCN-LABEL: v_test_srem24_pow2_k_den_i64:
2178; GCN:       ; %bb.0:
2179; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2180; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2181; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v1
2182; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
2183; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2184; GCN-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
2185; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2186; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
2187; GCN-NEXT:    s_setpc_b64 s[30:31]
2188;
2189; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64:
2190; GCN-IR:       ; %bb.0:
2191; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2192; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
2193; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
2194; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
2195; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 30, v0
2196; GCN-IR-NEXT:    v_or_b32_e32 v2, 1, v2
2197; GCN-IR-NEXT:    v_mul_f32_e32 v3, 0x38000000, v1
2198; GCN-IR-NEXT:    v_trunc_f32_e32 v3, v3
2199; GCN-IR-NEXT:    v_mad_f32 v1, -v3, s4, v1
2200; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v3, v3
2201; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, s4
2202; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
2203; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
2204; GCN-IR-NEXT:    v_lshlrev_b32_e32 v1, 15, v1
2205; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
2206; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
2207; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
2208; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
2209  %x.shr = ashr i64 %x, 40
2210  %result = srem i64 %x.shr, 32768
2211  ret i64 %result
2212}
2213