; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Checks lowering of llvm.umul.with.overflow.i64 with two variable (VGPR)
; operands across SI/GFX9/GFX10/GFX11. CHECK bodies are autogenerated; only
; the garbled line-number prefixes have been stripped so lit/FileCheck can
; recognize the directives again.
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v5, v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %umulo
}
; Checks lowering of llvm.smul.with.overflow.i64 with two variable (VGPR)
; operands; the signed overflow check subtracts the operands back out under
; sign-based cndmasks before comparing against the sign-extended low half.
; CHECK bodies are autogenerated; only the garbled line-number prefixes have
; been stripped.
define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v7, v6
; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
; GFX9-NEXT:    v_mov_b32_e32 v5, v4
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT:    v_mad_i64_i32 v[11:12], null, v5, v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %smulo
}
; Checks unsigned multiply-with-overflow where both operands are uniform
; (SGPR) kernel arguments, with the overflow flag feeding a select that zeroes
; the stored product. CHECK bodies are autogenerated; only the garbled
; line-number prefixes have been stripped.
define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s1, s1, s3
; SI-NEXT:    s_mul_i32 s0, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: umulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s6
; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s10, 0
; GFX9-NEXT:    s_mul_i32 s1, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s1
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_mul_i32 s6, s1, s2
; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX10-NEXT:    s_mul_i32 s1, s1, s3
; GFX10-NEXT:    s_add_u32 s3, s8, s7
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_u32 s3, s3, s6
; GFX10-NEXT:    s_addc_u32 s3, s5, s4
; GFX10-NEXT:    s_addc_u32 s5, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s3, s1
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s6
; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: umulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_mul_i32 s6, s1, s2
; GFX11-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX11-NEXT:    s_mul_i32 s1, s1, s3
; GFX11-NEXT:    s_add_u32 s3, s8, s7
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_u32 s3, s3, s6
; GFX11-NEXT:    s_addc_u32 s3, s5, s4
; GFX11-NEXT:    s_addc_u32 s5, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s3, s1
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s6
; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}
; Checks signed multiply-with-overflow with uniform (SGPR) kernel arguments;
; the overflow flag zeroes the stored product via selects. CHECK bodies are
; autogenerated; only the garbled line-number prefixes have been stripped.
define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s6, s1, s3
; SI-NEXT:    s_cmp_lt_i32 s1, 0
; SI-NEXT:    s_mul_i32 s1, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: smulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s5, s1, s2
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s5
; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s6, s4
; GFX9-NEXT:    s_addc_u32 s6, s10, 0
; GFX9-NEXT:    s_mul_i32 s9, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s9
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_sub_u32 s9, s4, s2
; GFX9-NEXT:    s_subb_u32 s10, s6, 0
; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s5
; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_mul_i32 s0, s0, s2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX10-NEXT:    s_mul_i32 s5, s1, s2
; GFX10-NEXT:    s_add_u32 s11, s8, s7
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX10-NEXT:    s_add_u32 s11, s11, s5
; GFX10-NEXT:    s_mul_i32 s10, s1, s3
; GFX10-NEXT:    s_addc_u32 s4, s6, s4
; GFX10-NEXT:    s_addc_u32 s6, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s4, s10
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_sub_u32 s9, s4, s2
; GFX10-NEXT:    s_subb_u32 s10, s6, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s9
; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s10
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s5
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: smulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX11-NEXT:    s_mul_i32 s5, s1, s2
; GFX11-NEXT:    s_add_u32 s11, s8, s7
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX11-NEXT:    s_add_u32 s11, s11, s5
; GFX11-NEXT:    s_mul_i32 s10, s1, s3
; GFX11-NEXT:    s_addc_u32 s4, s6, s4
; GFX11-NEXT:    s_addc_u32 s6, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s4, s10
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_sub_u32 s9, s4, s2
; GFX11-NEXT:    s_subb_u32 s10, s6, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10
; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s5
; GFX11-NEXT:    v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3
; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s5, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}
; Checks that signed multiply-with-overflow by the constant 4 is lowered as a
; shift, with the overflow bit derived from shifting back and comparing.
; CHECK bodies are autogenerated; only the garbled line-number prefixes have
; been stripped.
define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v5
; SI-NEXT:    v_mov_b32_e32 v1, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}
; Checks that unsigned multiply-with-overflow by the constant 4 is lowered as
; a shift, with overflow detected from the top two bits via a mask compare.
; CHECK bodies are autogenerated; only the garbled line-number prefixes have
; been stripped.
define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; SI-NEXT:    v_mov_b32_e32 v6, v0
; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
