1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
6
; umulo_i64_v_v: returns the { i64, i1 } pair produced by
; llvm.umul.with.overflow.i64 for the two i64 arguments %x and %y.
; In the expected code the operands are read from v0-v3 and the i1 is
; materialized into v2 by a v_cndmask_b32 fed from a v_cmp_ne_u64 against 0.
; The SI checks build the product from v_mul_lo_u32 / v_mul_hi_u32, while the
; GFX9, GFX10 and GFX11 checks use v_mad_u64_u32.
7define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
8; SI-LABEL: umulo_i64_v_v:
9; SI:       ; %bb.0: ; %bb
10; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
12; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
13; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
14; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
15; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
16; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
17; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
18; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
19; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
20; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
21; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
22; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
23; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
24; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
25; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
26; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
27; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
28; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
29; SI-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX9-LABEL: umulo_i64_v_v:
32; GFX9:       ; %bb.0: ; %bb
33; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX9-NEXT:    v_mov_b32_e32 v5, v0
35; GFX9-NEXT:    v_mov_b32_e32 v4, v1
36; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
37; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
38; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
39; GFX9-NEXT:    v_mov_b32_e32 v10, v1
40; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
41; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
42; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
43; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
44; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
45; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
46; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
47; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
48; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
49; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
50; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
51; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
52; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
53; GFX9-NEXT:    s_setpc_b64 s[30:31]
54;
55; GFX10-LABEL: umulo_i64_v_v:
56; GFX10:       ; %bb.0: ; %bb
57; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
59; GFX10-NEXT:    v_mov_b32_e32 v4, v0
60; GFX10-NEXT:    v_mov_b32_e32 v5, v1
61; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
62; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
63; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
64; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
65; GFX10-NEXT:    v_mov_b32_e32 v8, v1
66; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
67; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
68; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
69; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
70; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
71; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
72; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
73; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
74; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
75; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
76; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
77; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
78; GFX10-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX11-LABEL: umulo_i64_v_v:
81; GFX11:       ; %bb.0: ; %bb
82; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
84; GFX11-NEXT:    v_mov_b32_e32 v4, v0
85; GFX11-NEXT:    v_mov_b32_e32 v5, v1
86; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
87; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
88; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
89; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
90; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v5, v3, 0
91; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
92; GFX11-NEXT:    v_mov_b32_e32 v8, v1
93; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
94; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
95; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
96; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
97; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
98; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
99; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
100; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
101; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
102; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
103; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
104; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
105; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
106; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
107; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
108; GFX11-NEXT:    s_setpc_b64 s[30:31]
109bb:
110  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
111  ret { i64, i1 } %umulo
112}
113
; smulo_i64_v_v: returns the { i64, i1 } pair produced by
; llvm.smul.with.overflow.i64 for the two i64 arguments %x and %y.
; Compared with the unsigned test, the expected code uses a signed high
; product (v_mul_hi_i32 in the SI checks, v_mad_i64_i32 in the GFX9, GFX10 and
; GFX11 checks) and the final v_cmp_ne_u64 compares against a register pair
; filled from a v_ashrrev_i32 by 31 instead of against 0.
114define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
115; SI-LABEL: smulo_i64_v_v:
116; SI:       ; %bb.0: ; %bb
117; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
119; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
120; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
121; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
122; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
123; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
124; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
125; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
126; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
127; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
128; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
129; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
130; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
131; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
132; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
133; SI-NEXT:    v_mov_b32_e32 v7, v6
134; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
135; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
136; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
137; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
138; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
139; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
140; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
141; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
142; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
143; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
144; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
145; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
146; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
147; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
148; SI-NEXT:    v_mov_b32_e32 v0, v4
149; SI-NEXT:    v_mov_b32_e32 v1, v5
150; SI-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX9-LABEL: smulo_i64_v_v:
153; GFX9:       ; %bb.0: ; %bb
154; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX9-NEXT:    v_mov_b32_e32 v5, v0
156; GFX9-NEXT:    v_mov_b32_e32 v4, v1
157; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
158; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
159; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
160; GFX9-NEXT:    v_mov_b32_e32 v10, v1
161; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
162; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
163; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
164; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
165; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
166; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
167; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
168; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
169; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
170; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
171; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
172; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
173; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
174; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
175; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
176; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
177; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
178; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
179; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
180; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
181; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
182; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
183; GFX9-NEXT:    v_mov_b32_e32 v5, v4
184; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
185; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
186; GFX9-NEXT:    s_setpc_b64 s[30:31]
187;
188; GFX10-LABEL: smulo_i64_v_v:
189; GFX10:       ; %bb.0: ; %bb
190; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
192; GFX10-NEXT:    v_mov_b32_e32 v4, v0
193; GFX10-NEXT:    v_mov_b32_e32 v5, v1
194; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
195; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
196; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
197; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
198; GFX10-NEXT:    v_mov_b32_e32 v8, v1
199; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
200; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
201; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
202; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
203; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
204; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
205; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
206; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
207; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
208; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
209; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
210; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
211; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
212; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
213; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
214; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
215; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
216; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
217; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
218; GFX10-NEXT:    v_mov_b32_e32 v3, v2
219; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
220; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
221; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
222; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
223; GFX10-NEXT:    s_setpc_b64 s[30:31]
224;
225; GFX11-LABEL: smulo_i64_v_v:
226; GFX11:       ; %bb.0: ; %bb
227; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
229; GFX11-NEXT:    v_mov_b32_e32 v4, v0
230; GFX11-NEXT:    v_mov_b32_e32 v5, v1
231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
232; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
233; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
234; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
235; GFX11-NEXT:    v_mad_i64_i32 v[11:12], null, v5, v3, 0
236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
237; GFX11-NEXT:    v_mov_b32_e32 v8, v1
238; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
239; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
240; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
241; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
242; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
243; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
244; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
245; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
246; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
247; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
248; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
250; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
251; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
252; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
253; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
254; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
255; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
256; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
257; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
258; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
259; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
260; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
261; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
262; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
263; GFX11-NEXT:    v_mov_b32_e32 v3, v2
264; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
265; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
266; GFX11-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
267; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
268; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
269; GFX11-NEXT:    s_setpc_b64 s[30:31]
270bb:
271  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
272  ret { i64, i1 } %smulo
273}
274
; umulo_i64_s: amdgpu_kernel variant of the unsigned overflow multiply.
; The kernel calls llvm.umul.with.overflow.i64 on %x and %y, selects 0 when the
; overflow bit is set and the product otherwise, and stores the selected i64
; through an undef addrspace(1) pointer.
; The expected code fetches both arguments with a single s_load_dwordx4
; (s_load_b128 in the GFX11 checks). The GFX9, GFX10 and GFX11 checks multiply
; with s_mul_i32 / s_mul_hi_u32, while the SI checks pair s_mul_i32 with
; v_mul_hi_u32.
275define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
276; SI-LABEL: umulo_i64_s:
277; SI:       ; %bb.0: ; %bb
278; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
279; SI-NEXT:    s_mov_b32 s7, 0xf000
280; SI-NEXT:    s_waitcnt lgkmcnt(0)
281; SI-NEXT:    v_mov_b32_e32 v0, s2
282; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
283; SI-NEXT:    s_mul_i32 s4, s1, s2
284; SI-NEXT:    v_mov_b32_e32 v2, s3
285; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
286; SI-NEXT:    s_mul_i32 s5, s0, s3
287; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
288; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
289; SI-NEXT:    s_mul_i32 s1, s1, s3
290; SI-NEXT:    s_mul_i32 s0, s0, s2
291; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
292; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
293; SI-NEXT:    v_mov_b32_e32 v5, s0
294; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
295; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
296; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
297; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
298; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
299; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
300; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
301; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
302; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
303; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
304; SI-NEXT:    s_mov_b32 s6, -1
305; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
306; SI-NEXT:    s_endpgm
307;
308; GFX9-LABEL: umulo_i64_s:
309; GFX9:       ; %bb.0: ; %bb
310; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
311; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX9-NEXT:    s_mul_i32 s7, s0, s3
313; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
314; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
315; GFX9-NEXT:    s_add_u32 s9, s8, s7
316; GFX9-NEXT:    s_mul_i32 s6, s1, s2
317; GFX9-NEXT:    s_addc_u32 s5, 0, s5
318; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
319; GFX9-NEXT:    s_add_u32 s9, s9, s6
320; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
321; GFX9-NEXT:    s_addc_u32 s4, s5, s4
322; GFX9-NEXT:    s_addc_u32 s5, s10, 0
323; GFX9-NEXT:    s_mul_i32 s1, s1, s3
324; GFX9-NEXT:    s_add_u32 s4, s4, s1
325; GFX9-NEXT:    s_addc_u32 s5, 0, s5
326; GFX9-NEXT:    s_add_i32 s1, s8, s7
327; GFX9-NEXT:    s_add_i32 s1, s1, s6
328; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
329; GFX9-NEXT:    s_mul_i32 s2, s0, s2
330; GFX9-NEXT:    v_mov_b32_e32 v0, s1
331; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
332; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
333; GFX9-NEXT:    v_mov_b32_e32 v0, s2
334; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
335; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
336; GFX9-NEXT:    s_endpgm
337;
338; GFX10-LABEL: umulo_i64_s:
339; GFX10:       ; %bb.0: ; %bb
340; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
341; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX10-NEXT:    s_mul_i32 s7, s0, s3
343; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
344; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
345; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
346; GFX10-NEXT:    s_mul_i32 s6, s1, s2
347; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
348; GFX10-NEXT:    s_mul_i32 s1, s1, s3
349; GFX10-NEXT:    s_add_u32 s3, s8, s7
350; GFX10-NEXT:    s_addc_u32 s5, 0, s5
351; GFX10-NEXT:    s_add_u32 s3, s3, s6
352; GFX10-NEXT:    s_addc_u32 s3, s5, s4
353; GFX10-NEXT:    s_addc_u32 s5, s9, 0
354; GFX10-NEXT:    s_add_u32 s4, s3, s1
355; GFX10-NEXT:    s_addc_u32 s5, 0, s5
356; GFX10-NEXT:    s_add_i32 s1, s8, s7
357; GFX10-NEXT:    s_mul_i32 s0, s0, s2
358; GFX10-NEXT:    s_add_i32 s1, s1, s6
359; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
360; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
361; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
362; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
363; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
364; GFX10-NEXT:    s_endpgm
365;
366; GFX11-LABEL: umulo_i64_s:
367; GFX11:       ; %bb.0: ; %bb
368; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
369; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX11-NEXT:    s_mul_i32 s7, s0, s3
371; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
372; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
373; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
374; GFX11-NEXT:    s_mul_i32 s6, s1, s2
375; GFX11-NEXT:    s_mul_hi_u32 s9, s1, s3
376; GFX11-NEXT:    s_mul_i32 s1, s1, s3
377; GFX11-NEXT:    s_add_u32 s3, s8, s7
378; GFX11-NEXT:    s_addc_u32 s5, 0, s5
379; GFX11-NEXT:    s_add_u32 s3, s3, s6
380; GFX11-NEXT:    s_addc_u32 s3, s5, s4
381; GFX11-NEXT:    s_addc_u32 s5, s9, 0
382; GFX11-NEXT:    s_add_u32 s4, s3, s1
383; GFX11-NEXT:    s_addc_u32 s5, 0, s5
384; GFX11-NEXT:    s_add_i32 s1, s8, s7
385; GFX11-NEXT:    s_mul_i32 s0, s0, s2
386; GFX11-NEXT:    s_add_i32 s1, s1, s6
387; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
388; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
389; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
390; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
391; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
392; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
393; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
394; GFX11-NEXT:    s_endpgm
395bb:
396  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
397  %mul = extractvalue { i64, i1 } %umulo, 0
398  %overflow = extractvalue { i64, i1 } %umulo, 1
  ; Both struct fields feed the select, so the product and the overflow bit
  ; each contribute to the stored value.
399  %res = select i1 %overflow, i64 0, i64 %mul
400  store i64 %res, i64 addrspace(1)* undef
401  ret void
402}
403
; smulo_i64_s: amdgpu_kernel variant of the signed overflow multiply.
; The kernel calls llvm.smul.with.overflow.i64 on %x and %y, selects 0 when the
; overflow bit is set and the product otherwise, and stores the selected i64
; through an undef addrspace(1) pointer.
; The expected code uses a signed high multiply (v_mul_hi_i32 in the SI checks,
; s_mul_hi_i32 in the GFX9, GFX10 and GFX11 checks) and ends the overflow test
; with a v_cmp_ne_u64 against a pair of registers derived from an arithmetic
; shift right by 31.
; NOTE(review): the call result is named %umulo even though the intrinsic is
; the smul variant; value names do not appear in the checked output.
404define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
405; SI-LABEL: smulo_i64_s:
406; SI:       ; %bb.0: ; %bb
407; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
408; SI-NEXT:    s_mov_b32 s7, 0xf000
409; SI-NEXT:    s_waitcnt lgkmcnt(0)
410; SI-NEXT:    v_mov_b32_e32 v0, s2
411; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
412; SI-NEXT:    s_mul_i32 s4, s1, s2
413; SI-NEXT:    v_mov_b32_e32 v2, s3
414; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
415; SI-NEXT:    s_mul_i32 s5, s0, s3
416; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
417; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
418; SI-NEXT:    s_mul_i32 s6, s1, s3
419; SI-NEXT:    s_cmp_lt_i32 s1, 0
420; SI-NEXT:    s_mul_i32 s1, s0, s2
421; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
422; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
423; SI-NEXT:    v_mov_b32_e32 v5, s1
424; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
425; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
426; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
427; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
428; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
429; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
430; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
431; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
432; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
433; SI-NEXT:    s_cselect_b64 vcc, -1, 0
434; SI-NEXT:    s_cmp_lt_i32 s3, 0
435; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
436; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
437; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
438; SI-NEXT:    v_mov_b32_e32 v1, v0
439; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
440; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
441; SI-NEXT:    s_cselect_b64 vcc, -1, 0
442; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
443; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
444; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
445; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
446; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
447; SI-NEXT:    s_mov_b32 s6, -1
448; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
449; SI-NEXT:    s_endpgm
450;
451; GFX9-LABEL: smulo_i64_s:
452; GFX9:       ; %bb.0: ; %bb
453; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
454; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX9-NEXT:    s_mul_i32 s7, s0, s3
456; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
457; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
458; GFX9-NEXT:    s_add_u32 s9, s8, s7
459; GFX9-NEXT:    s_mul_i32 s5, s1, s2
460; GFX9-NEXT:    s_addc_u32 s6, 0, s6
461; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
462; GFX9-NEXT:    s_add_u32 s9, s9, s5
463; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
464; GFX9-NEXT:    s_addc_u32 s4, s6, s4
465; GFX9-NEXT:    s_addc_u32 s6, s10, 0
466; GFX9-NEXT:    s_mul_i32 s9, s1, s3
467; GFX9-NEXT:    s_add_u32 s4, s4, s9
468; GFX9-NEXT:    s_addc_u32 s6, 0, s6
469; GFX9-NEXT:    s_sub_u32 s9, s4, s2
470; GFX9-NEXT:    s_subb_u32 s10, s6, 0
471; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
472; GFX9-NEXT:    v_mov_b32_e32 v0, s6
473; GFX9-NEXT:    v_mov_b32_e32 v1, s10
474; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
475; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
476; GFX9-NEXT:    v_mov_b32_e32 v1, s4
477; GFX9-NEXT:    v_mov_b32_e32 v2, s9
478; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
479; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
480; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
481; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
482; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
483; GFX9-NEXT:    s_add_i32 s1, s8, s7
484; GFX9-NEXT:    s_add_i32 s1, s1, s5
485; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
486; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
487; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
488; GFX9-NEXT:    s_mov_b32 s5, s4
489; GFX9-NEXT:    s_mul_i32 s0, s0, s2
490; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
491; GFX9-NEXT:    v_mov_b32_e32 v2, s1
492; GFX9-NEXT:    v_mov_b32_e32 v0, s0
493; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
494; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
495; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
496; GFX9-NEXT:    s_endpgm
497;
498; GFX10-LABEL: smulo_i64_s:
499; GFX10:       ; %bb.0: ; %bb
500; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
501; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX10-NEXT:    s_mul_i32 s7, s0, s3
503; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
504; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
505; GFX10-NEXT:    s_mul_i32 s5, s1, s2
506; GFX10-NEXT:    s_add_u32 s11, s8, s7
507; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
508; GFX10-NEXT:    s_addc_u32 s6, 0, s6
509; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
510; GFX10-NEXT:    s_add_u32 s11, s11, s5
511; GFX10-NEXT:    s_mul_i32 s10, s1, s3
512; GFX10-NEXT:    s_addc_u32 s4, s6, s4
513; GFX10-NEXT:    s_addc_u32 s6, s9, 0
514; GFX10-NEXT:    s_add_u32 s4, s4, s10
515; GFX10-NEXT:    s_addc_u32 s6, 0, s6
516; GFX10-NEXT:    s_sub_u32 s9, s4, s2
517; GFX10-NEXT:    s_subb_u32 s10, s6, 0
518; GFX10-NEXT:    v_mov_b32_e32 v1, s9
519; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
520; GFX10-NEXT:    v_mov_b32_e32 v0, s10
521; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
522; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
523; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
524; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
525; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
526; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
527; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
528; GFX10-NEXT:    s_add_i32 s1, s8, s7
529; GFX10-NEXT:    s_mul_i32 s0, s0, s2
530; GFX10-NEXT:    s_add_i32 s1, s1, s5
531; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
532; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
533; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
534; GFX10-NEXT:    s_mov_b32 s5, s4
535; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
536; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
537; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
538; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
539; GFX10-NEXT:    s_endpgm
540;
541; GFX11-LABEL: smulo_i64_s:
542; GFX11:       ; %bb.0: ; %bb
543; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
544; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
545; GFX11-NEXT:    s_mul_i32 s7, s0, s3
546; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
547; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s3
548; GFX11-NEXT:    s_mul_i32 s5, s1, s2
549; GFX11-NEXT:    s_add_u32 s11, s8, s7
550; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
551; GFX11-NEXT:    s_addc_u32 s6, 0, s6
552; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
553; GFX11-NEXT:    s_add_u32 s11, s11, s5
554; GFX11-NEXT:    s_mul_i32 s10, s1, s3
555; GFX11-NEXT:    s_addc_u32 s4, s6, s4
556; GFX11-NEXT:    s_addc_u32 s6, s9, 0
557; GFX11-NEXT:    s_add_u32 s4, s4, s10
558; GFX11-NEXT:    s_addc_u32 s6, 0, s6
559; GFX11-NEXT:    s_sub_u32 s9, s4, s2
560; GFX11-NEXT:    s_subb_u32 s10, s6, 0
561; GFX11-NEXT:    v_mov_b32_e32 v1, s9
562; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
563; GFX11-NEXT:    v_mov_b32_e32 v0, s10
564; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
565; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
566; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
567; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
568; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
569; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
570; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
571; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
572; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
573; GFX11-NEXT:    s_add_i32 s1, s8, s7
574; GFX11-NEXT:    s_mul_i32 s0, s0, s2
575; GFX11-NEXT:    s_add_i32 s1, s1, s5
576; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
577; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
578; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
579; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
580; GFX11-NEXT:    s_mov_b32 s5, s4
581; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
582; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
583; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
584; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
585; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
586; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
587; GFX11-NEXT:    s_endpgm
588bb:
589  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
590  %mul = extractvalue { i64, i1 } %umulo, 0
591  %overflow = extractvalue { i64, i1 } %umulo, 1
  ; Both struct fields feed the select, so the product and the overflow bit
  ; each contribute to the stored value.
592  %res = select i1 %overflow, i64 0, i64 %mul
593  store i64 %res, i64 addrspace(1)* undef
594  ret void
595}
596
; smulo_i64_v_4: returns llvm.smul.with.overflow.i64 of %i and the constant 4.
; The expected code contains no multiply instruction: the product comes from a
; 64-bit shift left by 2 (v_lshl_b64 / v_lshlrev_b64) plus a v_alignbit_b32 by
; 30 whose result is moved into v1, and the overflow bit is a v_cmp_ne_u64 of
; the original v[0:1] against the shifted value after an arithmetic shift
; right by 2.
; NOTE(review): the call result is named %umulo even though the intrinsic is
; the smul variant; value names do not appear in the checked output.
597define { i64, i1 } @smulo_i64_v_4(i64 %i) {
598; SI-LABEL: smulo_i64_v_4:
599; SI:       ; %bb.0: ; %bb
600; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
602; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
603; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
604; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
605; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
606; SI-NEXT:    v_mov_b32_e32 v0, v5
607; SI-NEXT:    v_mov_b32_e32 v1, v4
608; SI-NEXT:    s_setpc_b64 s[30:31]
609;
610; GFX9-LABEL: smulo_i64_v_4:
611; GFX9:       ; %bb.0: ; %bb
612; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
614; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
615; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
616; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
617; GFX9-NEXT:    v_mov_b32_e32 v0, v4
618; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
619; GFX9-NEXT:    v_mov_b32_e32 v1, v3
620; GFX9-NEXT:    s_setpc_b64 s[30:31]
621;
622; GFX10-LABEL: smulo_i64_v_4:
623; GFX10:       ; %bb.0: ; %bb
624; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
626; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
627; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
628; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
629; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
630; GFX10-NEXT:    v_mov_b32_e32 v0, v4
631; GFX10-NEXT:    v_mov_b32_e32 v1, v3
632; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
633; GFX10-NEXT:    s_setpc_b64 s[30:31]
634;
635; GFX11-LABEL: smulo_i64_v_4:
636; GFX11:       ; %bb.0: ; %bb
637; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
639; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
640; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
641; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
642; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
643; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
644; GFX11-NEXT:    v_mov_b32_e32 v0, v4
645; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
646; GFX11-NEXT:    v_mov_b32_e32 v1, v3
647; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
648; GFX11-NEXT:    s_setpc_b64 s[30:31]
649bb:
650  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
651  ret { i64, i1 } %umulo
652}
653
; umulo_i64_v_4: returns llvm.umul.with.overflow.i64 of %i and the constant 4.
; The expected code contains no multiply instruction: the product comes from a
; 64-bit shift left by 2 (v_lshl_b64 / v_lshlrev_b64) plus a v_alignbit_b32 by
; 30 whose result is moved into v1, and the overflow bit is a v_cmp_ne_u64 of
; the original v[0:1] against a copy whose high dword was masked with
; 0x3fffffff.
654define { i64, i1 } @umulo_i64_v_4(i64 %i) {
655; SI-LABEL: umulo_i64_v_4:
656; SI:       ; %bb.0: ; %bb
657; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
659; SI-NEXT:    v_mov_b32_e32 v6, v0
660; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
661; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
662; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
663; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
664; SI-NEXT:    v_mov_b32_e32 v0, v4
665; SI-NEXT:    v_mov_b32_e32 v1, v3
666; SI-NEXT:    s_setpc_b64 s[30:31]
667;
668; GFX9-LABEL: umulo_i64_v_4:
669; GFX9:       ; %bb.0: ; %bb
670; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
672; GFX9-NEXT:    v_mov_b32_e32 v6, v0
673; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
674; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
675; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
676; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
677; GFX9-NEXT:    v_mov_b32_e32 v0, v4
678; GFX9-NEXT:    v_mov_b32_e32 v1, v3
679; GFX9-NEXT:    s_setpc_b64 s[30:31]
680;
681; GFX10-LABEL: umulo_i64_v_4:
682; GFX10:       ; %bb.0: ; %bb
683; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
685; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
686; GFX10-NEXT:    v_mov_b32_e32 v6, v0
687; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
688; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
689; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
690; GFX10-NEXT:    v_mov_b32_e32 v0, v4
691; GFX10-NEXT:    v_mov_b32_e32 v1, v3
692; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
693; GFX10-NEXT:    s_setpc_b64 s[30:31]
694;
695; GFX11-LABEL: umulo_i64_v_4:
696; GFX11:       ; %bb.0: ; %bb
697; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
698; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
699; GFX11-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
700; GFX11-NEXT:    v_mov_b32_e32 v6, v0
701; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
702; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
703; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
704; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
705; GFX11-NEXT:    v_mov_b32_e32 v0, v4
706; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
707; GFX11-NEXT:    v_mov_b32_e32 v1, v3
708; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
709; GFX11-NEXT:    s_setpc_b64 s[30:31]
710bb:
711  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
712  ret { i64, i1 } %umulo
713}
714
; Declarations of the two overflow-checking multiply intrinsics called by the
; test functions in this file. Each takes two i64 operands and returns an
; { i64, i1 } pair.
715declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
716declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
717