; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Unsigned 64-bit multiply-with-overflow, both operands in VGPRs.
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v4, v0
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s0, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], s0, v5, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], s0, v5, v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %umulo
}

; Signed 64-bit multiply-with-overflow, both operands in VGPRs.
define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v7, v6
; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
; GFX9-NEXT:    v_mov_b32_e32 v5, v4
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v4, v0
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s0, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], s0, v5, v2, 0
; GFX11-NEXT:    v_mad_i64_i32 v[11:12], s0, v5, v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %smulo
}

; Unsigned 64-bit multiply-with-overflow with uniform (SGPR) kernel arguments;
; the product is stored (or zeroed on overflow) rather than returned.
define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s1, s1, s3
; SI-NEXT:    s_mul_i32 s0, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: umulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s6
; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s10, 0
; GFX9-NEXT:    s_mul_i32 s1, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s1
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_mul_i32 s6, s1, s2
; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX10-NEXT:    s_mul_i32 s1, s1, s3
; GFX10-NEXT:    s_add_u32 s3, s8, s7
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_u32 s3, s3, s6
; GFX10-NEXT:    s_addc_u32 s3, s5, s4
; GFX10-NEXT:    s_addc_u32 s5, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s3, s1
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s6
; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: umulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_mul_i32 s6, s1, s2
; GFX11-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX11-NEXT:    s_mul_i32 s1, s1, s3
; GFX11-NEXT:    s_add_u32 s3, s8, s7
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_u32 s3, s3, s6
; GFX11-NEXT:    s_addc_u32 s3, s5, s4
; GFX11-NEXT:    s_addc_u32 s5, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s3, s1
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s6
; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

; Signed 64-bit multiply-with-overflow with uniform (SGPR) kernel arguments;
; the product is stored (or zeroed on overflow) rather than returned.
define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s6, s1, s3
; SI-NEXT:    s_cmp_lt_i32 s1, 0
; SI-NEXT:    s_mul_i32 s1, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: smulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s5, s1, s2
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s5
; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s6, s4
; GFX9-NEXT:    s_addc_u32 s6, s10, 0
; GFX9-NEXT:    s_mul_i32 s9, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s9
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_sub_u32 s9, s4, s2
; GFX9-NEXT:    s_subb_u32 s10, s6, 0
; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s5
; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_mul_i32 s0, s0, s2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX10-NEXT:    s_mul_i32 s5, s1, s2
; GFX10-NEXT:    s_add_u32 s11, s8, s7
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX10-NEXT:    s_add_u32 s11, s11, s5
; GFX10-NEXT:    s_mul_i32 s10, s1, s3
; GFX10-NEXT:    s_addc_u32 s4, s6, s4
; GFX10-NEXT:    s_addc_u32 s6, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s4, s10
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_sub_u32 s9, s4, s2
; GFX10-NEXT:    s_subb_u32 s10, s6, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s9
; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s10
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s5
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: smulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX11-NEXT:    s_mul_i32 s5, s1, s2
; GFX11-NEXT:    s_add_u32 s11, s8, s7
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX11-NEXT:    s_add_u32 s11, s11, s5
; GFX11-NEXT:    s_mul_i32 s10, s1, s3
; GFX11-NEXT:    s_addc_u32 s4, s6, s4
; GFX11-NEXT:    s_addc_u32 s6, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s4, s10
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_sub_u32 s9, s4, s2
; GFX11-NEXT:    s_subb_u32 s10, s6, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, s9
; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
; GFX11-NEXT:    v_mov_b32_e32 v0, s10
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s5
; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
; GFX11-NEXT:    s_mov_b32 s5, s4
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

; Signed multiply-with-overflow by the constant 4 (power of two): expected to
; lower to a shift plus a shift-back comparison instead of a full multiply.
define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v5
; SI-NEXT:    v_mov_b32_e32 v1, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v4
; GFX11-NEXT:    v_mov_b32_e32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

; Unsigned multiply-with-overflow by the constant 4 (power of two): expected to
; lower to a shift plus a mask-based comparison instead of a full multiply.
define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; SI-NEXT:    v_mov_b32_e32 v6, v0
; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX11-NEXT:    v_mov_b32_e32 v6, v0
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v4
; GFX11-NEXT:    v_mov_b32_e32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

; Overflow-checking multiply intrinsics exercised by the tests above.
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)