; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

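; umul.with.overflow.i64 with both operands in VGPRs: the full 128-bit
; product is built out of 32-bit partial products, and the overflow bit is
; set iff the high 64 bits of the product are nonzero.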
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
; GFX9-NEXT:    v_mul_hi_u32 v10, v1, v3
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v1
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_lo_u32 v5, v0, v3
; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v2
; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v3
; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v2
; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v2
; GFX10-NEXT:    v_mul_hi_u32 v9, v1, v3
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v5
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v10, v8
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v1
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
; GFX10-NEXT:    v_add3_u32 v1, v6, v5, v8
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %umulo
}

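; Signed multiply-with-overflow, both operands in VGPRs. The unsigned high
; half is corrected by conditionally subtracting each operand when the other
; is negative; overflow is set iff the corrected high 64 bits differ from the
; sign-extension of the low 64 bits.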
define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
; SI-NEXT:    v_mov_b32_e32 v12, 0
; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v7, v6
; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
; SI-NEXT:    v_addc_u32_e32 v9, vcc, v12, v9, vcc
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
; SI-NEXT:    v_subb_u32_e32 v10, vcc, v9, v12, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_subb_u32_e32 v8, vcc, v1, v12, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
; GFX9-NEXT:    v_mul_hi_i32 v10, v1, v3
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
; GFX9-NEXT:    v_mov_b32_e32 v10, 0
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v4, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v8, v10, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v11, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v8, v0
; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v10, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v3
; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v2
; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v3
; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v2
; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v2
; GFX10-NEXT:    v_mul_hi_i32 v9, v1, v3
; GFX10-NEXT:    v_mul_lo_u32 v11, v1, v3
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v5, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v10, v8
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, v6, v2
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v1
; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v8
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v6, v0
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    v_mov_b32_e32 v5, v4
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[4:5]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %smulo
}

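; Unsigned expansion with uniform (SGPR) operands. GFX9 and GFX10 keep the
; whole multiply on the scalar unit; SI has no scalar multiply-high
; instruction, so the high partial products go through v_mul_hi_u32.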
define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s1, s1, s3
; SI-NEXT:    s_mul_i32 s0, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: umulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s6
; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s10, 0
; GFX9-NEXT:    s_mul_i32 s1, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s1
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_mul_i32 s6, s1, s2
; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX10-NEXT:    s_mul_i32 s1, s1, s3
; GFX10-NEXT:    s_add_u32 s3, s8, s7
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_u32 s3, s3, s6
; GFX10-NEXT:    s_addc_u32 s3, s5, s4
; GFX10-NEXT:    s_addc_u32 s5, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s3, s1
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s6
; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

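; Signed expansion with uniform operands. The sign fixup (conditional
; subtract of each operand) uses s_cselect_b64 on GFX9/GFX10 but needs
; v_cndmask on SI.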
define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mul_hi_u32 v2, s1, v1
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    v_mul_hi_u32 v4, s0, v3
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v1, s0, v1
; SI-NEXT:    v_mul_hi_i32 v3, s1, v3
; SI-NEXT:    s_mul_i32 s6, s1, s3
; SI-NEXT:    s_cmp_lt_i32 s1, 0
; SI-NEXT:    s_mul_i32 s1, s0, s2
; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v1
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s1
; SI-NEXT:    v_add_i32_e32 v5, vcc, s4, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v1
; SI-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v2
; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v2
; SI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: smulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s9, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s6, s10, s9
; GFX9-NEXT:    s_mul_i32 s8, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s6, s6, s8
; GFX9-NEXT:    s_mul_hi_i32 s7, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s7, 0
; GFX9-NEXT:    s_mul_i32 s6, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s6
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_sub_u32 s6, s4, s2
; GFX9-NEXT:    s_subb_u32 s7, s5, 0
; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT:    s_sub_u32 s6, s4, s0
; GFX9-NEXT:    s_subb_u32 s7, s5, 0
; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT:    s_add_i32 s1, s10, s9
; GFX9-NEXT:    s_add_i32 s1, s1, s8
; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
; GFX9-NEXT:    s_mov_b32 s7, s6
; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s9, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s10, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX10-NEXT:    s_mul_i32 s8, s1, s2
; GFX10-NEXT:    s_add_u32 s11, s10, s9
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_mul_hi_i32 s6, s1, s3
; GFX10-NEXT:    s_add_u32 s11, s11, s8
; GFX10-NEXT:    s_mul_i32 s7, s1, s3
; GFX10-NEXT:    s_addc_u32 s4, s5, s4
; GFX10-NEXT:    s_addc_u32 s5, s6, 0
; GFX10-NEXT:    s_add_u32 s4, s4, s7
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_sub_u32 s6, s4, s2
; GFX10-NEXT:    s_subb_u32 s7, s5, 0
; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT:    s_sub_u32 s6, s4, s0
; GFX10-NEXT:    s_subb_u32 s7, s5, 0
; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT:    s_add_i32 s1, s10, s9
; GFX10-NEXT:    s_add_i32 s1, s1, s8
; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
; GFX10-NEXT:    s_mov_b32 s7, s6
; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %smulo, 0
  %overflow = extractvalue { i64, i1 } %smulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

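; Multiplying by the constant 4 folds to a shift: the product is i << 2, and
; signed overflow is detected by checking that the arithmetic shift back,
; (i << 2) >> 2, recovers i.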
define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v5
; SI-NEXT:    v_mov_b32_e32 v1, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %smulo
}

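; Unsigned multiply by 4 also folds to a shift; overflow occurs iff either of
; the top two bits of i is set, checked by comparing i against a copy whose
; high half is masked down to 30 bits.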
define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; SI-NEXT:    v_mov_b32_e32 v6, v0
; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)