1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5
; umulo_i64_v_v: unsigned 64-bit multiply-with-overflow of two function
; arguments. The { i64, i1 } aggregate produced by
; llvm.umul.with.overflow.i64 is returned unchanged.
;
; What the autogenerated checks below pin down for all three targets:
;   * the 64x64 multiply is expanded into 32-bit v_mul_lo_u32 / v_mul_hi_u32
;     partial products combined with carry-propagating adds;
;   * the overflow bit is produced in v2 by a v_cmp_ne_u64 of the accumulated
;     upper half against 0, followed by v_cndmask_b32 selecting 0 or 1;
;   * the operands appear to arrive in v[0:1] and v[2:3] and the product is
;     left in v[0:1] (inferred from the register usage in the checks).
6define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
7; SI-LABEL: umulo_i64_v_v:
8; SI:       ; %bb.0: ; %bb
9; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
11; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
12; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
13; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
14; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
15; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
16; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
17; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
18; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
19; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
20; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
21; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
22; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
23; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
24; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
25; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
26; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
27; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
28; SI-NEXT:    s_setpc_b64 s[30:31]
29;
30; GFX9-LABEL: umulo_i64_v_v:
31; GFX9:       ; %bb.0: ; %bb
32; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
34; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
35; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
36; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
37; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
38; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
39; GFX9-NEXT:    v_mul_hi_u32 v10, v1, v3
40; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
41; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
42; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
43; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
44; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
45; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v1
46; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
47; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
48; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
49; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
50; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
51; GFX9-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX10-LABEL: umulo_i64_v_v:
54; GFX10:       ; %bb.0: ; %bb
55; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
57; GFX10-NEXT:    v_mul_lo_u32 v5, v0, v3
58; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v2
59; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v3
60; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v2
61; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v2
62; GFX10-NEXT:    v_mul_hi_u32 v9, v1, v3
63; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
64; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
65; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v5
66; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
67; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v10, v8
68; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
69; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo
70; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v1
71; GFX10-NEXT:    v_add3_u32 v1, v6, v5, v8
72; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
73; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4]
74; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
75; GFX10-NEXT:    s_setpc_b64 s[30:31]
76bb:
77  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
78  ret { i64, i1 } %umulo
79}
80
; smulo_i64_s_s: signed 64-bit multiply-with-overflow of two function
; arguments. The { i64, i1 } aggregate produced by
; llvm.smul.with.overflow.i64 is returned unchanged.
;
; What the autogenerated checks below pin down for all three targets:
;   * the high x high partial product uses the signed v_mul_hi_i32, the other
;     partial products use the unsigned v_mul_lo_u32 / v_mul_hi_u32;
;   * the accumulated upper half is conditionally adjusted by subtractions
;     chosen with v_cmp_gt_i32 0, <high word> on each operand;
;   * overflow (returned in v2) is a v_cmp_ne_u64 between that adjusted upper
;     half and the sign-extension (v_ashrrev_i32 by 31) of the product's high
;     word.
;
; NOTE(review): the "_s_s" suffix suggests scalar operands, but this is a
; non-kernel function and the checks use VGPR operands, like umulo_i64_v_v.
; The name looks like a leftover; renaming it would require regenerating the
; LABEL lines -- confirm before changing.
81define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
82; SI-LABEL: smulo_i64_s_s:
83; SI:       ; %bb.0: ; %bb
84; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
86; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
87; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
88; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
89; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
90; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
91; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
92; SI-NEXT:    v_mov_b32_e32 v12, 0
93; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
94; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
95; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
96; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
97; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
98; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
99; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
100; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
101; SI-NEXT:    v_mov_b32_e32 v7, v6
102; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
103; SI-NEXT:    v_addc_u32_e32 v9, vcc, v12, v9, vcc
104; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
105; SI-NEXT:    v_subb_u32_e32 v10, vcc, v9, v12, vcc
106; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
107; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
108; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
109; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
110; SI-NEXT:    v_subb_u32_e32 v8, vcc, v1, v12, vcc
111; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
112; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
113; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
114; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
115; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
116; SI-NEXT:    v_mov_b32_e32 v0, v4
117; SI-NEXT:    v_mov_b32_e32 v1, v5
118; SI-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX9-LABEL: smulo_i64_s_s:
121; GFX9:       ; %bb.0: ; %bb
122; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
124; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
125; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
126; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
127; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
128; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
129; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
130; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
131; GFX9-NEXT:    v_mul_hi_i32 v10, v1, v3
132; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
133; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
134; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
135; GFX9-NEXT:    v_mov_b32_e32 v10, 0
136; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
137; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v9, vcc
138; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v4, v2
139; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v8, v10, vcc
140; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
141; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v11, vcc
142; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v9, vcc
143; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v8, v0
144; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v10, vcc
145; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
146; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
147; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
148; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
149; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
150; GFX9-NEXT:    v_mov_b32_e32 v6, v5
151; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
152; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6]
153; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
154; GFX9-NEXT:    s_setpc_b64 s[30:31]
155;
156; GFX10-LABEL: smulo_i64_s_s:
157; GFX10:       ; %bb.0: ; %bb
158; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
160; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v3
161; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v2
162; GFX10-NEXT:    v_mul_hi_u32 v6, v0, v3
163; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v2
164; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v2
165; GFX10-NEXT:    v_mul_hi_i32 v9, v1, v3
166; GFX10-NEXT:    v_mul_lo_u32 v11, v1, v3
167; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v5, v4
168; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
169; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v10, v8
170; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo
171; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo
172; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
173; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
174; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, v6, v2
175; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
176; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v1
177; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v8
178; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
179; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
180; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
181; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v6, v0
182; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
183; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
184; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
185; GFX10-NEXT:    v_mov_b32_e32 v5, v4
186; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
187; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
188; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[4:5]
189; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
190; GFX10-NEXT:    s_setpc_b64 s[30:31]
191bb:
192  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
193  ret { i64, i1 } %smulo
194}
195
; umulo_i64_s: unsigned 64-bit multiply-with-overflow inside an amdgpu_kernel.
; Both operands are kernel arguments (fetched with a single s_load_dwordx4 in
; the checks). The kernel stores the 64-bit product, or 0 if the multiply
; overflowed.
;
; What the autogenerated checks below pin down:
;   * GFX9 and GFX10 keep the whole expansion on the scalar unit
;     (s_mul_i32 / s_mul_hi_u32 / s_add_u32 / s_addc_u32);
;   * the SI checks compute the high halves with the vector v_mul_hi_u32
;     instead, with only the low halves done by s_mul_i32;
;   * overflow is a v_cmp_ne_u64 of the accumulated upper half against 0, and
;     each result dword is zeroed through v_cndmask_b32 on that condition.
196define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
197; SI-LABEL: umulo_i64_s:
198; SI:       ; %bb.0: ; %bb
199; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
200; SI-NEXT:    s_mov_b32 s7, 0xf000
201; SI-NEXT:    s_waitcnt lgkmcnt(0)
202; SI-NEXT:    v_mov_b32_e32 v0, s2
203; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
204; SI-NEXT:    s_mul_i32 s4, s1, s2
205; SI-NEXT:    v_mov_b32_e32 v2, s3
206; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
207; SI-NEXT:    s_mul_i32 s5, s0, s3
208; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
209; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
210; SI-NEXT:    s_mul_i32 s1, s1, s3
211; SI-NEXT:    s_mul_i32 s0, s0, s2
212; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
213; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
214; SI-NEXT:    v_mov_b32_e32 v5, s0
215; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
216; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
217; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
218; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
219; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
220; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
221; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
222; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
223; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
224; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
225; SI-NEXT:    s_mov_b32 s6, -1
226; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
227; SI-NEXT:    s_endpgm
228;
229; GFX9-LABEL: umulo_i64_s:
230; GFX9:       ; %bb.0: ; %bb
231; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
232; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX9-NEXT:    s_mul_i32 s7, s0, s3
234; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
235; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
236; GFX9-NEXT:    s_add_u32 s9, s8, s7
237; GFX9-NEXT:    s_mul_i32 s6, s1, s2
238; GFX9-NEXT:    s_addc_u32 s5, 0, s5
239; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
240; GFX9-NEXT:    s_add_u32 s9, s9, s6
241; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
242; GFX9-NEXT:    s_addc_u32 s4, s5, s4
243; GFX9-NEXT:    s_addc_u32 s5, s10, 0
244; GFX9-NEXT:    s_mul_i32 s1, s1, s3
245; GFX9-NEXT:    s_add_u32 s4, s4, s1
246; GFX9-NEXT:    s_addc_u32 s5, 0, s5
247; GFX9-NEXT:    s_add_i32 s1, s8, s7
248; GFX9-NEXT:    s_add_i32 s1, s1, s6
249; GFX9-NEXT:    s_mul_i32 s2, s0, s2
250; GFX9-NEXT:    v_mov_b32_e32 v0, s1
251; GFX9-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
252; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
253; GFX9-NEXT:    v_mov_b32_e32 v0, s2
254; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
255; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
256; GFX9-NEXT:    s_endpgm
257;
258; GFX10-LABEL: umulo_i64_s:
259; GFX10:       ; %bb.0: ; %bb
260; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
261; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX10-NEXT:    s_mul_i32 s7, s0, s3
263; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
264; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
265; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
266; GFX10-NEXT:    s_mul_i32 s6, s1, s2
267; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
268; GFX10-NEXT:    s_mul_i32 s1, s1, s3
269; GFX10-NEXT:    s_add_u32 s3, s8, s7
270; GFX10-NEXT:    s_addc_u32 s5, 0, s5
271; GFX10-NEXT:    s_add_u32 s3, s3, s6
272; GFX10-NEXT:    s_addc_u32 s3, s5, s4
273; GFX10-NEXT:    s_addc_u32 s5, s9, 0
274; GFX10-NEXT:    s_add_u32 s4, s3, s1
275; GFX10-NEXT:    s_addc_u32 s5, 0, s5
276; GFX10-NEXT:    s_add_i32 s1, s8, s7
277; GFX10-NEXT:    v_cmp_ne_u64_e64 s3, s[4:5], 0
278; GFX10-NEXT:    s_add_i32 s1, s1, s6
279; GFX10-NEXT:    s_mul_i32 s0, s0, s2
280; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s3
281; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s3
282; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
283; GFX10-NEXT:    s_endpgm
284bb:
285  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
286  %mul = extractvalue { i64, i1 } %umulo, 0
287  %overflow = extractvalue { i64, i1 } %umulo, 1
  ; Keep the product only when the multiply did not overflow; otherwise 0.
288  %res = select i1 %overflow, i64 0, i64 %mul
  ; The destination pointer is undef; presumably the store exists only to
  ; keep %res live so the multiply/select sequence is emitted.
289  store i64 %res, i64 addrspace(1)* undef
290  ret void
291}
292
; smulo_i64_s: signed 64-bit multiply-with-overflow inside an amdgpu_kernel.
; Both operands are kernel arguments (fetched with a single s_load_dwordx4 in
; the checks). The kernel stores the 64-bit product, or 0 if the multiply
; overflowed.
;
; What the autogenerated checks below pin down:
;   * GFX9 and GFX10 form the partial products on the scalar unit, using
;     s_mul_hi_i32 for the signed high x high term; the SI checks use the
;     vector v_mul_hi_u32 / v_mul_hi_i32 for the high halves;
;   * the accumulated upper half is conditionally adjusted by subtractions
;     selected with v_cmp_lt_i32 <high word>, 0 on each operand;
;   * overflow is a v_cmp_ne_u64 between that adjusted upper half and the
;     sign-extension (arithmetic shift right by 31) of the product's high
;     word; each result dword is zeroed through v_cndmask_b32 on overflow.
;
; The intrinsic result is named %smulo (not %umulo) because it comes from the
; signed intrinsic; value names do not influence the generated code.
293define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
294; SI-LABEL: smulo_i64_s:
295; SI:       ; %bb.0: ; %bb
296; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
297; SI-NEXT:    v_mov_b32_e32 v0, 0
298; SI-NEXT:    s_mov_b32 s7, 0xf000
299; SI-NEXT:    s_waitcnt lgkmcnt(0)
300; SI-NEXT:    v_mov_b32_e32 v1, s2
301; SI-NEXT:    v_mul_hi_u32 v2, s1, v1
302; SI-NEXT:    s_mul_i32 s4, s1, s2
303; SI-NEXT:    v_mov_b32_e32 v3, s3
304; SI-NEXT:    v_mul_hi_u32 v4, s0, v3
305; SI-NEXT:    s_mul_i32 s5, s0, s3
306; SI-NEXT:    v_mul_hi_u32 v1, s0, v1
307; SI-NEXT:    v_mul_hi_i32 v3, s1, v3
308; SI-NEXT:    s_mul_i32 s6, s1, s3
309; SI-NEXT:    s_mul_i32 s8, s0, s2
310; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v1
311; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
312; SI-NEXT:    v_mov_b32_e32 v6, s8
313; SI-NEXT:    v_add_i32_e32 v5, vcc, s4, v5
314; SI-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
315; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
316; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
317; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
318; SI-NEXT:    v_addc_u32_e32 v3, vcc, v0, v3, vcc
319; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v1
320; SI-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v2
321; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
322; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
323; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
324; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
325; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
326; SI-NEXT:    v_mov_b32_e32 v1, v0
327; SI-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v2
328; SI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
329; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
330; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
331; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
332; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
333; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
334; SI-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
335; SI-NEXT:    s_mov_b32 s6, -1
336; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
337; SI-NEXT:    s_endpgm
338;
339; GFX9-LABEL: smulo_i64_s:
340; GFX9:       ; %bb.0: ; %bb
341; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
342; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX9-NEXT:    s_mul_i32 s7, s0, s3
344; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
345; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
346; GFX9-NEXT:    s_add_u32 s9, s8, s7
347; GFX9-NEXT:    s_mul_i32 s5, s1, s2
348; GFX9-NEXT:    s_addc_u32 s6, 0, s6
349; GFX9-NEXT:    s_add_u32 s9, s9, s5
350; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
351; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
352; GFX9-NEXT:    s_addc_u32 s4, s6, s4
353; GFX9-NEXT:    s_addc_u32 s6, s10, 0
354; GFX9-NEXT:    s_mul_i32 s9, s1, s3
355; GFX9-NEXT:    s_add_u32 s4, s4, s9
356; GFX9-NEXT:    s_addc_u32 s6, 0, s6
357; GFX9-NEXT:    s_sub_u32 s9, s4, s2
358; GFX9-NEXT:    s_subb_u32 s10, s6, 0
359; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
360; GFX9-NEXT:    v_mov_b32_e32 v0, s6
361; GFX9-NEXT:    v_mov_b32_e32 v1, s10
362; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
363; GFX9-NEXT:    v_mov_b32_e32 v1, s4
364; GFX9-NEXT:    v_mov_b32_e32 v2, s9
365; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
366; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
367; GFX9-NEXT:    s_add_i32 s1, s8, s7
368; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
369; GFX9-NEXT:    s_add_i32 s1, s1, s5
370; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
371; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
372; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
373; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
374; GFX9-NEXT:    s_mov_b32 s5, s4
375; GFX9-NEXT:    s_mul_i32 s0, s0, s2
376; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
377; GFX9-NEXT:    v_mov_b32_e32 v0, s0
378; GFX9-NEXT:    v_mov_b32_e32 v2, s1
379; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
380; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
381; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
382; GFX9-NEXT:    s_endpgm
383;
384; GFX10-LABEL: smulo_i64_s:
385; GFX10:       ; %bb.0: ; %bb
386; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
387; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX10-NEXT:    s_mul_i32 s7, s0, s3
389; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
390; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
391; GFX10-NEXT:    s_add_u32 s11, s8, s7
392; GFX10-NEXT:    s_mul_i32 s5, s1, s2
393; GFX10-NEXT:    s_addc_u32 s6, 0, s6
394; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
395; GFX10-NEXT:    s_add_u32 s11, s11, s5
396; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
397; GFX10-NEXT:    s_addc_u32 s4, s6, s4
398; GFX10-NEXT:    s_mul_i32 s10, s1, s3
399; GFX10-NEXT:    s_addc_u32 s6, s9, 0
400; GFX10-NEXT:    s_add_u32 s4, s4, s10
401; GFX10-NEXT:    s_addc_u32 s6, 0, s6
402; GFX10-NEXT:    s_sub_u32 s9, s4, s2
403; GFX10-NEXT:    s_subb_u32 s10, s6, 0
404; GFX10-NEXT:    v_cmp_lt_i32_e64 vcc_lo, s1, 0
405; GFX10-NEXT:    v_mov_b32_e32 v0, s9
406; GFX10-NEXT:    v_mov_b32_e32 v1, s10
407; GFX10-NEXT:    s_add_i32 s1, s8, s7
408; GFX10-NEXT:    s_add_i32 s1, s1, s5
409; GFX10-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
410; GFX10-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
411; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
412; GFX10-NEXT:    s_mov_b32 s5, s4
413; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
414; GFX10-NEXT:    s_mul_i32 s0, s0, s2
415; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
416; GFX10-NEXT:    v_cmp_lt_i32_e64 vcc_lo, s3, 0
417; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
418; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
419; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
420; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
421; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
422; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
423; GFX10-NEXT:    s_endpgm
424bb:
425  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
426  %mul = extractvalue { i64, i1 } %smulo, 0
427  %overflow = extractvalue { i64, i1 } %smulo, 1
  ; Keep the product only when the multiply did not overflow; otherwise 0.
428  %res = select i1 %overflow, i64 0, i64 %mul
  ; The destination pointer is undef; presumably the store exists only to
  ; keep %res live so the multiply/select sequence is emitted.
429  store i64 %res, i64 addrspace(1)* undef
430  ret void
431}
432
; smulo_i64_v_4: signed 64-bit multiply-with-overflow of a function argument
; by the constant 4. The { i64, i1 } aggregate produced by
; llvm.smul.with.overflow.i64 is returned unchanged.
;
; What the autogenerated checks below pin down for all three targets:
;   * no multiply instruction is emitted; the product is a 64-bit left shift
;     by 2, with v_alignbit_b32 ..., 30 producing the value returned in v1;
;   * overflow (returned in v2) is detected by arithmetic-shifting the shifted
;     value back right by 2 and comparing it with the original operand using
;     v_cmp_ne_u64.
;
; The intrinsic result is named %smulo (not %umulo) because it comes from the
; signed intrinsic; value names do not influence the generated code.
433define { i64, i1 } @smulo_i64_v_4(i64 %i) {
434; SI-LABEL: smulo_i64_v_4:
435; SI:       ; %bb.0: ; %bb
436; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
438; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
439; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
440; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
441; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
442; SI-NEXT:    v_mov_b32_e32 v0, v5
443; SI-NEXT:    v_mov_b32_e32 v1, v4
444; SI-NEXT:    s_setpc_b64 s[30:31]
445;
446; GFX9-LABEL: smulo_i64_v_4:
447; GFX9:       ; %bb.0: ; %bb
448; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
450; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
451; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
452; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
453; GFX9-NEXT:    v_mov_b32_e32 v0, v4
454; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
455; GFX9-NEXT:    v_mov_b32_e32 v1, v3
456; GFX9-NEXT:    s_setpc_b64 s[30:31]
457;
458; GFX10-LABEL: smulo_i64_v_4:
459; GFX10:       ; %bb.0: ; %bb
460; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
461; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
462; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
463; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
464; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
465; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
466; GFX10-NEXT:    v_mov_b32_e32 v0, v4
467; GFX10-NEXT:    v_mov_b32_e32 v1, v3
468; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
469; GFX10-NEXT:    s_setpc_b64 s[30:31]
470bb:
471  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
472  ret { i64, i1 } %smulo
473}
474
; umulo_i64_v_4: unsigned 64-bit multiply-with-overflow of a function argument
; by the constant 4. The { i64, i1 } aggregate produced by
; llvm.umul.with.overflow.i64 is returned unchanged.
;
; What the autogenerated checks below pin down for all three targets:
;   * no multiply instruction is emitted; the product is a 64-bit left shift
;     by 2, with v_alignbit_b32 ..., 30 producing the value returned in v1;
;   * overflow (returned in v2) is detected by masking the operand's high
;     word with 0x3fffffff and comparing the masked 64-bit value with the
;     original via v_cmp_ne_u64, i.e. it is set when either of the top two
;     bits of the operand is set.
475define { i64, i1 } @umulo_i64_v_4(i64 %i) {
476; SI-LABEL: umulo_i64_v_4:
477; SI:       ; %bb.0: ; %bb
478; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
480; SI-NEXT:    v_mov_b32_e32 v6, v0
481; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
482; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
483; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
484; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
485; SI-NEXT:    v_mov_b32_e32 v0, v4
486; SI-NEXT:    v_mov_b32_e32 v1, v3
487; SI-NEXT:    s_setpc_b64 s[30:31]
488;
489; GFX9-LABEL: umulo_i64_v_4:
490; GFX9:       ; %bb.0: ; %bb
491; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
493; GFX9-NEXT:    v_mov_b32_e32 v6, v0
494; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
495; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
496; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
497; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
498; GFX9-NEXT:    v_mov_b32_e32 v0, v4
499; GFX9-NEXT:    v_mov_b32_e32 v1, v3
500; GFX9-NEXT:    s_setpc_b64 s[30:31]
501;
502; GFX10-LABEL: umulo_i64_v_4:
503; GFX10:       ; %bb.0: ; %bb
504; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
506; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
507; GFX10-NEXT:    v_mov_b32_e32 v6, v0
508; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
509; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
510; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
511; GFX10-NEXT:    v_mov_b32_e32 v0, v4
512; GFX10-NEXT:    v_mov_b32_e32 v1, v3
513; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
514; GFX10-NEXT:    s_setpc_b64 s[30:31]
515bb:
516  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
517  ret { i64, i1 } %umulo
518}
519
; Overflow-reporting multiply intrinsics exercised by the functions in this
; file. Each takes two i64 operands and returns { i64 product, i1 overflow }.
520declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
521declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
522