1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5
; Overflowing unsigned 64-bit multiply with both operands in VGPRs.
; The expansion builds the full 128-bit product; overflow is true iff the
; high 64 bits are nonzero (v_cmp_ne_u64 against 0), and the i1 flag is
; materialized into v2 with v_cndmask. Checks below are autogenerated by
; update_llc_test_checks.py -- do not hand-edit the CHECK lines.
6define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
7; SI-LABEL: umulo_i64_v_v:
8; SI:       ; %bb.0: ; %bb
9; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
11; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
12; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
13; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
14; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
15; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
16; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
17; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
18; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
19; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
20; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
21; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
22; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
23; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
24; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
25; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
26; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
27; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
28; SI-NEXT:    s_setpc_b64 s[30:31]
29;
30; GFX9-LABEL: umulo_i64_v_v:
31; GFX9:       ; %bb.0: ; %bb
32; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX9-NEXT:    v_mov_b32_e32 v5, v0
34; GFX9-NEXT:    v_mov_b32_e32 v4, v1
35; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
36; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
37; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
38; GFX9-NEXT:    v_mov_b32_e32 v10, v1
39; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
40; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
41; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
42; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
43; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
44; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
45; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
46; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
47; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
48; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
49; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
50; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
51; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
52; GFX9-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX10-LABEL: umulo_i64_v_v:
55; GFX10:       ; %bb.0: ; %bb
56; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
58; GFX10-NEXT:    v_mov_b32_e32 v4, v0
59; GFX10-NEXT:    v_mov_b32_e32 v5, v1
60; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
61; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
62; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
63; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
64; GFX10-NEXT:    v_mov_b32_e32 v8, v1
65; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
66; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
67; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
68; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
69; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
70; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
71; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
72; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
73; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
74; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
75; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
76; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
77; GFX10-NEXT:    s_setpc_b64 s[30:31]
78bb:
79  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
80  ret { i64, i1 } %umulo
81}
82
; Overflowing signed 64-bit multiply with both operands in VGPRs. The
; expansion corrects the unsigned 128-bit product for negative operands
; (conditional subtracts keyed on v_cmp_gt_i32 0, <high word>), then
; compares the corrected high half against the sign-extension of the low
; result (v_ashrrev_i32 ... 31) to detect overflow. Autogenerated checks.
83define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
84; SI-LABEL: smulo_i64_v_v:
85; SI:       ; %bb.0: ; %bb
86; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
88; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
89; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
90; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
91; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
92; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
93; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
94; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
95; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
96; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
97; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
98; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
99; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
100; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
101; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
102; SI-NEXT:    v_mov_b32_e32 v7, v6
103; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
104; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
105; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
106; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
107; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
108; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
109; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
110; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
111; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
112; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
113; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
114; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
115; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
116; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
117; SI-NEXT:    v_mov_b32_e32 v0, v4
118; SI-NEXT:    v_mov_b32_e32 v1, v5
119; SI-NEXT:    s_setpc_b64 s[30:31]
120;
121; GFX9-LABEL: smulo_i64_v_v:
122; GFX9:       ; %bb.0: ; %bb
123; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; GFX9-NEXT:    v_mov_b32_e32 v5, v0
125; GFX9-NEXT:    v_mov_b32_e32 v4, v1
126; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
127; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
128; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
129; GFX9-NEXT:    v_mov_b32_e32 v10, v1
130; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
131; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
132; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
133; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
134; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
135; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
136; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
137; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
138; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
139; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
140; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
141; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
142; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
143; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
144; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
145; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
146; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
147; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
148; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
149; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
150; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
151; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
152; GFX9-NEXT:    v_mov_b32_e32 v5, v4
153; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
154; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
155; GFX9-NEXT:    s_setpc_b64 s[30:31]
156;
157; GFX10-LABEL: smulo_i64_v_v:
158; GFX10:       ; %bb.0: ; %bb
159; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
161; GFX10-NEXT:    v_mov_b32_e32 v4, v0
162; GFX10-NEXT:    v_mov_b32_e32 v5, v1
163; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
164; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
165; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
166; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
167; GFX10-NEXT:    v_mov_b32_e32 v8, v1
168; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
169; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
170; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
171; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
172; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
173; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
174; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
175; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
176; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
177; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
178; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
179; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
180; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
181; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
182; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
183; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
184; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
185; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
186; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
187; GFX10-NEXT:    v_mov_b32_e32 v3, v2
188; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
189; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
190; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
191; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
192; GFX10-NEXT:    s_setpc_b64 s[30:31]
193bb:
194  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
195  ret { i64, i1 } %smulo
196}
197
; Kernel variant: operands arrive as scalars (SGPRs). SI routes the high-half
; multiplies through the VALU (v_mul_hi_u32), while GFX9/GFX10 stay fully
; scalar (s_mul_hi_u32 + s_cmp_lg_u64 for the overflow test). The IR selects 0
; on overflow and stores to an undef pointer, so the store address operands in
; the checked output are arbitrary. Autogenerated checks.
198define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
199; SI-LABEL: umulo_i64_s:
200; SI:       ; %bb.0: ; %bb
201; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
202; SI-NEXT:    s_mov_b32 s7, 0xf000
203; SI-NEXT:    s_waitcnt lgkmcnt(0)
204; SI-NEXT:    v_mov_b32_e32 v0, s2
205; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
206; SI-NEXT:    s_mul_i32 s4, s1, s2
207; SI-NEXT:    v_mov_b32_e32 v2, s3
208; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
209; SI-NEXT:    s_mul_i32 s5, s0, s3
210; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
211; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
212; SI-NEXT:    s_mul_i32 s1, s1, s3
213; SI-NEXT:    s_mul_i32 s0, s0, s2
214; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
215; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
216; SI-NEXT:    v_mov_b32_e32 v5, s0
217; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
218; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
219; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
220; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
221; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
222; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
223; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
224; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
225; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
226; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
227; SI-NEXT:    s_mov_b32 s6, -1
228; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
229; SI-NEXT:    s_endpgm
230;
231; GFX9-LABEL: umulo_i64_s:
232; GFX9:       ; %bb.0: ; %bb
233; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX9-NEXT:    s_mul_i32 s7, s0, s3
236; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
237; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
238; GFX9-NEXT:    s_add_u32 s9, s8, s7
239; GFX9-NEXT:    s_mul_i32 s6, s1, s2
240; GFX9-NEXT:    s_addc_u32 s5, 0, s5
241; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
242; GFX9-NEXT:    s_add_u32 s9, s9, s6
243; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
244; GFX9-NEXT:    s_addc_u32 s4, s5, s4
245; GFX9-NEXT:    s_addc_u32 s5, s10, 0
246; GFX9-NEXT:    s_mul_i32 s1, s1, s3
247; GFX9-NEXT:    s_add_u32 s4, s4, s1
248; GFX9-NEXT:    s_addc_u32 s5, 0, s5
249; GFX9-NEXT:    s_add_i32 s1, s8, s7
250; GFX9-NEXT:    s_add_i32 s1, s1, s6
251; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
252; GFX9-NEXT:    s_mul_i32 s2, s0, s2
253; GFX9-NEXT:    v_mov_b32_e32 v0, s1
254; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
255; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
256; GFX9-NEXT:    v_mov_b32_e32 v0, s2
257; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
258; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
259; GFX9-NEXT:    s_endpgm
260;
261; GFX10-LABEL: umulo_i64_s:
262; GFX10:       ; %bb.0: ; %bb
263; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
264; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX10-NEXT:    s_mul_i32 s7, s0, s3
266; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
267; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
268; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
269; GFX10-NEXT:    s_mul_i32 s6, s1, s2
270; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
271; GFX10-NEXT:    s_mul_i32 s1, s1, s3
272; GFX10-NEXT:    s_add_u32 s3, s8, s7
273; GFX10-NEXT:    s_addc_u32 s5, 0, s5
274; GFX10-NEXT:    s_add_u32 s3, s3, s6
275; GFX10-NEXT:    s_addc_u32 s3, s5, s4
276; GFX10-NEXT:    s_addc_u32 s5, s9, 0
277; GFX10-NEXT:    s_add_u32 s4, s3, s1
278; GFX10-NEXT:    s_addc_u32 s5, 0, s5
279; GFX10-NEXT:    s_add_i32 s1, s8, s7
280; GFX10-NEXT:    s_mul_i32 s0, s0, s2
281; GFX10-NEXT:    s_add_i32 s1, s1, s6
282; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
283; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
284; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
285; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
286; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
287; GFX10-NEXT:    s_endpgm
288bb:
289  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
290  %mul = extractvalue { i64, i1 } %umulo, 0
291  %overflow = extractvalue { i64, i1 } %umulo, 1
292  %res = select i1 %overflow, i64 0, i64 %mul
293  store i64 %res, i64 addrspace(1)* undef
294  ret void
295}
296
; Signed kernel variant with scalar operands. Sign correction picks between
; the raw and conditionally-subtracted high product using s_cmp_lt_i32 on each
; operand's high word; overflow then compares against the sign-extension of
; the low result (s_ashr_i32 ... 31). As above, the store's address operands
; are arbitrary because the IR pointer is undef. Autogenerated checks.
297define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
298; SI-LABEL: smulo_i64_s:
299; SI:       ; %bb.0: ; %bb
300; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
301; SI-NEXT:    s_mov_b32 s7, 0xf000
302; SI-NEXT:    s_waitcnt lgkmcnt(0)
303; SI-NEXT:    v_mov_b32_e32 v0, s2
304; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
305; SI-NEXT:    s_mul_i32 s4, s1, s2
306; SI-NEXT:    v_mov_b32_e32 v2, s3
307; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
308; SI-NEXT:    s_mul_i32 s5, s0, s3
309; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
310; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
311; SI-NEXT:    s_mul_i32 s6, s1, s3
312; SI-NEXT:    s_cmp_lt_i32 s1, 0
313; SI-NEXT:    s_mul_i32 s1, s0, s2
314; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
315; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
316; SI-NEXT:    v_mov_b32_e32 v5, s1
317; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
318; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
319; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
320; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
321; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
322; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
323; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
324; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
325; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
326; SI-NEXT:    s_cselect_b64 vcc, -1, 0
327; SI-NEXT:    s_cmp_lt_i32 s3, 0
328; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
329; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
330; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
331; SI-NEXT:    v_mov_b32_e32 v1, v0
332; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
333; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
334; SI-NEXT:    s_cselect_b64 vcc, -1, 0
335; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
336; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
337; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
338; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
339; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
340; SI-NEXT:    s_mov_b32 s6, -1
341; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
342; SI-NEXT:    s_endpgm
343;
344; GFX9-LABEL: smulo_i64_s:
345; GFX9:       ; %bb.0: ; %bb
346; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
347; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX9-NEXT:    s_mul_i32 s7, s0, s3
349; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
350; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
351; GFX9-NEXT:    s_add_u32 s9, s8, s7
352; GFX9-NEXT:    s_mul_i32 s5, s1, s2
353; GFX9-NEXT:    s_addc_u32 s6, 0, s6
354; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
355; GFX9-NEXT:    s_add_u32 s9, s9, s5
356; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
357; GFX9-NEXT:    s_addc_u32 s4, s6, s4
358; GFX9-NEXT:    s_addc_u32 s6, s10, 0
359; GFX9-NEXT:    s_mul_i32 s9, s1, s3
360; GFX9-NEXT:    s_add_u32 s4, s4, s9
361; GFX9-NEXT:    s_addc_u32 s6, 0, s6
362; GFX9-NEXT:    s_sub_u32 s9, s4, s2
363; GFX9-NEXT:    s_subb_u32 s10, s6, 0
364; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
365; GFX9-NEXT:    v_mov_b32_e32 v0, s6
366; GFX9-NEXT:    v_mov_b32_e32 v1, s10
367; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
368; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
369; GFX9-NEXT:    v_mov_b32_e32 v1, s4
370; GFX9-NEXT:    v_mov_b32_e32 v2, s9
371; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
372; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
373; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
374; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
375; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
376; GFX9-NEXT:    s_add_i32 s1, s8, s7
377; GFX9-NEXT:    s_add_i32 s1, s1, s5
378; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
379; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
380; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
381; GFX9-NEXT:    s_mov_b32 s5, s4
382; GFX9-NEXT:    s_mul_i32 s0, s0, s2
383; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
384; GFX9-NEXT:    v_mov_b32_e32 v2, s1
385; GFX9-NEXT:    v_mov_b32_e32 v0, s0
386; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
387; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
388; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
389; GFX9-NEXT:    s_endpgm
390;
391; GFX10-LABEL: smulo_i64_s:
392; GFX10:       ; %bb.0: ; %bb
393; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
394; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX10-NEXT:    s_mul_i32 s7, s0, s3
396; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
397; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
398; GFX10-NEXT:    s_mul_i32 s5, s1, s2
399; GFX10-NEXT:    s_add_u32 s11, s8, s7
400; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
401; GFX10-NEXT:    s_addc_u32 s6, 0, s6
402; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
403; GFX10-NEXT:    s_add_u32 s11, s11, s5
404; GFX10-NEXT:    s_mul_i32 s10, s1, s3
405; GFX10-NEXT:    s_addc_u32 s4, s6, s4
406; GFX10-NEXT:    s_addc_u32 s6, s9, 0
407; GFX10-NEXT:    s_add_u32 s4, s4, s10
408; GFX10-NEXT:    s_addc_u32 s6, 0, s6
409; GFX10-NEXT:    s_sub_u32 s9, s4, s2
410; GFX10-NEXT:    s_subb_u32 s10, s6, 0
411; GFX10-NEXT:    v_mov_b32_e32 v1, s9
412; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
413; GFX10-NEXT:    v_mov_b32_e32 v0, s10
414; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
415; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
416; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
417; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
418; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
419; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
420; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
421; GFX10-NEXT:    s_add_i32 s1, s8, s7
422; GFX10-NEXT:    s_mul_i32 s0, s0, s2
423; GFX10-NEXT:    s_add_i32 s1, s1, s5
424; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
425; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
426; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
427; GFX10-NEXT:    s_mov_b32 s5, s4
428; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
429; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
430; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
431; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
432; GFX10-NEXT:    s_endpgm
433bb:
434  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
435  %mul = extractvalue { i64, i1 } %umulo, 0
436  %overflow = extractvalue { i64, i1 } %umulo, 1
437  %res = select i1 %overflow, i64 0, i64 %mul
438  store i64 %res, i64 addrspace(1)* undef
439  ret void
440}
441
; smul.with.overflow by the constant 4 folds to a left-shift by 2; overflow
; is detected by arithmetic-shifting the product back right by 2 and comparing
; with the original operand. Autogenerated checks.
442define { i64, i1 } @smulo_i64_v_4(i64 %i) {
443; SI-LABEL: smulo_i64_v_4:
444; SI:       ; %bb.0: ; %bb
445; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
447; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
448; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
449; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
450; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
451; SI-NEXT:    v_mov_b32_e32 v0, v5
452; SI-NEXT:    v_mov_b32_e32 v1, v4
453; SI-NEXT:    s_setpc_b64 s[30:31]
454;
455; GFX9-LABEL: smulo_i64_v_4:
456; GFX9:       ; %bb.0: ; %bb
457; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
459; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
460; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
461; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
462; GFX9-NEXT:    v_mov_b32_e32 v0, v4
463; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
464; GFX9-NEXT:    v_mov_b32_e32 v1, v3
465; GFX9-NEXT:    s_setpc_b64 s[30:31]
466;
467; GFX10-LABEL: smulo_i64_v_4:
468; GFX10:       ; %bb.0: ; %bb
469; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
471; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
472; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
473; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
474; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
475; GFX10-NEXT:    v_mov_b32_e32 v0, v4
476; GFX10-NEXT:    v_mov_b32_e32 v1, v3
477; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
478; GFX10-NEXT:    s_setpc_b64 s[30:31]
479bb:
480  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
481  ret { i64, i1 } %umulo
482}
483
; umul.with.overflow by the constant 4: the product is a left-shift by 2, and
; overflow occurs iff either of the input's top two bits is set -- tested by
; masking the high word with 0x3fffffff and comparing the masked pair against
; the original operand. Autogenerated checks.
484define { i64, i1 } @umulo_i64_v_4(i64 %i) {
485; SI-LABEL: umulo_i64_v_4:
486; SI:       ; %bb.0: ; %bb
487; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
489; SI-NEXT:    v_mov_b32_e32 v6, v0
490; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
491; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
492; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
493; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
494; SI-NEXT:    v_mov_b32_e32 v0, v4
495; SI-NEXT:    v_mov_b32_e32 v1, v3
496; SI-NEXT:    s_setpc_b64 s[30:31]
497;
498; GFX9-LABEL: umulo_i64_v_4:
499; GFX9:       ; %bb.0: ; %bb
500; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
502; GFX9-NEXT:    v_mov_b32_e32 v6, v0
503; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
504; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
505; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
506; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
507; GFX9-NEXT:    v_mov_b32_e32 v0, v4
508; GFX9-NEXT:    v_mov_b32_e32 v1, v3
509; GFX9-NEXT:    s_setpc_b64 s[30:31]
510;
511; GFX10-LABEL: umulo_i64_v_4:
512; GFX10:       ; %bb.0: ; %bb
513; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
515; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
516; GFX10-NEXT:    v_mov_b32_e32 v6, v0
517; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
518; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
519; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
520; GFX10-NEXT:    v_mov_b32_e32 v0, v4
521; GFX10-NEXT:    v_mov_b32_e32 v1, v3
522; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
523; GFX10-NEXT:    s_setpc_b64 s[30:31]
524bb:
525  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
526  ret { i64, i1 } %umulo
527}
528
; Overflow-multiply intrinsics under test (return {product, overflow-bit}).
529declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
530declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
531