1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
3; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
4; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6
7; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
8
; Signed base case: both i32 arguments are sign-extended to i64, multiplied,
; and %arg2 is added as the right-hand add operand (%mul + %arg2).
; The CI and GFX9 check lines expect a single v_mad_i64_i32. The GFX11 check
; lines first copy v1/v0 into v4/v5 and then issue v_mad_i64_i32 on the copies.
; The SI check lines expect v_mul_lo_u32 + v_mul_hi_i32 followed by a 64-bit
; add with carry instead of a mad.
9define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
10; CI-LABEL: mad_i64_i32_sextops:
11; CI:       ; %bb.0:
12; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
14; CI-NEXT:    s_setpc_b64 s[30:31]
15;
16; SI-LABEL: mad_i64_i32_sextops:
17; SI:       ; %bb.0:
18; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
20; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
21; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
22; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
23; SI-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX9-LABEL: mad_i64_i32_sextops:
26; GFX9:       ; %bb.0:
27; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
29; GFX9-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX11-LABEL: mad_i64_i32_sextops:
32; GFX11:       ; %bb.0:
33; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
35; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
36; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
37; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
38; GFX11-NEXT:    s_setpc_b64 s[30:31]
39  %sext0 = sext i32 %arg0 to i64
40  %sext1 = sext i32 %arg1 to i64
41  %mul = mul i64 %sext0, %sext1
42  %mad = add i64 %mul, %arg2
43  ret i64 %mad
44}
45
; Same signed multiply-add as mad_i64_i32_sextops, but with the add operands
; swapped (%arg2 + %mul) to cover the commuted form.
; The CI, GFX9 and GFX11 check lines are the same instructions as in the
; non-commuted test; only the SI check lines differ, with the add operands in
; the opposite order (v2, v4 and v3, v1).
46define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
47; CI-LABEL: mad_i64_i32_sextops_commute:
48; CI:       ; %bb.0:
49; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
51; CI-NEXT:    s_setpc_b64 s[30:31]
52;
53; SI-LABEL: mad_i64_i32_sextops_commute:
54; SI:       ; %bb.0:
55; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
57; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
58; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
59; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
60; SI-NEXT:    s_setpc_b64 s[30:31]
61;
62; GFX9-LABEL: mad_i64_i32_sextops_commute:
63; GFX9:       ; %bb.0:
64; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
66; GFX9-NEXT:    s_setpc_b64 s[30:31]
67;
68; GFX11-LABEL: mad_i64_i32_sextops_commute:
69; GFX11:       ; %bb.0:
70; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
72; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
73; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
74; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
75; GFX11-NEXT:    s_setpc_b64 s[30:31]
76  %sext0 = sext i32 %arg0 to i64
77  %sext1 = sext i32 %arg1 to i64
78  %mul = mul i64 %sext0, %sext1
79  %mad = add i64 %arg2, %mul
80  ret i64 %mad
81}
82
; Unsigned base case: both i32 arguments are zero-extended to i64, multiplied,
; and %arg2 is added as the right-hand add operand (%mul + %arg2).
; The CI and GFX9 check lines expect a single v_mad_u64_u32. The GFX11 check
; lines first copy v1/v0 into v4/v5 and then issue v_mad_u64_u32 on the copies.
; The SI check lines expect v_mul_lo_u32 + v_mul_hi_u32 followed by a 64-bit
; add with carry instead of a mad.
83define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
84; CI-LABEL: mad_u64_u32_zextops:
85; CI:       ; %bb.0:
86; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
88; CI-NEXT:    s_setpc_b64 s[30:31]
89;
90; SI-LABEL: mad_u64_u32_zextops:
91; SI:       ; %bb.0:
92; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
94; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
95; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
96; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
97; SI-NEXT:    s_setpc_b64 s[30:31]
98;
99; GFX9-LABEL: mad_u64_u32_zextops:
100; GFX9:       ; %bb.0:
101; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
103; GFX9-NEXT:    s_setpc_b64 s[30:31]
104;
105; GFX11-LABEL: mad_u64_u32_zextops:
106; GFX11:       ; %bb.0:
107; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
109; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
110; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
111; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
112; GFX11-NEXT:    s_setpc_b64 s[30:31]
113  %zext0 = zext i32 %arg0 to i64
114  %zext1 = zext i32 %arg1 to i64
115  %mul = mul i64 %zext0, %zext1
116  %mad = add i64 %mul, %arg2
117  ret i64 %mad
118}
119
; Same unsigned multiply-add as mad_u64_u32_zextops, but with the add operands
; swapped (%arg2 + %mul) to cover the commuted form.
; The CI, GFX9 and GFX11 check lines are the same instructions as in the
; non-commuted test; only the SI check lines differ, with the add operands in
; the opposite order (v2, v4 and v3, v1).
120define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
121; CI-LABEL: mad_u64_u32_zextops_commute:
122; CI:       ; %bb.0:
123; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
125; CI-NEXT:    s_setpc_b64 s[30:31]
126;
127; SI-LABEL: mad_u64_u32_zextops_commute:
128; SI:       ; %bb.0:
129; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
131; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
132; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
133; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
134; SI-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX9-LABEL: mad_u64_u32_zextops_commute:
137; GFX9:       ; %bb.0:
138; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
140; GFX9-NEXT:    s_setpc_b64 s[30:31]
141;
142; GFX11-LABEL: mad_u64_u32_zextops_commute:
143; GFX11:       ; %bb.0:
144; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
146; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
147; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
148; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
149; GFX11-NEXT:    s_setpc_b64 s[30:31]
150  %zext0 = zext i32 %arg0 to i64
151  %zext1 = zext i32 %arg1 to i64
152  %mul = mul i64 %zext0, %zext1
153  %mad = add i64 %arg2, %mul
154  ret i64 %mad
155}
156
; Wide-result case: the i32 arguments are sign-extended all the way to i128
; before the multiply, and the add is a full i128 add with %arg2.
; No target's check lines cover this with a single mad. The CI, GFX9 and GFX11
; check lines expect a mix of v_mad_u64_u32 and v_mad_i64_i32 partial products
; (using v_ashrrev_i32 by 31 to form the sign words) followed by a four-dword
; add-with-carry chain. The SI check lines expect the same shape built from
; v_mul_lo / v_mul_hi instructions.
157define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
158; CI-LABEL: mad_i64_i32_sextops_i32_i128:
159; CI:       ; %bb.0:
160; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
162; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
163; CI-NEXT:    v_mov_b32_e32 v8, 0
164; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
165; CI-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
166; CI-NEXT:    v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
167; CI-NEXT:    v_mov_b32_e32 v7, v10
168; CI-NEXT:    v_mov_b32_e32 v10, v8
169; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
170; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
171; CI-NEXT:    v_add_i32_e32 v9, vcc, v7, v9
172; CI-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
173; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
174; CI-NEXT:    v_add_i32_e32 v7, vcc, v9, v0
175; CI-NEXT:    v_addc_u32_e32 v9, vcc, v10, v1, vcc
176; CI-NEXT:    v_mov_b32_e32 v1, v8
177; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
178; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
179; CI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
180; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
181; CI-NEXT:    s_setpc_b64 s[30:31]
182;
183; SI-LABEL: mad_i64_i32_sextops_i32_i128:
184; SI:       ; %bb.0:
185; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
187; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
188; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
189; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
190; SI-NEXT:    v_mul_hi_u32 v14, v6, v1
191; SI-NEXT:    v_mul_lo_u32 v13, v0, v7
192; SI-NEXT:    v_mul_hi_u32 v10, v0, v7
193; SI-NEXT:    v_add_i32_e32 v12, vcc, v11, v12
194; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
195; SI-NEXT:    v_mul_hi_u32 v8, v6, v7
196; SI-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
197; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
198; SI-NEXT:    v_mul_i32_i24_e32 v9, v6, v7
199; SI-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
200; SI-NEXT:    v_mul_hi_i32 v6, v1, v6
201; SI-NEXT:    v_mul_hi_i32 v7, v7, v0
202; SI-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
203; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
204; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
205; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v11
206; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
207; SI-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
208; SI-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
209; SI-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
210; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
211; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
212; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
213; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
214; SI-NEXT:    s_setpc_b64 s[30:31]
215;
216; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
217; GFX9:       ; %bb.0:
218; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
220; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
221; GFX9-NEXT:    v_mov_b32_e32 v9, 0
222; GFX9-NEXT:    v_mov_b32_e32 v8, v7
223; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
224; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
225; GFX9-NEXT:    v_mov_b32_e32 v8, v11
226; GFX9-NEXT:    v_mov_b32_e32 v11, v9
227; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
228; GFX9-NEXT:    v_mov_b32_e32 v12, v11
229; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
230; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
231; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
232; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
233; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
234; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
235; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
236; GFX9-NEXT:    v_mov_b32_e32 v1, v10
237; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
238; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
239; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
240; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
241; GFX9-NEXT:    s_setpc_b64 s[30:31]
242;
243; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
244; GFX11:       ; %bb.0:
245; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
247; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
248; GFX11-NEXT:    v_mov_b32_e32 v8, 0
249; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
250; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
251; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
252; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
253; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
254; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
255; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
256; GFX11-NEXT:    v_mad_i64_i32 v[9:10], null, v1, v14, 0
257; GFX11-NEXT:    v_mov_b32_e32 v8, v12
258; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
259; GFX11-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
260; GFX11-NEXT:    v_add_co_u32 v7, s0, v7, v8
261; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
262; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, 0, s0
263; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
264; GFX11-NEXT:    v_mov_b32_e32 v7, v11
265; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
266; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
267; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
268; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
269; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
270; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
271; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
272; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
273; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
274; GFX11-NEXT:    s_setpc_b64 s[30:31]
275  %sext0 = sext i32 %arg0 to i128
276  %sext1 = sext i32 %arg1 to i128
277  %mul = mul i128 %sext0, %sext1
278  %mad = add i128 %mul, %arg2
279  ret i128 %mad
280}
281
; Odd-width result: the i32 arguments are sign-extended to i63 (not i64) and
; the multiply and add are done in i63.
; The check lines for every target are the same instruction sequences as in
; mad_i64_i32_sextops, i.e. a single v_mad_i64_i32 in the CI and GFX9 checks,
; a copy plus v_mad_i64_i32 in the GFX11 checks, and mul_lo/mul_hi plus add
; with carry in the SI checks.
282define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
283; CI-LABEL: mad_i64_i32_sextops_i32_i63:
284; CI:       ; %bb.0:
285; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
287; CI-NEXT:    s_setpc_b64 s[30:31]
288;
289; SI-LABEL: mad_i64_i32_sextops_i32_i63:
290; SI:       ; %bb.0:
291; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
293; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
294; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
295; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
296; SI-NEXT:    s_setpc_b64 s[30:31]
297;
298; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
299; GFX9:       ; %bb.0:
300; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
301; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
302; GFX9-NEXT:    s_setpc_b64 s[30:31]
303;
304; GFX11-LABEL: mad_i64_i32_sextops_i32_i63:
305; GFX11:       ; %bb.0:
306; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
308; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
309; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
310; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
311; GFX11-NEXT:    s_setpc_b64 s[30:31]
312  %sext0 = sext i32 %arg0 to i63
313  %sext1 = sext i32 %arg1 to i63
314  %mul = mul i63 %sext0, %sext1
315  %mad = add i63 %mul, %arg2
316  ret i63 %mad
317}
318
; Odd-width operands and result: i31 arguments are sign-extended to i63,
; multiplied and added in i63.
; The CI, GFX9 and GFX11 check lines expect each operand to be sign-extended
; in-register from bit 30 with v_bfe_i32 (offset 0, width 31) and then fed to a
; single v_mad_i64_i32. The SI check lines expect the extension to be done with
; a shift-left by 1 and v_ashr_i64 by 33, followed by mul_lo/mul_hi and an add
; with carry.
319define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
320; CI-LABEL: mad_i64_i32_sextops_i31_i63:
321; CI:       ; %bb.0:
322; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323; CI-NEXT:    v_bfe_i32 v1, v1, 0, 31
324; CI-NEXT:    v_bfe_i32 v0, v0, 0, 31
325; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
326; CI-NEXT:    s_setpc_b64 s[30:31]
327;
328; SI-LABEL: mad_i64_i32_sextops_i31_i63:
329; SI:       ; %bb.0:
330; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331; SI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
332; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
333; SI-NEXT:    v_ashr_i64 v[4:5], v[3:4], 33
334; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 33
335; SI-NEXT:    v_mul_lo_u32 v1, v4, v0
336; SI-NEXT:    v_mul_hi_i32 v4, v4, v0
337; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
338; SI-NEXT:    v_addc_u32_e32 v1, vcc, v4, v3, vcc
339; SI-NEXT:    s_setpc_b64 s[30:31]
340;
341; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 31
345; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 31
346; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
347; GFX9-NEXT:    s_setpc_b64 s[30:31]
348;
349; GFX11-LABEL: mad_i64_i32_sextops_i31_i63:
350; GFX11:       ; %bb.0:
351; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
353; GFX11-NEXT:    v_bfe_i32 v4, v1, 0, 31
354; GFX11-NEXT:    v_bfe_i32 v5, v0, 0, 31
355; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
356; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
357; GFX11-NEXT:    s_setpc_b64 s[30:31]
358  %sext0 = sext i31 %arg0 to i63
359  %sext1 = sext i31 %arg1 to i63
360  %mul = mul i63 %sext0, %sext1
361  %mad = add i63 %mul, %arg2
362  ret i63 %mad
363}
364
; Mixed-extension case: %arg0 is sign-extended while %arg1 is zero-extended,
; so the product is neither a purely signed nor a purely unsigned 32x32.
; None of the check lines use v_mad_i64_i32 here. The CI, GFX9 and GFX11 check
; lines expect v_mad_u64_u32 for the low 32x32 product plus extra work on the
; high dword using the sign word of %arg0 (v_ashrrev_i32 by 31): a v_mul_lo_u32
; and add in the CI checks, a second v_mad_u64_u32 in the GFX9 and GFX11
; checks. The SI check lines expect mul_hi/mul_lo plus adds.
365define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
366; CI-LABEL: mad_i64_i32_extops_i32_i64:
367; CI:       ; %bb.0:
368; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
369; CI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
370; CI-NEXT:    v_mul_lo_u32 v4, v4, v1
371; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
372; CI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
373; CI-NEXT:    s_setpc_b64 s[30:31]
374;
375; SI-LABEL: mad_i64_i32_extops_i32_i64:
376; SI:       ; %bb.0:
377; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
379; SI-NEXT:    v_mul_hi_u32 v5, v0, v1
380; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
381; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
382; SI-NEXT:    v_add_i32_e32 v1, vcc, v5, v4
383; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
384; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
385; SI-NEXT:    s_setpc_b64 s[30:31]
386;
387; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
388; GFX9:       ; %bb.0:
389; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390; GFX9-NEXT:    v_mov_b32_e32 v4, v1
391; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
392; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
393; GFX9-NEXT:    v_mov_b32_e32 v2, v1
394; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
395; GFX9-NEXT:    v_mov_b32_e32 v1, v2
396; GFX9-NEXT:    s_setpc_b64 s[30:31]
397;
398; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
399; GFX11:       ; %bb.0:
400; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
402; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
403; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
404; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
405; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
406; GFX11-NEXT:    v_mov_b32_e32 v3, v1
407; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
408; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
409; GFX11-NEXT:    s_setpc_b64 s[30:31]
410  %ext0 = sext i32 %arg0 to i64
411  %ext1 = zext i32 %arg1 to i64
412  %mul = mul i64 %ext0, %ext1
413  %mad = add i64 %mul, %arg2
414  ret i64 %mad
415}
416
; Unsigned operands expressed with bit operations instead of zext: both i64
; arguments are masked with 4294967295 (0xffffffff) before the multiply.
; The CI and GFX9 check lines expect a single v_mad_u64_u32 on the low dwords
; (v0 and v2) with %arg2 in v[4:5]. The GFX11 check lines copy v0 to v3 first.
; The SI check lines expect mul_lo/mul_hi plus an add with carry.
417define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
418; CI-LABEL: mad_u64_u32_bitops:
419; CI:       ; %bb.0:
420; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
422; CI-NEXT:    s_setpc_b64 s[30:31]
423;
424; SI-LABEL: mad_u64_u32_bitops:
425; SI:       ; %bb.0:
426; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
428; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
429; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
430; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
431; SI-NEXT:    s_setpc_b64 s[30:31]
432;
433; GFX9-LABEL: mad_u64_u32_bitops:
434; GFX9:       ; %bb.0:
435; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
437; GFX9-NEXT:    s_setpc_b64 s[30:31]
438;
439; GFX11-LABEL: mad_u64_u32_bitops:
440; GFX11:       ; %bb.0:
441; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
443; GFX11-NEXT:    v_mov_b32_e32 v3, v0
444; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
445; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
446; GFX11-NEXT:    s_setpc_b64 s[30:31]
447  %trunc.lhs = and i64 %arg0, 4294967295
448  %trunc.rhs = and i64 %arg1, 4294967295
449  %mul = mul i64 %trunc.lhs, %trunc.rhs
450  %add = add i64 %mul, %arg2
451  ret i64 %add
452}
453
; Near-miss of mad_u64_u32_bitops: the left operand is masked with 8589934591
; (0x1ffffffff, 33 bits), so it no longer fits in 32 bits; the right operand
; still uses the 32-bit mask.
; The check lines for every target keep bit 32 of %arg0 (v_and_b32 with 1 on
; v1) and combine it with the high dword. The CI checks expect v_mad_u64_u32
; plus v_mul_lo_u32 and an add; the GFX9 and GFX11 checks expect two
; v_mad_u64_u32; the SI checks expect mul_hi/mul_lo plus adds.
454define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
455; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
456; CI:       ; %bb.0:
457; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; CI-NEXT:    v_and_b32_e32 v3, 1, v1
459; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
460; CI-NEXT:    v_mul_lo_u32 v2, v3, v2
461; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
462; CI-NEXT:    s_setpc_b64 s[30:31]
463;
464; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
465; SI:       ; %bb.0:
466; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467; SI-NEXT:    v_and_b32_e32 v1, 1, v1
468; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
469; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
470; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
471; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
472; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
473; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
474; SI-NEXT:    s_setpc_b64 s[30:31]
475;
476; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
477; GFX9:       ; %bb.0:
478; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; GFX9-NEXT:    v_and_b32_e32 v3, 1, v1
480; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
481; GFX9-NEXT:    v_mov_b32_e32 v4, v1
482; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
483; GFX9-NEXT:    v_mov_b32_e32 v1, v2
484; GFX9-NEXT:    s_setpc_b64 s[30:31]
485;
486; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
487; GFX11:       ; %bb.0:
488; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
490; GFX11-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
491; GFX11-NEXT:    v_mov_b32_e32 v6, v1
492; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
493; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
494; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
495; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
496; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
497; GFX11-NEXT:    s_setpc_b64 s[30:31]
498  %trunc.lhs = and i64 %arg0, 8589934591
499  %trunc.rhs = and i64 %arg1, 4294967295
500  %mul = mul i64 %trunc.lhs, %trunc.rhs
501  %add = add i64 %mul, %arg2
502  ret i64 %add
503}
504
; Mirror of mad_u64_u32_bitops_lhs_mask_small: here the right operand is the
; one masked with 8589934591 (0x1ffffffff, 33 bits) while the left operand
; uses the 32-bit mask.
; The check lines for every target keep bit 32 of %arg1 (v_and_b32 with 1 on
; v3) and combine it with the high dword. The CI checks expect v_mad_u64_u32
; plus v_mul_lo_u32 and an add; the GFX9 and GFX11 checks expect two
; v_mad_u64_u32; the SI checks expect mul_hi/mul_lo plus adds.
505define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
506; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
507; CI:       ; %bb.0:
508; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509; CI-NEXT:    v_mov_b32_e32 v6, v0
510; CI-NEXT:    v_and_b32_e32 v3, 1, v3
511; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
512; CI-NEXT:    v_mul_lo_u32 v2, v6, v3
513; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
514; CI-NEXT:    s_setpc_b64 s[30:31]
515;
516; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
517; SI:       ; %bb.0:
518; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519; SI-NEXT:    v_and_b32_e32 v1, 1, v3
520; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
521; SI-NEXT:    v_mul_lo_u32 v1, v0, v1
522; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
523; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
524; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
525; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
526; SI-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
529; GFX9:       ; %bb.0:
530; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX9-NEXT:    v_mov_b32_e32 v6, v0
532; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
533; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
534; GFX9-NEXT:    v_mov_b32_e32 v2, v1
535; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
536; GFX9-NEXT:    v_mov_b32_e32 v1, v2
537; GFX9-NEXT:    s_setpc_b64 s[30:31]
538;
539; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
540; GFX11:       ; %bb.0:
541; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
542; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
543; GFX11-NEXT:    v_mov_b32_e32 v6, v0
544; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
545; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
546; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
547; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
548; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
549; GFX11-NEXT:    s_setpc_b64 s[30:31]
550  %trunc.lhs = and i64 %arg0, 4294967295
551  %trunc.rhs = and i64 %arg1, 8589934591
552  %mul = mul i64 %trunc.lhs, %trunc.rhs
553  %add = add i64 %mul, %arg2
554  ret i64 %add
555}
556
; Signed operands expressed with bit operations instead of sext: each i64
; argument is shifted left by 32 and arithmetically shifted right by 32, which
; sign-extends its low 32 bits.
; The CI and GFX9 check lines expect a single v_mad_i64_i32 on the low dwords
; (v0 and v2) with %arg2 in v[4:5]. The GFX11 check lines copy v0 to v3 first.
; The SI check lines expect v_mul_lo_u32 + v_mul_hi_i32 plus an add with carry.
557define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
558; CI-LABEL: mad_i64_i32_bitops:
559; CI:       ; %bb.0:
560; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
561; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
562; CI-NEXT:    s_setpc_b64 s[30:31]
563;
564; SI-LABEL: mad_i64_i32_bitops:
565; SI:       ; %bb.0:
566; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
568; SI-NEXT:    v_mul_hi_i32 v2, v0, v2
569; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
570; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
571; SI-NEXT:    s_setpc_b64 s[30:31]
572;
573; GFX9-LABEL: mad_i64_i32_bitops:
574; GFX9:       ; %bb.0:
575; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
577; GFX9-NEXT:    s_setpc_b64 s[30:31]
578;
579; GFX11-LABEL: mad_i64_i32_bitops:
580; GFX11:       ; %bb.0:
581; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
583; GFX11-NEXT:    v_mov_b32_e32 v3, v0
584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
585; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
586; GFX11-NEXT:    s_setpc_b64 s[30:31]
587  %shl.lhs = shl i64 %arg0, 32
588  %trunc.lhs = ashr i64 %shl.lhs, 32
589  %shl.rhs = shl i64 %arg1, 32
590  %trunc.rhs = ashr i64 %shl.rhs, 32
591  %mul = mul i64 %trunc.lhs, %trunc.rhs
592  %add = add i64 %mul, %arg2
593  ret i64 %add
594}
595
596; Example from bug report
; A single i64 argument supplies all three mad inputs: its high dword
; (lshr by 32) and low dword (and with 4294967295) are multiplied (mul nuw),
; and the original %arg0 is the addend. Both multiplicands are unsigned.
; The CI and GFX9 check lines expect one v_mad_u64_u32 whose destination
; v[0:1] is also its addend register pair. The GFX11 check lines expect the
; mad to write v[2:3] and the result to be copied back to v[0:1] afterwards.
; The SI check lines expect mul_lo/mul_hi plus an add with carry.
597define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
598; CI-LABEL: mad_i64_i32_unpack_i64ops:
599; CI:       ; %bb.0:
600; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
602; CI-NEXT:    s_setpc_b64 s[30:31]
603;
604; SI-LABEL: mad_i64_i32_unpack_i64ops:
605; SI:       ; %bb.0:
606; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
607; SI-NEXT:    v_mul_lo_u32 v2, v1, v0
608; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
609; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
610; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
611; SI-NEXT:    s_setpc_b64 s[30:31]
612;
613; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
614; GFX9:       ; %bb.0:
615; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
617; GFX9-NEXT:    s_setpc_b64 s[30:31]
618;
619; GFX11-LABEL: mad_i64_i32_unpack_i64ops:
620; GFX11:       ; %bb.0:
621; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
623; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
624; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
625; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
626; GFX11-NEXT:    s_setpc_b64 s[30:31]
627  %tmp4 = lshr i64 %arg0, 32
628  %tmp5 = and i64 %arg0, 4294967295
629  %mul = mul nuw i64 %tmp4, %tmp5
630  %mad = add i64 %mul, %arg0
631  ret i64 %mad
632}
633
; Kernel variant: an amdgpu_kernel whose operands are kernel arguments, which
; the check lines load into SGPRs with scalar loads. Despite the i64_i32 name,
; both i32 operands are zero-extended. The result is stored to %out.
; The CI check lines expect a v_mad_u64_u32 taking one multiplicand directly
; from an SGPR. The SI check lines expect v_mul_hi_u32 + s_mul_i32 and a VALU
; add with carry. The GFX9 and GFX11 check lines expect a fully scalar sequence
; (s_mul_i32, s_mul_hi_u32, s_add_u32, s_addc_u32) with the result moved to
; VGPRs only for the store.
634define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
635; CI-LABEL: mad_i64_i32_uniform:
636; CI:       ; %bb.0:
637; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
638; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
639; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
640; CI-NEXT:    s_waitcnt lgkmcnt(0)
641; CI-NEXT:    v_mov_b32_e32 v2, s3
642; CI-NEXT:    v_mov_b32_e32 v0, s4
643; CI-NEXT:    v_mov_b32_e32 v1, s5
644; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
645; CI-NEXT:    s_mov_b32 s3, 0xf000
646; CI-NEXT:    s_mov_b32 s2, -1
647; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
648; CI-NEXT:    s_endpgm
649;
650; SI-LABEL: mad_i64_i32_uniform:
651; SI:       ; %bb.0:
652; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
653; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
654; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
655; SI-NEXT:    s_mov_b32 s7, 0xf000
656; SI-NEXT:    s_mov_b32 s6, -1
657; SI-NEXT:    s_waitcnt lgkmcnt(0)
658; SI-NEXT:    v_mov_b32_e32 v0, s3
659; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
660; SI-NEXT:    s_mul_i32 s2, s2, s3
661; SI-NEXT:    v_mov_b32_e32 v0, s2
662; SI-NEXT:    v_mov_b32_e32 v2, s1
663; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
664; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
665; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
666; SI-NEXT:    s_endpgm
667;
668; GFX9-LABEL: mad_i64_i32_uniform:
669; GFX9:       ; %bb.0:
670; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
671; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
672; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
673; GFX9-NEXT:    v_mov_b32_e32 v2, 0
674; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX9-NEXT:    s_mul_i32 s0, s2, s3
676; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
677; GFX9-NEXT:    s_add_u32 s0, s0, s4
678; GFX9-NEXT:    s_addc_u32 s1, s1, s5
679; GFX9-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
680; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
681; GFX9-NEXT:    s_endpgm
682;
683; GFX11-LABEL: mad_i64_i32_uniform:
684; GFX11:       ; %bb.0:
685; GFX11-NEXT:    s_clause 0x2
686; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
687; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
688; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
689; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
690; GFX11-NEXT:    s_mul_i32 s6, s2, s3
691; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
692; GFX11-NEXT:    s_add_u32 s2, s6, s4
693; GFX11-NEXT:    s_addc_u32 s3, s3, s5
694; GFX11-NEXT:    v_mov_b32_e32 v0, s2
695; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
696; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
697; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
698; GFX11-NEXT:    s_endpgm
699  %ext0 = zext i32 %arg0 to i64
700  %ext1 = zext i32 %arg1 to i64
701  %mul = mul i64 %ext0, %ext1
702  %mad = add i64 %mul, %arg2
703  store i64 %mad, i64 addrspace(1)* %out
704  ret void
705}
706
; Multi-use case: one signed 32x32->64 product feeds two different adds
; (%arg2 and %arg3), and the two sums are xor'ed to keep both live.
; The CI, GFX9 and GFX11 check lines expect two v_mad_i64_i32 instructions,
; one per addend, each repeating the multiply. The SI check lines expect the
; product to be computed once (v_mul_lo_u32 + v_mul_hi_i32) and reused by two
; add-with-carry pairs.
707define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
708; CI-LABEL: mad_i64_i32_twice:
709; CI:       ; %bb.0:
710; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
711; CI-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
712; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
713; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
714; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
715; CI-NEXT:    s_setpc_b64 s[30:31]
716;
717; SI-LABEL: mad_i64_i32_twice:
718; SI:       ; %bb.0:
719; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
720; SI-NEXT:    v_mul_lo_u32 v6, v0, v1
721; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
722; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
723; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
724; SI-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
725; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v5, vcc
726; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
727; SI-NEXT:    v_xor_b32_e32 v0, v2, v3
728; SI-NEXT:    s_setpc_b64 s[30:31]
729;
730; GFX9-LABEL: mad_i64_i32_twice:
731; GFX9:       ; %bb.0:
732; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
734; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
735; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
736; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
737; GFX9-NEXT:    s_setpc_b64 s[30:31]
738;
739; GFX11-LABEL: mad_i64_i32_twice:
740; GFX11:       ; %bb.0:
741; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
743; GFX11-NEXT:    v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
744; GFX11-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
745; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
746; GFX11-NEXT:    v_xor_b32_e32 v0, v6, v2
747; GFX11-NEXT:    v_xor_b32_e32 v1, v7, v3
748; GFX11-NEXT:    s_setpc_b64 s[30:31]
749  %sext0 = sext i32 %arg0 to i64
750  %sext1 = sext i32 %arg1 to i64
751  %mul = mul i64 %sext0, %sext1
752  %mad1 = add i64 %mul, %arg2
753  %mad2 = add i64 %mul, %arg3
754  %out = xor i64 %mad1, %mad2
755  ret i64 %out
756}
757
; Three multiply-adds that share one product: %arg0 and %arg1 are
; sign-extended to i64 and multiplied once, the product is added to each of
; %arg2, %arg3 and %arg4, and the three sums are xor'ed into the result.
; The assertions below record the current selection per check prefix. The
; GFX9 checks use three v_mad_i64_i32. The CI and GFX11 checks use a single
; v_mad_i64_i32 with a zero addend followed by add/add-with-carry pairs. The
; SI checks use v_mul_lo_u32 + v_mul_hi_i32 followed by add/add-with-carry
; pairs.
758define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
759; CI-LABEL: mad_i64_i32_thrice:
760; CI:       ; %bb.0:
761; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
763; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
764; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
765; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v4
766; CI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
767; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
768; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
769; CI-NEXT:    v_xor_b32_e32 v3, v3, v5
770; CI-NEXT:    v_xor_b32_e32 v2, v2, v4
771; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
772; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
773; CI-NEXT:    s_setpc_b64 s[30:31]
774;
775; SI-LABEL: mad_i64_i32_thrice:
776; SI:       ; %bb.0:
777; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778; SI-NEXT:    v_mul_lo_u32 v8, v0, v1
779; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
780; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v2
781; SI-NEXT:    v_addc_u32_e32 v2, vcc, v0, v3, vcc
782; SI-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
783; SI-NEXT:    v_addc_u32_e32 v4, vcc, v0, v5, vcc
784; SI-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
785; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v7, vcc
786; SI-NEXT:    v_xor_b32_e32 v2, v2, v4
787; SI-NEXT:    v_xor_b32_e32 v3, v1, v3
788; SI-NEXT:    v_xor_b32_e32 v1, v2, v0
789; SI-NEXT:    v_xor_b32_e32 v0, v3, v5
790; SI-NEXT:    s_setpc_b64 s[30:31]
791;
792; GFX9-LABEL: mad_i64_i32_thrice:
793; GFX9:       ; %bb.0:
794; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
796; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
797; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
798; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
799; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
800; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
801; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
802; GFX9-NEXT:    s_setpc_b64 s[30:31]
803;
804; GFX11-LABEL: mad_i64_i32_thrice:
805; GFX11:       ; %bb.0:
806; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
808; GFX11-NEXT:    v_mad_i64_i32 v[8:9], null, v0, v1, 0
809; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
810; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v2
811; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
812; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
813; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
814; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v6
815; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
816; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
817; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v2
818; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
819; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
820; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
821; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
822; GFX11-NEXT:    s_setpc_b64 s[30:31]
823  %sext0 = sext i32 %arg0 to i64
824  %sext1 = sext i32 %arg1 to i64
825  %mul = mul i64 %sext0, %sext1
  ; Each add below reuses the same %mul with a different 64-bit addend.
826  %mad1 = add i64 %mul, %arg2
827  %mad2 = add i64 %mul, %arg3
828  %mad3 = add i64 %mul, %arg4
  ; Combine all three sums so that none of them is dead.
829  %out.p = xor i64 %mad1, %mad2
830  %out = xor i64 %out.p, %mad3
831  ret i64 %out
832}
833
; Multiply-add whose product has a second user: the sign-extended product
; %mul feeds both the add with %arg2 and the final xor, so the plain product
; is needed in addition to the sum.
; The assertions below record the current selection per check prefix. The
; GFX9 checks use two v_mad_i64_i32 (one with a zero addend, one with
; v[2:3]). The CI and GFX11 checks use one v_mad_i64_i32 with a zero addend
; plus an add/add-with-carry pair. The SI checks use v_mul_lo_u32 +
; v_mul_hi_i32 plus an add/add-with-carry pair.
834define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
835; CI-LABEL: mad_i64_i32_secondary_use:
836; CI:       ; %bb.0:
837; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
838; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
839; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
840; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
841; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
842; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
843; CI-NEXT:    s_setpc_b64 s[30:31]
844;
845; SI-LABEL: mad_i64_i32_secondary_use:
846; SI:       ; %bb.0:
847; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
848; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
849; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
850; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
851; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
852; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
853; SI-NEXT:    v_xor_b32_e32 v0, v2, v4
854; SI-NEXT:    s_setpc_b64 s[30:31]
855;
856; GFX9-LABEL: mad_i64_i32_secondary_use:
857; GFX9:       ; %bb.0:
858; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
860; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
861; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
862; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
863; GFX9-NEXT:    s_setpc_b64 s[30:31]
864;
865; GFX11-LABEL: mad_i64_i32_secondary_use:
866; GFX11:       ; %bb.0:
867; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
869; GFX11-NEXT:    v_mad_i64_i32 v[4:5], null, v0, v1, 0
870; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
871; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
872; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
873; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
874; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
875; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
876; GFX11-NEXT:    s_setpc_b64 s[30:31]
877  %sext0 = sext i32 %arg0 to i64
878  %sext1 = sext i32 %arg1 to i64
879  %mul = mul i64 %sext0, %sext1
880  %mad = add i64 %mul, %arg2
  ; Second use of %mul: the product itself, not only the sum, reaches the result.
881  %out = xor i64 %mad, %mul
882  ret i64 %out
883}
884
; 48-bit multiply-add: computes %arg0 * %arg1 + %arg2 entirely in i48.
; The assertions below record the current selection per check prefix. In the
; CI, GFX9 and GFX11 checks a v_mad_u64_u32 takes the 64-bit addend v[4:5],
; and two v_mul_lo_u32 results are then added into the high dword v1 (two
; v_add_i32 on CI, a single v_add3_u32 on GFX9 and GFX11). The SI checks use
; v_mul_lo_u32 / v_mul_hi_u32 and finish with an add/add-with-carry pair.
; NOTE(review): the checks suggest each i48 argument occupies a pair of
; 32-bit VGPRs (v0/v1, v2/v3, v4/v5) -- confirm against the calling convention.
885define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
886; CI-LABEL: mad_i48_i48:
887; CI:       ; %bb.0:
888; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889; CI-NEXT:    v_mov_b32_e32 v6, v1
890; CI-NEXT:    v_mov_b32_e32 v7, v0
891; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
892; CI-NEXT:    v_mul_lo_u32 v2, v6, v2
893; CI-NEXT:    v_mul_lo_u32 v3, v7, v3
894; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
895; CI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
896; CI-NEXT:    s_setpc_b64 s[30:31]
897;
898; SI-LABEL: mad_i48_i48:
899; SI:       ; %bb.0:
900; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
901; SI-NEXT:    v_mul_lo_u32 v3, v0, v3
902; SI-NEXT:    v_mul_hi_u32 v6, v0, v2
903; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
904; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
905; SI-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
906; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
907; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
908; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
909; SI-NEXT:    s_setpc_b64 s[30:31]
910;
911; GFX9-LABEL: mad_i48_i48:
912; GFX9:       ; %bb.0:
913; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
914; GFX9-NEXT:    v_mov_b32_e32 v6, v1
915; GFX9-NEXT:    v_mov_b32_e32 v7, v0
916; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
917; GFX9-NEXT:    v_mul_lo_u32 v3, v7, v3
918; GFX9-NEXT:    v_mul_lo_u32 v2, v6, v2
919; GFX9-NEXT:    v_add3_u32 v1, v2, v1, v3
920; GFX9-NEXT:    s_setpc_b64 s[30:31]
921;
922; GFX11-LABEL: mad_i48_i48:
923; GFX11:       ; %bb.0:
924; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
925; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
926; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
927; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
928; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
929; GFX11-NEXT:    v_mul_lo_u32 v3, v7, v3
930; GFX11-NEXT:    v_mul_lo_u32 v2, v6, v2
931; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
932; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
933; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; No sext/zext here: the multiply and the add are both performed directly on i48.
934  %m = mul i48 %arg0, %arg1
935  %a = add i48 %m, %arg2
936  ret i48 %a
937}
938
; Attribute groups for this test. #0 (nounwind) is the group attached to the
; function definitions directly above.
; NOTE(review): no function visible in this part of the file references #1;
; confirm it is still used elsewhere in the file before keeping or removing it.
939attributes #0 = { nounwind }
940attributes #1 = { nounwind readnone speculatable }
941