; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.

; Each function below builds a 64-bit (or wider/narrower) multiply-add out of
; extended or masked 32-bit operands. The check lines record which targets fold
; it into v_mad_i64_i32 / v_mad_u64_u32 and which expand it into separate
; multiply and add-with-carry instructions.

; sext(i32) * sext(i32) + i64, product on the left of the add.
define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_sextops:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_sextops:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}

; Same as mad_i64_i32_sextops, but with the add operands swapped.
define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_sextops_commute:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_sextops_commute:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %arg2, %mul
  ret i64 %mad
}

; zext(i32) * zext(i32) + i64, product on the left of the add.
define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_u64_u32_zextops:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_u64_u32_zextops:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_u64_u32_zextops:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_u64_u32_zextops:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %zext0 = zext i32 %arg0 to i64
  %zext1 = zext i32 %arg1 to i64
  %mul = mul i64 %zext0, %zext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}

; Same as mad_u64_u32_zextops, but with the add operands swapped.
define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_u64_u32_zextops_commute:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_u64_u32_zextops_commute:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_u64_u32_zextops_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_u64_u32_zextops_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %zext0 = zext i32 %arg0 to i64
  %zext1 = zext i32 %arg1 to i64
  %mul = mul i64 %zext0, %zext1
  %mad = add i64 %arg2, %mul
  ret i64 %mad
}

; i32 operands sign-extended all the way to i128 with an i128 addend. The
; checks on every target show a multi-instruction expansion rather than a
; single mad.
define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; CI-LABEL: mad_i64_i32_sextops_i32_i128:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
; CI-NEXT:    v_mov_b32_e32 v8, 0
; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
; CI-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
; CI-NEXT:    v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
; CI-NEXT:    v_mov_b32_e32 v7, v10
; CI-NEXT:    v_mov_b32_e32 v10, v8
; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
; CI-NEXT:    v_add_i32_e32 v9, vcc, v7, v9
; CI-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
; CI-NEXT:    v_add_i32_e32 v7, vcc, v9, v0
; CI-NEXT:    v_addc_u32_e32 v9, vcc, v10, v1, vcc
; CI-NEXT:    v_mov_b32_e32 v1, v8
; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
; CI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_sextops_i32_i128:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
; SI-NEXT:    v_mul_hi_u32 v14, v6, v1
; SI-NEXT:    v_mul_lo_u32 v13, v0, v7
; SI-NEXT:    v_mul_hi_u32 v10, v0, v7
; SI-NEXT:    v_add_i32_e32 v12, vcc, v11, v12
; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
; SI-NEXT:    v_mul_hi_u32 v8, v6, v7
; SI-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
; SI-NEXT:    v_mul_i32_i24_e32 v9, v6, v7
; SI-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
; SI-NEXT:    v_mul_hi_i32 v6, v1, v6
; SI-NEXT:    v_mul_hi_i32 v7, v7, v0
; SI-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v11
; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
; SI-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
; SI-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
; SI-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
; GFX9-NEXT:    v_mov_b32_e32 v9, 0
; GFX9-NEXT:    v_mov_b32_e32 v8, v7
; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v11
; GFX9-NEXT:    v_mov_b32_e32 v11, v9
; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
; GFX9-NEXT:    v_mov_b32_e32 v12, v11
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v10
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, 0
; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
; GFX11-NEXT:    v_mad_i64_i32 v[9:10], null, v1, v14, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, v12
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
; GFX11-NEXT:    v_add_co_u32 v7, s0, v7, v8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, 0, s0
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
; GFX11-NEXT:    v_mov_b32_e32 v7, v11
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i128
  %sext1 = sext i32 %arg1 to i128
  %mul = mul i128 %sext0, %sext1
  %mad = add i128 %mul, %arg2
  ret i128 %mad
}

; i32 operands sign-extended to the non-power-of-two type i63, i63 addend.
define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; CI-LABEL: mad_i64_i32_sextops_i32_i63:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_sextops_i32_i63:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_i32_i63:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i63
  %sext1 = sext i32 %arg1 to i63
  %mul = mul i63 %sext0, %sext1
  %mad = add i63 %mul, %arg2
  ret i63 %mad
}

; i31 operands sign-extended to i63. The checks show the operands being
; sign-extended in-register (v_bfe_i32, or shift pairs on SI) before the
; multiply-add.
define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; CI-LABEL: mad_i64_i32_sextops_i31_i63:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_bfe_i32 v1, v1, 0, 31
; CI-NEXT:    v_bfe_i32 v0, v0, 0, 31
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_sextops_i31_i63:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
; SI-NEXT:    v_ashr_i64 v[4:5], v[3:4], 33
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 33
; SI-NEXT:    v_mul_lo_u32 v1, v4, v0
; SI-NEXT:    v_mul_hi_i32 v4, v4, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v4, v3, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 31
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 31
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_i31_i63:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_bfe_i32 v4, v1, 0, 31
; GFX11-NEXT:    v_bfe_i32 v5, v0, 0, 31
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i31 %arg0 to i63
  %sext1 = sext i31 %arg1 to i63
  %mul = mul i63 %sext0, %sext1
  %mad = add i63 %mul, %arg2
  ret i63 %mad
}

; Mixed extensions: the first operand is sign-extended, the second is
; zero-extended. The checks show an unsigned mad plus a separate correction
; of the high half.
define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_extops_i32_i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
; CI-NEXT:    v_mul_lo_u32 v4, v4, v1
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_extops_i32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
; SI-NEXT:    v_mul_hi_u32 v5, v0, v1
; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, v5, v4
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
; GFX11-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %ext0 = sext i32 %arg0 to i64
  %ext1 = zext i32 %arg1 to i64
  %mul = mul i64 %ext0, %ext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}

; i64 operands truncated to their low 32 bits with an and-mask of
; 0xffffffff (4294967295) instead of an explicit zext.
define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_u64_u32_bitops:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_u64_u32_bitops:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_u64_u32_bitops:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_u64_u32_bitops:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc.lhs = and i64 %arg0, 4294967295
  %trunc.rhs = and i64 %arg1, 4294967295
  %mul = mul i64 %trunc.lhs, %trunc.rhs
  %add = add i64 %mul, %arg2
  ret i64 %add
}

; Like mad_u64_u32_bitops, but the lhs mask is 8589934591 (2^33 - 1), so the
; lhs keeps 33 bits and no longer fits a 32-bit operand. The checks show an
; extra step handling bit 32 of the lhs.
define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, 1, v1
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
; CI-NEXT:    v_mul_lo_u32 v2, v3, v2
; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 1, v1
; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v3, 1, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT:    v_mov_b32_e32 v6, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc.lhs = and i64 %arg0, 8589934591
  %trunc.rhs = and i64 %arg1, 4294967295
  %mul = mul i64 %trunc.lhs, %trunc.rhs
  %add = add i64 %mul, %arg2
  ret i64 %add
}

; Mirror of mad_u64_u32_bitops_lhs_mask_small: here the rhs carries the
; 33-bit mask (8589934591) and the lhs the 32-bit one.
define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v6, v0
; CI-NEXT:    v_and_b32_e32 v3, 1, v3
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
; CI-NEXT:    v_mul_lo_u32 v2, v6, v3
; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 1, v3
; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
; SI-NEXT:    v_mul_lo_u32 v1, v0, v1
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v6, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc.lhs = and i64 %arg0, 4294967295
  %trunc.rhs = and i64 %arg1, 8589934591
  %mul = mul i64 %trunc.lhs, %trunc.rhs
  %add = add i64 %mul, %arg2
  ret i64 %add
}

; Signed counterpart of mad_u64_u32_bitops: each i64 operand is reduced to
; its sign-extended low 32 bits with a shl-by-32 / ashr-by-32 pair.
define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_bitops:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_bitops:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
; SI-NEXT:    v_mul_hi_i32 v2, v0, v2
; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_bitops:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_bitops:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl.lhs = shl i64 %arg0, 32
  %trunc.lhs = ashr i64 %shl.lhs, 32
  %shl.rhs = shl i64 %arg1, 32
  %trunc.rhs = ashr i64 %shl.rhs, 32
  %mul = mul i64 %trunc.lhs, %trunc.rhs
  %add = add i64 %mul, %arg2
  ret i64 %add
}

; Example from bug report
; The high and low halves of %arg0 are multiplied and %arg0 itself is the
; addend, so both multiplicands and the addend live in the same register
; pair. The gfx1100 checks write the result to a separate pair and copy it
; back afterwards.
define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; CI-LABEL: mad_i64_i32_unpack_i64ops:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_unpack_i64ops:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v2, v1, v0
; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_unpack_i64ops:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %tmp4 = lshr i64 %arg0, 32
  %tmp5 = and i64 %arg0, 4294967295
  %mul = mul nuw i64 %tmp4, %tmp5
  %mad = add i64 %mul, %arg0
  ret i64 %mad
}

; Kernel variant: the operands arrive as kernel arguments and the result is
; stored to %out. The gfx90a and gfx1100 checks do the multiply-add with
; scalar instructions (s_mul_i32 / s_mul_hi_u32 / s_add_u32 / s_addc_u32).
define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_uniform:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v2, s3
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    v_mov_b32_e32 v1, s5
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; SI-LABEL: mad_i64_i32_uniform:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
; SI-NEXT:    s_mul_i32 s2, s2, s3
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v2, s1
; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: mad_i64_i32_uniform:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s2, s3
; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
; GFX9-NEXT:    s_add_u32 s0, s0, s4
; GFX9-NEXT:    s_addc_u32 s1, s1, s5
; GFX9-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: mad_i64_i32_uniform:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s6, s2, s3
; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
; GFX11-NEXT:    s_add_u32 s2, s6, s4
; GFX11-NEXT:    s_addc_u32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %ext0 = zext i32 %arg0 to i64
  %ext1 = zext i32 %arg1 to i64
  %mul = mul i64 %ext0, %ext1
  %mad = add i64 %mul, %arg2
  store i64 %mad, i64 addrspace(1)* %out
  ret void
}

; The same product feeds two different adds; the two sums are xor-ed so both
; stay live.
define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; CI-LABEL: mad_i64_i32_twice:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_twice:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v6, v0, v1
; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v5, vcc
; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
; SI-NEXT:    v_xor_b32_e32 v0, v2, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_twice:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_twice:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
; GFX11-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_xor_b32_e32 v0, v6, v2
; GFX11-NEXT:    v_xor_b32_e32 v1, v7, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad1 = add i64 %mul, %arg2
  %mad2 = add i64 %mul, %arg3
  %out = xor i64 %mad1, %mad2
  ret i64 %out
}

; The same product feeds three different adds. The hawaii and gfx1100 checks
; compute the product once (mad with a zero addend) and then add three times;
; the gfx90a checks use three separate mads.
define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
; CI-LABEL: mad_i64_i32_thrice:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v4
; CI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
; CI-NEXT:    v_xor_b32_e32 v3, v3, v5
; CI-NEXT:    v_xor_b32_e32 v2, v2, v4
; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_thrice:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v8, v0, v1
; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v2
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
; SI-NEXT:    v_addc_u32_e32 v4, vcc, v0, v5, vcc
; SI-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v7, vcc
; SI-NEXT:    v_xor_b32_e32 v2, v2, v4
; SI-NEXT:    v_xor_b32_e32 v3, v1, v3
; SI-NEXT:    v_xor_b32_e32 v1, v2, v0
; SI-NEXT:    v_xor_b32_e32 v0, v3, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_thrice:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_thrice:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mad_i64_i32 v[8:9], null, v0, v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad1 = add i64 %mul, %arg2
  %mad2 = add i64 %mul, %arg3
  %mad3 = add i64 %mul, %arg4
  %out.p = xor i64 %mad1, %mad2
  %out = xor i64 %out.p, %mad3
  ret i64 %out
}

; The product is used both by the add and directly by the xor, so it must
; also be available on its own.
define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; CI-LABEL: mad_i64_i32_secondary_use:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; SI-LABEL: mad_i64_i32_secondary_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
; SI-NEXT:    v_xor_b32_e32 v0, v2, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_secondary_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_secondary_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mad_i64_i32 v[4:5], null, v0, v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %mul, %arg2
  %out = xor i64 %mad, %mul
  ret i64 %out
}

define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; CI-LABEL: mad_i48_i48:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v6, v1
; CI-NEXT:    v_mov_b32_e32 v7, v0
; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
; CI-NEXT:    v_mul_lo_u32 v2, v6, v2
; CI-NEXT:    v_mul_lo_u32 v3, v7, v3
; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; CI-NEXT:
v_add_i32_e32 v1, vcc, v3, v1 896; CI-NEXT: s_setpc_b64 s[30:31] 897; 898; SI-LABEL: mad_i48_i48: 899; SI: ; %bb.0: 900; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 901; SI-NEXT: v_mul_lo_u32 v3, v0, v3 902; SI-NEXT: v_mul_hi_u32 v6, v0, v2 903; SI-NEXT: v_mul_lo_u32 v1, v1, v2 904; SI-NEXT: v_mul_lo_u32 v0, v0, v2 905; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v3 906; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1 907; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4 908; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc 909; SI-NEXT: s_setpc_b64 s[30:31] 910; 911; GFX9-LABEL: mad_i48_i48: 912; GFX9: ; %bb.0: 913; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 914; GFX9-NEXT: v_mov_b32_e32 v6, v1 915; GFX9-NEXT: v_mov_b32_e32 v7, v0 916; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5] 917; GFX9-NEXT: v_mul_lo_u32 v3, v7, v3 918; GFX9-NEXT: v_mul_lo_u32 v2, v6, v2 919; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 920; GFX9-NEXT: s_setpc_b64 s[30:31] 921; 922; GFX11-LABEL: mad_i48_i48: 923; GFX11: ; %bb.0: 924; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 925; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 926; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0 927; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) 928; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5] 929; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3 930; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2 931; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 932; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3 933; GFX11-NEXT: s_setpc_b64 s[30:31] 934 %m = mul i48 %arg0, %arg1 935 %a = add i48 %m, %arg2 936 ret i48 %a 937} 938 939attributes #0 = { nounwind } 940attributes #1 = { nounwind readnone speculatable } 941