; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Codegen tests for the 64-bit multiply-with-overflow intrinsics
; (@llvm.umul.with.overflow.i64 / @llvm.smul.with.overflow.i64) across four
; AMDGPU generations: SI (no mcpu), GFX9, GFX10 and GFX11.  Each function is
; checked once per prefix; the check bodies are autogenerated — do not edit
; them by hand, rerun utils/update_llc_test_checks.py instead.

; Unsigned i64 overflow-multiply with both operands in VGPRs (divergent values).
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v5, v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %umulo
}

; Signed i64 overflow-multiply with both operands in VGPRs.
define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v7, v6
; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v9, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
; GFX9-NEXT:    v_mov_b32_e32 v5, v4
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX10-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT:    v_mad_i64_i32 v[11:12], null, v5, v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %smulo
}

; Unsigned i64 overflow-multiply with uniform (SGPR, kernel-argument) operands;
; result is zeroed on overflow and stored.
define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s1, s1, s3
; SI-NEXT:    s_mul_i32 s0, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: umulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s6
; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s10, 0
; GFX9-NEXT:    s_mul_i32 s1, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s1
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_mul_i32 s6, s1, s2
; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX10-NEXT:    s_mul_i32 s1, s1, s3
; GFX10-NEXT:    s_add_u32 s3, s8, s7
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_u32 s3, s3, s6
; GFX10-NEXT:    s_addc_u32 s3, s5, s4
; GFX10-NEXT:    s_addc_u32 s5, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s3, s1
; GFX10-NEXT:    s_addc_u32 s5, 0, s5
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s6
; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: umulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_mul_i32 s6, s1, s2
; GFX11-NEXT:    s_mul_hi_u32 s9, s1, s3
; GFX11-NEXT:    s_mul_i32 s1, s1, s3
; GFX11-NEXT:    s_add_u32 s3, s8, s7
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_u32 s3, s3, s6
; GFX11-NEXT:    s_addc_u32 s3, s5, s4
; GFX11-NEXT:    s_addc_u32 s5, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s3, s1
; GFX11-NEXT:    s_addc_u32 s5, 0, s5
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s6
; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

; Signed i64 overflow-multiply with uniform (SGPR, kernel-argument) operands;
; result is zeroed on overflow and stored.
define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s6, s1, s3
; SI-NEXT:    s_cmp_lt_i32 s1, 0
; SI-NEXT:    s_mul_i32 s1, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: smulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s5, s1, s2
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s5
; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s6, s4
; GFX9-NEXT:    s_addc_u32 s6, s10, 0
; GFX9-NEXT:    s_mul_i32 s9, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s9
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_sub_u32 s9, s4, s2
; GFX9-NEXT:    s_subb_u32 s10, s6, 0
; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s5
; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_mul_i32 s0, s0, s2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s7, s0, s3
; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX10-NEXT:    s_mul_i32 s5, s1, s2
; GFX10-NEXT:    s_add_u32 s11, s8, s7
; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX10-NEXT:    s_add_u32 s11, s11, s5
; GFX10-NEXT:    s_mul_i32 s10, s1, s3
; GFX10-NEXT:    s_addc_u32 s4, s6, s4
; GFX10-NEXT:    s_addc_u32 s6, s9, 0
; GFX10-NEXT:    s_add_u32 s4, s4, s10
; GFX10-NEXT:    s_addc_u32 s6, 0, s6
; GFX10-NEXT:    s_sub_u32 s9, s4, s2
; GFX10-NEXT:    s_subb_u32 s10, s6, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s9
; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s10
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT:    s_add_i32 s1, s8, s7
; GFX10-NEXT:    s_mul_i32 s0, s0, s2
; GFX10-NEXT:    s_add_i32 s1, s1, s5
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: smulo_i64_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s7, s0, s3
; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX11-NEXT:    s_mul_i32 s5, s1, s2
; GFX11-NEXT:    s_add_u32 s11, s8, s7
; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
; GFX11-NEXT:    s_add_u32 s11, s11, s5
; GFX11-NEXT:    s_mul_i32 s10, s1, s3
; GFX11-NEXT:    s_addc_u32 s4, s6, s4
; GFX11-NEXT:    s_addc_u32 s6, s9, 0
; GFX11-NEXT:    s_add_u32 s4, s4, s10
; GFX11-NEXT:    s_addc_u32 s6, 0, s6
; GFX11-NEXT:    s_sub_u32 s9, s4, s2
; GFX11-NEXT:    s_subb_u32 s10, s6, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10
; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT:    s_add_i32 s1, s8, s7
; GFX11-NEXT:    s_mul_i32 s0, s0, s2
; GFX11-NEXT:    s_add_i32 s1, s1, s5
; GFX11-NEXT:    v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3
; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s5, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

; Signed overflow-multiply by the constant 4: should lower to a shift plus an
; arithmetic-shift-back compare instead of a full multiply.
define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v5
; SI-NEXT:    v_mov_b32_e32 v1, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: smulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: smulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

; Unsigned overflow-multiply by the constant 4: overflow iff either of the top
; two bits of %i is set, checked with a mask instead of a multiply.
define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; SI-NEXT:    v_mov_b32_e32 v6, v0
; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: umulo_i64_v_4:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: umulo_i64_v_4:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)