; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s

; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
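; Throughout this file a value is known to be a signed 24-bit quantity because
; it is produced by a shift pair, e.g. for the i32 inputs of the function
; below:
;   %a.shl = shl i32 %a, 8
;   %a.24 = ashr i32 %a.shl, 8
; The EG checks therefore sign-extend the operands with LSHL/ASHR and use the
; 32-bit MULLO_INT/MULHI_INT, while the CM checks can use MULHI_INT24 for the
; high half (see test_smulhi24_i64 below).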
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
; SI-NEXT: s_bfe_i32 s4, s5, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %mul24 = mul i32 %a.24, %b.24
  store i32 %mul24, i32 addrspace(1)* %out
  ret void
}

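; Only the high 32 bits of the 48-bit product are stored here: the SI/VI
; checks expect v_mul_hi_i32_i24, GFX9 expects s_bfe_i32 plus a scalar
; s_mul_hi_i32, Evergreen falls back to LSHL/ASHR plus MULHI_INT, and Cayman
; expects MULHI_INT24.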
define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smulhi24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %a.24.i64 = sext i32 %a.24 to i64
  %b.24.i64 = sext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, i32 addrspace(1)* %out
  ret void
}

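; For i64 operands the shl/ashr-by-40 pair leaves 24 significant bits, so the
; full 48-bit product is expected to come from a v_mul_i32_i24 /
; v_mul_hi_i32_i24 pair on the GCN targets; the EG/CM checks only expect an
; empty CF_END/PAD body here (these are not amdgpu_kernel functions).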
define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: test_smul48_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
  %shl.lhs = shl i64 %lhs, 40
  %lhs24 = ashr i64 %shl.lhs, 40
  %shl.rhs = shl i64 %rhs, 40
  %rhs24 = ashr i64 %shl.rhs, 40
  %mul = mul i64 %lhs24, %rhs24
  ret i64 %mul
}

define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_ashr_i64 v[5:6], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_ashr_i64 v[6:7], v[2:3], 40
; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v1, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
; SI-NEXT: v_mul_i32_i24_e32 v2, v5, v6
; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v5, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; VI-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; GFX9-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; GFX9-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_v2i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
  %shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
  %lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
  %shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
  %rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
  %mul = mul <2 x i64> %lhs24, %rhs24
  ret <2 x i64> %mul
}

; This requires handling of the original 64-bit mul node to eliminate
; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are
; multiple uses by the separate mul and mulhi.
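; In other words, once the i64 multiply has been split, the sign-extended
; operands feed both the low-half mul and the high-half mulhi, so no single
; use lets SimplifyDemandedBits prove the extensions redundant; the combine
; has to look at the original 64-bit multiply instead.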
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0x13
; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mul_i32 s1, s0, s1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
; VI-NEXT: s_load_dword s5, s[0:1], 0x70
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %shl1.i = shl i32 %b, 8
  %shr2.i = ashr i32 %shl1.i, 8
  %conv3.i = sext i32 %shr2.i to i64
  %mul.i = mul i64 %conv3.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i64_square:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s5, s4, s4
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64_square:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64_square:
; CM: ; %bb.0:
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %mul.i = mul i64 %conv.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

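; The i33 inputs are sign-extended from 24 bits with a shl/ashr-by-9 pair, and
; the i33 result is sign-extended again to i64 for the store, so on top of the
; 24-bit multiply the checks expect the sign of bit 32 to be restored: a
; 31-bit shl/ashr of the 64-bit pair on the GCN targets and a 1-bit BFE_INT of
; the high word on r600.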
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s1, s2, 8
; SI-NEXT: s_lshl_b32 s3, s0, 8
; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mul_i32 s1, s0, s2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s3, s2, 8
; VI-NEXT: s_lshl_b32 s5, s4, 8
; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %a.shl = shl i33 %a, 9
  %a.24 = ashr i33 %a.shl, 9
  %b.shl = shl i33 %b, 9
  %b.24 = ashr i33 %b.shl, 9
  %mul24 = mul i33 %a.24, %b.24
  %ext = sext i33 %mul24 to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

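; Only bit 32 of the i33 product survives the lshr/trunc, so the mul_hi
; result is masked with an AND by 1 (v_and_b32 / s_and_b32 / AND_INT in the
; checks below).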
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PS, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %tmp0 = shl i33 %a, 9
  %a_24 = ashr i33 %tmp0, 9
  %tmp1 = shl i33 %b, 9
  %b_24 = ashr i33 %tmp1, 9
  %tmp2 = mul i33 %a_24, %b_24
  %hi = lshr i33 %tmp2, 32
  %trunc = trunc i33 %hi to i32

  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

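; Presumably a reduced regression test for a crash in the i24 mul
; simplification (going by the name). Both multiply operands are splats of
; lane 0, so the same product is expected in both components of the stored
; <2 x i32> value.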
define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 .LBB8_2
; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB8_2: ; %bb11
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
; SI-NEXT: s_bfe_i32 s4, s6, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: simplify_i24_crash:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB8_2
; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s6, 0x180000
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB8_2
; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB8_2: ; %bb11
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: simplify_i24_crash:
; EG: ; %bb.0: ; %bb
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @5 POP:1
; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
; EG-NEXT: POP @5 POP:1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MOV T2.W, KC0[2].Y,
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV * T0.Y, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: simplify_i24_crash:
; CM: ; %bb.0: ; %bb
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: JUMP @5 POP:1
; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: POP @5 POP:1
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, KC0[2].Y,
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
  %cmp = icmp eq i32 %arg0, 0
  br i1 %cmp, label %bb11, label %bb7

bb11:
  %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
  %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
  %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
  %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
  %tmp21 = mul <2 x i32> %tmp18, %tmp20
  store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
  br label %bb7

bb7:
  ret void

}
attributes #0 = { nounwind }