; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s

; Each function below multiplies operands that were first sign-extended from
; their low 24 bits with a shl/ashr pair, and the check lines record the code
; llc emits for it on three amdgcn configurations (default, tonga, gfx900) and
; two r600 configurations (redwood, cayman). The check lines are generated by
; the script named on the first line; regenerate them with that script instead
; of editing them by hand.

; 32-bit case: %a and %b are each narrowed to a signed 24-bit value
; (shl 8 / ashr 8), multiplied as i32, and the i32 product is stored to %out.
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
; SI-NEXT: s_bfe_i32 s4, s5, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %mul24 = mul i32 %a.24, %b.24
  store i32 %mul24, i32 addrspace(1)* %out
  ret void
}

; High-half case: the two signed 24-bit values are sign-extended to i64 and
; multiplied; only bits 63:32 of the product (lshr 32 + trunc) are stored to
; %out as an i32.
define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smulhi24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %a.24.i64 = sext i32 %a.24 to i64
  %b.24.i64 = sext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, i32 addrspace(1)* %out
  ret void
}

; Non-kernel function taking and returning i64: both operands are narrowed to
; signed 24-bit values directly in i64 (shl 40 / ashr 40) and the full i64
; product is returned. The two r600 check blocks for this function contain
; only CF_END and PAD.
define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: test_smul48_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_ashr_i64 v[3:4], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
  %shl.lhs = shl i64 %lhs, 40
  %lhs24 = ashr i64 %shl.lhs, 40
  %shl.rhs = shl i64 %rhs, 40
  %rhs24 = ashr i64 %shl.rhs, 40
  %mul = mul i64 %lhs24, %rhs24
  ret i64 %mul
}

; <2 x i64> variant of the function above: every lane of both operands is
; narrowed to a signed 24-bit value (shl 40 / ashr 40 per lane) and the
; lane-wise i64 products are returned.
define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_ashr_i64 v[5:6], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_ashr_i64 v[6:7], v[2:3], 40
; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v1, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
; SI-NEXT: v_mul_i32_i24_e32 v2, v5, v6
; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v5, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; VI-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; GFX9-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; GFX9-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_v2i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
  %shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
  %lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
  %shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
  %rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
  %mul = mul <2 x i64> %lhs24, %rhs24
  ret <2 x i64> %mul
}

; This requires handling of the original 64-bit mul node to eliminate
; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are
; multiple uses by the separate mul and mulhi.
; Full 64-bit product of two signed 24-bit values: %a and %b are narrowed with
; shl 8 / ashr 8, sign-extended to i64, multiplied (the value derived from %b
; is the first mul operand) and the i64 result is stored to %out.
; NOTE(review): the unnamed [8 x i32] parameters presumably exist to space out
; the kernel-argument offsets of %a and %b - confirm.
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0x13
; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mul_i32 s1, s0, s1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
; VI-NEXT: s_load_dword s5, s[0:1], 0x70
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %shl1.i = shl i32 %b, 8
  %shr2.i = ashr i32 %shl1.i, 8
  %conv3.i = sext i32 %shr2.i to i64
  %mul.i = mul i64 %conv3.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

; Squaring case: the signed 24-bit value derived from %a is sign-extended to
; i64 and multiplied by itself, so both mul operands are the same value.
; %b is declared in the signature but is not used by the body.
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i64_square:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s5, s4, s4
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64_square:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64_square:
; CM: ; %bb.0:
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %mul.i = mul i64 %conv.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

; Non-power-of-two width: the i33 operands are narrowed to signed 24-bit
; values (shl 9 / ashr 9), multiplied as i33, and the i33 product is
; sign-extended to i64 before being stored to %out.
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s1, s2, 8
; SI-NEXT: s_lshl_b32 s3, s0, 8
; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mul_i32 s1, s0, s2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s3, s2, 8
; VI-NEXT: s_lshl_b32 s5, s4, 8
; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %a.shl = shl i33 %a, 9
  %a.24 = ashr i33 %a.shl, 9
  %b.shl = shl i33 %b, 9
  %b.24 = ashr i33 %b.shl, 9
  %mul24 = mul i33 %a.24, %b.24
  %ext = sext i33 %mul24 to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; i33 high-bit case: after the same 24-bit narrowing and i33 multiply, only
; bit 32 of the product is kept (lshr 32, then trunc to i32) and stored to
; %out. Unlike the kernels above, this definition does not carry attribute
; group #0.
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PS, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %tmp0 = shl i33 %a, 9
  %a_24 = ashr i33 %tmp0, 9
  %tmp1 = shl i33 %b, 9
  %b_24 = ashr i33 %tmp1, 9
  %tmp2 = mul i33 %a_24, %b_24
  %hi = lshr i33 %tmp2, 32
  %trunc = trunc i33 %hi to i32

  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; Vector multiply behind a branch: when %arg0 == 0 control goes to bb11, which
; splats element 0 of %arg1 and of %arg2 (shufflevector with a zeroinitializer
; mask), narrows every lane to a signed 24-bit value (shl 8 / ashr 8),
; multiplies lane-wise and stores the <2 x i32> result to %out; otherwise the
; kernel returns without storing.
; NOTE(review): the function name suggests this is a regression test for a
; crash while simplifying 24-bit multiply operands - confirm against the
; commit that introduced it.
define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 .LBB8_2
; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB8_2: ; %bb11
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_load_dword s4, s[0:1], 0xf
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: simplify_i24_crash:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB8_2
; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB8_2
; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB8_2: ; %bb11
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: simplify_i24_crash:
; EG: ; %bb.0: ; %bb
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @5 POP:1
; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
; EG-NEXT: POP @5 POP:1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MOV T2.W, KC0[2].Y,
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV * T0.Y, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: simplify_i24_crash:
; CM: ; %bb.0: ; %bb
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: JUMP @5 POP:1
; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: POP @5 POP:1
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, KC0[2].Y,
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
  %cmp = icmp eq i32 %arg0, 0
  br i1 %cmp, label %bb11, label %bb7

bb11:
  %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
  %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
  %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
  %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
  %tmp21 = mul <2 x i32> %tmp18, %tmp20
  store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
  br label %bb7

bb7:
  ret void

}
; Attribute group referenced as #0 by the kernel definitions that carry it.
attributes #0 = { nounwind }