//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64,
               V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
          unsigned Size = Query.Types[0].getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (Size > 32 && MemSize < Size);
        },
        [](const LegalityQuery &Query) {
          return std::make_pair(0, LLT::scalar(32));
        })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (MemSize == 96) &&
                 Query.Types[0].isVector() &&
                 !ST.hasDwordx3LoadStores();
        },
        [=](const LegalityQuery &Query) {
          return std::make_pair(0, V2S32);
        })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({
                           {S32, GlobalPtr, 8, 8},
                           {S32, GlobalPtr, 16, 8},
                           {S32, LocalPtr, 8, 8},
                           {S32, LocalPtr, 16, 8},
                           {S32, PrivatePtr, 8, 8},
                           {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ?
                             0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // llvm.amdgcn.ubfe takes (src, offset, width); the high half of the f64 is
  // the source the exponent field is extracted from.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ?
         &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool
AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  default:
    return true;
  }

  return true;
}