1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //==-----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 17 18 #include "AMDGPU.h" 19 #include "R600InstrInfo.h" 20 #include "R600ISelLowering.h" 21 #include "R600FrameLowering.h" 22 #include "SIInstrInfo.h" 23 #include "SIISelLowering.h" 24 #include "SIFrameLowering.h" 25 #include "Utils/AMDGPUBaseInfo.h" 26 #include "llvm/ADT/Triple.h" 27 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" 28 #include "llvm/CodeGen/MachineFunction.h" 29 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 30 #include "llvm/MC/MCInstrItineraries.h" 31 #include "llvm/Support/MathExtras.h" 32 #include <cassert> 33 #include <cstdint> 34 #include <memory> 35 #include <utility> 36 37 #define GET_SUBTARGETINFO_HEADER 38 #include "AMDGPUGenSubtargetInfo.inc" 39 40 namespace llvm { 41 42 class StringRef; 43 44 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { 45 public: 46 enum Generation { 47 R600 = 0, 48 R700, 49 EVERGREEN, 50 NORTHERN_ISLANDS, 51 SOUTHERN_ISLANDS, 52 SEA_ISLANDS, 53 VOLCANIC_ISLANDS, 54 }; 55 56 enum { 57 ISAVersion0_0_0, 58 ISAVersion7_0_0, 59 ISAVersion7_0_1, 60 ISAVersion7_0_2, 61 ISAVersion8_0_0, 62 ISAVersion8_0_1, 63 ISAVersion8_0_2, 64 ISAVersion8_0_3, 65 ISAVersion8_0_4, 66 ISAVersion8_1_0, 67 }; 68 69 protected: 70 // Basic subtarget description. 71 Triple TargetTriple; 72 Generation Gen; 73 unsigned IsaVersion; 74 unsigned WavefrontSize; 75 int LocalMemorySize; 76 int LDSBankCount; 77 unsigned MaxPrivateElementSize; 78 79 // Possibly statically set by tablegen, but may want to be overridden. 80 bool FastFMAF32; 81 bool HalfRate64Ops; 82 83 // Dynamially set bits that enable features. 84 bool FP16Denormals; 85 bool FP32Denormals; 86 bool FP64Denormals; 87 bool FPExceptions; 88 bool FlatForGlobal; 89 bool UnalignedScratchAccess; 90 bool UnalignedBufferAccess; 91 bool EnableXNACK; 92 bool DebuggerInsertNops; 93 bool DebuggerReserveRegs; 94 bool DebuggerEmitPrologue; 95 96 // Used as options. 97 bool EnableVGPRSpilling; 98 bool EnablePromoteAlloca; 99 bool EnableLoadStoreOpt; 100 bool EnableUnsafeDSOffsetFolding; 101 bool EnableSIScheduler; 102 bool DumpCode; 103 104 // Subtarget statically properties set by tablegen 105 bool FP64; 106 bool IsGCN; 107 bool GCN1Encoding; 108 bool GCN3Encoding; 109 bool CIInsts; 110 bool SGPRInitBug; 111 bool HasSMemRealTime; 112 bool Has16BitInsts; 113 bool HasMovrel; 114 bool HasVGPRIndexMode; 115 bool HasScalarStores; 116 bool HasInv2PiInlineImm; 117 bool FlatAddressSpace; 118 bool R600ALUInst; 119 bool CaymanISA; 120 bool CFALUBug; 121 bool HasVertexCache; 122 short TexVTXClauseSize; 123 bool ScalarizeGlobal; 124 125 // Dummy feature to use for assembler in tablegen. 126 bool FeatureDisable; 127 128 InstrItineraryData InstrItins; 129 SelectionDAGTargetInfo TSInfo; 130 131 public: 132 AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 133 const TargetMachine &TM); 134 ~AMDGPUSubtarget() override; 135 136 AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, 137 StringRef GPU, StringRef FS); 138 139 const AMDGPUInstrInfo *getInstrInfo() const override = 0; 140 const AMDGPUFrameLowering *getFrameLowering() const override = 0; 141 const AMDGPUTargetLowering *getTargetLowering() const override = 0; 142 const AMDGPURegisterInfo *getRegisterInfo() const override = 0; 143 144 const InstrItineraryData *getInstrItineraryData() const override { 145 return &InstrItins; 146 } 147 148 // Nothing implemented, just prevent crashes on use. 149 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 150 return &TSInfo; 151 } 152 153 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 154 155 bool isAmdHsaOS() const { 156 return TargetTriple.getOS() == Triple::AMDHSA; 157 } 158 159 bool isMesa3DOS() const { 160 return TargetTriple.getOS() == Triple::Mesa3D; 161 } 162 163 bool isOpenCLEnv() const { 164 return TargetTriple.getEnvironment() == Triple::OpenCL; 165 } 166 167 Generation getGeneration() const { 168 return Gen; 169 } 170 171 unsigned getWavefrontSize() const { 172 return WavefrontSize; 173 } 174 175 int getLocalMemorySize() const { 176 return LocalMemorySize; 177 } 178 179 int getLDSBankCount() const { 180 return LDSBankCount; 181 } 182 183 unsigned getMaxPrivateElementSize() const { 184 return MaxPrivateElementSize; 185 } 186 187 bool has16BitInsts() const { 188 return Has16BitInsts; 189 } 190 191 bool hasHWFP64() const { 192 return FP64; 193 } 194 195 bool hasFastFMAF32() const { 196 return FastFMAF32; 197 } 198 199 bool hasHalfRate64Ops() const { 200 return HalfRate64Ops; 201 } 202 203 bool hasAddr64() const { 204 return (getGeneration() < VOLCANIC_ISLANDS); 205 } 206 207 bool hasBFE() const { 208 return (getGeneration() >= EVERGREEN); 209 } 210 211 bool hasBFI() const { 212 return (getGeneration() >= EVERGREEN); 213 } 214 215 bool hasBFM() const { 216 return hasBFE(); 217 } 218 219 bool hasBCNT(unsigned Size) const { 220 if (Size == 32) 221 return (getGeneration() >= EVERGREEN); 222 223 if (Size == 64) 224 return (getGeneration() >= SOUTHERN_ISLANDS); 225 226 return false; 227 } 228 229 bool hasMulU24() const { 230 return (getGeneration() >= EVERGREEN); 231 } 232 233 bool hasMulI24() const { 234 return (getGeneration() >= SOUTHERN_ISLANDS || 235 hasCaymanISA()); 236 } 237 238 bool hasFFBL() const { 239 return (getGeneration() >= EVERGREEN); 240 } 241 242 bool hasFFBH() const { 243 return (getGeneration() >= EVERGREEN); 244 } 245 246 bool hasCARRY() const { 247 return (getGeneration() >= EVERGREEN); 248 } 249 250 bool hasBORROW() const { 251 return (getGeneration() >= EVERGREEN); 252 } 253 254 bool hasCaymanISA() const { 255 return CaymanISA; 256 } 257 258 bool isPromoteAllocaEnabled() const { 259 return EnablePromoteAlloca; 260 } 261 262 bool unsafeDSOffsetFoldingEnabled() const { 263 return EnableUnsafeDSOffsetFolding; 264 } 265 266 bool dumpCode() const { 267 return DumpCode; 268 } 269 270 bool enableIEEEBit(const MachineFunction &MF) const { 271 return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); 272 } 273 274 /// Return the amount of LDS that can be used that will not restrict the 275 /// occupancy lower than WaveCount. 276 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; 277 278 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 279 /// the given LDS memory size is the only constraint. 280 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; 281 282 bool hasFP16Denormals() const { 283 return FP16Denormals; 284 } 285 286 bool hasFP32Denormals() const { 287 return FP32Denormals; 288 } 289 290 bool hasFP64Denormals() const { 291 return FP64Denormals; 292 } 293 294 bool hasFPExceptions() const { 295 return FPExceptions; 296 } 297 298 bool useFlatForGlobal() const { 299 return FlatForGlobal; 300 } 301 302 bool hasUnalignedBufferAccess() const { 303 return UnalignedBufferAccess; 304 } 305 306 bool hasUnalignedScratchAccess() const { 307 return UnalignedScratchAccess; 308 } 309 310 bool isXNACKEnabled() const { 311 return EnableXNACK; 312 } 313 314 bool isAmdCodeObjectV2() const { 315 return isAmdHsaOS() || isMesa3DOS(); 316 } 317 318 /// \brief Returns the offset in bytes from the start of the input buffer 319 /// of the first explicit kernel argument. 320 unsigned getExplicitKernelArgOffset() const { 321 return isAmdCodeObjectV2() ? 0 : 36; 322 } 323 324 unsigned getAlignmentForImplicitArgPtr() const { 325 return isAmdHsaOS() ? 8 : 4; 326 } 327 328 unsigned getImplicitArgNumBytes() const { 329 if (isMesa3DOS()) 330 return 16; 331 if (isAmdHsaOS() && isOpenCLEnv()) 332 return 32; 333 return 0; 334 } 335 336 unsigned getStackAlignment() const { 337 // Scratch is allocated in 256 dword per wave blocks. 338 return 4 * 256 / getWavefrontSize(); 339 } 340 341 bool enableMachineScheduler() const override { 342 return true; 343 } 344 345 bool enableSubRegLiveness() const override { 346 return true; 347 } 348 349 /// \returns Number of execution units per compute unit supported by the 350 /// subtarget. 351 unsigned getEUsPerCU() const { 352 return 4; 353 } 354 355 /// \returns Maximum number of work groups per compute unit supported by the 356 /// subtarget and limited by given flat work group size. 357 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { 358 if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) 359 return 8; 360 return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16; 361 } 362 363 /// \returns Maximum number of waves per compute unit supported by the 364 /// subtarget without any kind of limitation. 365 unsigned getMaxWavesPerCU() const { 366 return getMaxWavesPerEU() * getEUsPerCU(); 367 } 368 369 /// \returns Maximum number of waves per compute unit supported by the 370 /// subtarget and limited by given flat work group size. 371 unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { 372 return getWavesPerWorkGroup(FlatWorkGroupSize); 373 } 374 375 /// \returns Minimum number of waves per execution unit supported by the 376 /// subtarget. 377 unsigned getMinWavesPerEU() const { 378 return 1; 379 } 380 381 /// \returns Maximum number of waves per execution unit supported by the 382 /// subtarget without any kind of limitation. 383 unsigned getMaxWavesPerEU() const { 384 if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) 385 return 8; 386 // FIXME: Need to take scratch memory into account. 387 return 10; 388 } 389 390 /// \returns Maximum number of waves per execution unit supported by the 391 /// subtarget and limited by given flat work group size. 392 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { 393 return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) / 394 getEUsPerCU(); 395 } 396 397 /// \returns Minimum flat work group size supported by the subtarget. 398 unsigned getMinFlatWorkGroupSize() const { 399 return 1; 400 } 401 402 /// \returns Maximum flat work group size supported by the subtarget. 403 unsigned getMaxFlatWorkGroupSize() const { 404 return 2048; 405 } 406 407 /// \returns Number of waves per work group given the flat work group size. 408 unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { 409 return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize(); 410 } 411 412 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} 413 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} 414 415 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 416 /// for function \p F, or minimum/maximum flat work group sizes explicitly 417 /// requested using "amdgpu-flat-work-group-size" attribute attached to 418 /// function \p F. 419 /// 420 /// \returns Subtarget's default values if explicitly requested values cannot 421 /// be converted to integer, or violate subtarget's specifications. 422 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 423 424 /// \returns Subtarget's default pair of minimum/maximum number of waves per 425 /// execution unit for function \p F, or minimum/maximum number of waves per 426 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 427 /// attached to function \p F. 428 /// 429 /// \returns Subtarget's default values if explicitly requested values cannot 430 /// be converted to integer, violate subtarget's specifications, or are not 431 /// compatible with minimum/maximum number of waves limited by flat work group 432 /// size, register usage, and/or lds usage. 433 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 434 }; 435 436 class R600Subtarget final : public AMDGPUSubtarget { 437 private: 438 R600InstrInfo InstrInfo; 439 R600FrameLowering FrameLowering; 440 R600TargetLowering TLInfo; 441 442 public: 443 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 444 const TargetMachine &TM); 445 446 const R600InstrInfo *getInstrInfo() const override { 447 return &InstrInfo; 448 } 449 450 const R600FrameLowering *getFrameLowering() const override { 451 return &FrameLowering; 452 } 453 454 const R600TargetLowering *getTargetLowering() const override { 455 return &TLInfo; 456 } 457 458 const R600RegisterInfo *getRegisterInfo() const override { 459 return &InstrInfo.getRegisterInfo(); 460 } 461 462 bool hasCFAluBug() const { 463 return CFALUBug; 464 } 465 466 bool hasVertexCache() const { 467 return HasVertexCache; 468 } 469 470 short getTexVTXClauseSize() const { 471 return TexVTXClauseSize; 472 } 473 }; 474 475 class SISubtarget final : public AMDGPUSubtarget { 476 public: 477 enum { 478 // The closed Vulkan driver sets 96, which limits the wave count to 8 but 479 // doesn't spill SGPRs as much as when 80 is set. 480 FIXED_SGPR_COUNT_FOR_INIT_BUG = 96 481 }; 482 483 private: 484 SIInstrInfo InstrInfo; 485 SIFrameLowering FrameLowering; 486 SITargetLowering TLInfo; 487 std::unique_ptr<GISelAccessor> GISel; 488 489 public: 490 SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, 491 const TargetMachine &TM); 492 493 const SIInstrInfo *getInstrInfo() const override { 494 return &InstrInfo; 495 } 496 497 const SIFrameLowering *getFrameLowering() const override { 498 return &FrameLowering; 499 } 500 501 const SITargetLowering *getTargetLowering() const override { 502 return &TLInfo; 503 } 504 505 const CallLowering *getCallLowering() const override { 506 assert(GISel && "Access to GlobalISel APIs not set"); 507 return GISel->getCallLowering(); 508 } 509 510 const SIRegisterInfo *getRegisterInfo() const override { 511 return &InstrInfo.getRegisterInfo(); 512 } 513 514 void setGISelAccessor(GISelAccessor &GISel) { 515 this->GISel.reset(&GISel); 516 } 517 518 void overrideSchedPolicy(MachineSchedPolicy &Policy, 519 unsigned NumRegionInstrs) const override; 520 521 bool isVGPRSpillingEnabled(const Function& F) const; 522 523 unsigned getMaxNumUserSGPRs() const { 524 return 16; 525 } 526 527 bool hasFlatAddressSpace() const { 528 return FlatAddressSpace; 529 } 530 531 bool hasSMemRealTime() const { 532 return HasSMemRealTime; 533 } 534 535 bool hasMovrel() const { 536 return HasMovrel; 537 } 538 539 bool hasVGPRIndexMode() const { 540 return HasVGPRIndexMode; 541 } 542 543 bool hasScalarCompareEq64() const { 544 return getGeneration() >= VOLCANIC_ISLANDS; 545 } 546 547 bool hasScalarStores() const { 548 return HasScalarStores; 549 } 550 551 bool hasInv2PiInlineImm() const { 552 return HasInv2PiInlineImm; 553 } 554 555 bool enableSIScheduler() const { 556 return EnableSIScheduler; 557 } 558 559 bool debuggerSupported() const { 560 return debuggerInsertNops() && debuggerReserveRegs() && 561 debuggerEmitPrologue(); 562 } 563 564 bool debuggerInsertNops() const { 565 return DebuggerInsertNops; 566 } 567 568 bool debuggerReserveRegs() const { 569 return DebuggerReserveRegs; 570 } 571 572 bool debuggerEmitPrologue() const { 573 return DebuggerEmitPrologue; 574 } 575 576 bool loadStoreOptEnabled() const { 577 return EnableLoadStoreOpt; 578 } 579 580 bool hasSGPRInitBug() const { 581 return SGPRInitBug; 582 } 583 584 bool has12DWordStoreHazard() const { 585 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 586 } 587 588 unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const; 589 590 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs 591 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 592 593 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs 594 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 595 596 /// \returns True if waitcnt instruction is needed before barrier instruction, 597 /// false otherwise. 598 bool needWaitcntBeforeBarrier() const { 599 return true; 600 } 601 602 unsigned getMaxNumSGPRs() const; 603 }; 604 605 } // end namespace llvm 606 607 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 608