//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code or as part of optimizing SIInsertWaitcnt pass
    // to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}