//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns Returns true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

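// On GFX6 a release is implemented purely as the wait above; no cache
// write-back instruction is emitted. GFX7 and later differ from GFX6 here
// only in which L1 invalidate is used for an acquire: PAL and Mesa OSes keep
// BUFFER_WBINVL1, while other targets use BUFFER_WBINVL1_VOL, as selected
// below.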
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise,
      // in CU mode, all waves of a work-group are on the same CU, and so the
      // L0 does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order.
    // Do not request cross address space as only the global address space can
    // be observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

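// The SIMemoryLegalizer methods below combine the SICacheControl hooks to
// implement the memory model for each kind of memory operation. For example,
// expandLoad() handles an acquire load by setting the cache bypass bits on
// the load itself, inserting a wait after it, and then inserting the
// appropriate cache invalidate after the wait.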
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If a barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code or as part of optimizing the SIInsertWaitcnts
    // pass to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
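      // The legalizer operates on individual memory instructions, so bundles
      // formed by the post-RA scheduler are dissolved here: each bundled
      // instruction is unbundled from its predecessor and its internal-read
      // flags are cleared before the bundle header is erased.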
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}