//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns Returns true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
protected:

  /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSCCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::sccb>(MI);
  }

public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
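  // Editor's note: as the comments elsewhere in this file indicate, the L1 is
  // per CU while the L2 is shared across the agent, so setting glc on a load
  // (L1 MISS_EVICT) is what makes system and agent scope loads observe other
  // CUs' writes; narrower scopes are already coherent within the L1.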

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
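      // Hence no vmcnt wait is required at these scopes.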
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
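  // The iterator was advanced past \p MI above when \p Pos is Position::AFTER;
  // it is stepped back below so the caller's iterator still refers to \p MI.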

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      LLVM_FALLTHROUGH;
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      LLVM_FALLTHROUGH;
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }
    Changed |= enableSCCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

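// Illustrative sketch (an assumption based on the memory model described in
// AMDGPUUsage.html, not code from this file; exact opcodes vary by
// subtarget): on gfx10, an IR load such as
//   %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire, align 4
// is legalized by expandLoad() above into roughly
//   global_load_dword ... glc dlc     ; bypass caches up to agent scope
//   s_waitcnt vmcnt(0)                ; wait for the load to complete
//   buffer_gl0_inv                    ; acquire invalidate
//   buffer_gl1_inv
// The store path below mirrors this, inserting a release wait before the
// store instead.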
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use a single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnts
    // pass to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

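    // For an atomic with acquire semantics on either the success or the
    // failure ordering, wait for the read-modify-write itself to complete
    // (counted as a load if it returns a value, otherwise as a store) and
    // then invalidate caches so later accesses of this wave observe writes
    // released by other waves before the atomic.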
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
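
// Usage sketch (an assumption, not part of the original file): the pass can
// be exercised in isolation on MIR for testing, e.g.
//   llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-memory-legalizer \
//       -verify-machineinstrs -o - test.mir
// where "si-memory-legalizer" is the DEBUG_TYPE under which the pass is
// registered above.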