//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC &
                               InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ?
                 SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  // Check the more specific subtarget features first so GFX940 and GFX90A get
  // their dedicated cache controls before falling back to the generation.
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. Also they do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ?
                                  0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      // RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. Also they do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx940CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
      // bits to indicate work-group scope will do this automatically.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // Set SC bits to indicate workgroup scope.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics. Also they do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later writes will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it in that case if known not to be in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding
        // buffer invalidate. The invalidate is guaranteed to remove any cache
        // lines of earlier writes and ensures later writes will refetch the
        // cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would write
      // back, and it would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Insert the "S_WAITCNT" needed by any "BUFFER_WBL2" above, as well as any
  // other waits required for the release.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
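      // As an illustrative sketch (operands are placeholders), such a load is
      // expected to carry both bits, e.g.:
      //   global_load_dword v0, v[0:1], off glc dlc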
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure the operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
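    // Illustrative sketch of the resulting cache-policy bits (operands are
    // placeholders):
    //   nontemporal load:  global_load_dword  v0, v[0:1], off slc
    //   nontemporal store: global_store_dword v[0:1], v2, off glc slc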
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
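  // Illustrative sketch: for a system- or agent-scope acquire the code above
  // inserts
  //   buffer_gl0_inv
  //   buffer_gl1_inv
  // and for a work-group-scope acquire in WGP mode only "buffer_gl0_inv".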

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
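  // For example, a non-atomic volatile global store is handled by the
  // subtarget's enableVolatileAndOrNonTemporal implementation above, which
  // inserts a wait after the store so it becomes globally visible in program
  // order.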
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use a single "BUFFER_WBINV*" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnt
    // pass to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ?
                                                   SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
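// Illustrative sketch of an overall expansion (the exact sequence depends on
// the subtarget, address space, and CU/WGP mode). On a gfx10 target in WGP
// mode, an agent-scope acquire atomic load from global memory is expected to
// be legalized roughly as:
//   global_load_dword v0, v[0:1], off glc dlc  ; cache bypass (expandLoad)
//   s_waitcnt vmcnt(0)                         ; insertWait after the load
//   buffer_gl0_inv                             ; insertAcquire
//   buffer_gl1_inv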