//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The bit set of SI atomic address spaces corresponding to the
  /// LLVM address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
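    // Illustrative note (an assumption about the printed form, not emitted by
    // this pass): on these targets the two bits appear as assembly modifiers,
    // so a nontemporal access ends up roughly as "buffer_load_dword ... glc
    // slc".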
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
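  // Illustrative sketch (an assumption, not a guarantee of exact output): for
  // an agent-scope acquire on gfx7 the overall expansion built by this pass is
  // typically an "s_waitcnt vmcnt(0)" from insertWait followed by the
  // "buffer_wbinvl1_vol" (or "buffer_wbinvl1" on PAL/Mesa3D) emitted above.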

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves in
    // the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so handling them
  // here would pessimize all atomics; they also do not support the nontemporal
  // attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
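    // Illustrative note (an assumption about the printed form): with both bits
    // set, a volatile global load on gfx10 typically carries the "glc dlc"
    // modifiers, e.g. roughly "global_load_dword ... glc dlc".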
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.
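  // Illustrative sketch (an assumption, not a guarantee of exact output): an
  // agent-scope acquire on gfx10 typically pairs a preceding
  // "s_waitcnt vmcnt(0)" produced by insertWait with the "buffer_gl0_inv" and
  // "buffer_gl1_inv" emitted above.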

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barrier could be added in this
      /// file. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both a release and an invalidate are happening, they could be
    // combined into a single "BUFFER_WBINV*" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnt
    // pass to track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}