1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/BitmaskEnum.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/IR/DiagnosticInfo.h" 23 #include "llvm/Support/AtomicOrdering.h" 24 #include "llvm/Support/TargetParser.h" 25 26 using namespace llvm; 27 using namespace llvm::AMDGPU; 28 29 #define DEBUG_TYPE "si-memory-legalizer" 30 #define PASS_NAME "SI Memory Legalizer" 31 32 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 33 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 34 cl::desc("Use this to skip inserting cache invalidating instructions.")); 35 36 namespace { 37 38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 39 40 /// Memory operation flags. Can be ORed together. 41 enum class SIMemOp { 42 NONE = 0u, 43 LOAD = 1u << 0, 44 STORE = 1u << 1, 45 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 46 }; 47 48 /// Position to insert a new instruction relative to an existing 49 /// instruction. 50 enum class Position { 51 BEFORE, 52 AFTER 53 }; 54 55 /// The atomic synchronization scopes supported by the AMDGPU target. 56 enum class SIAtomicScope { 57 NONE, 58 SINGLETHREAD, 59 WAVEFRONT, 60 WORKGROUP, 61 AGENT, 62 SYSTEM 63 }; 64 65 /// The distinct address spaces supported by the AMDGPU target for 66 /// atomic memory operation. Can be ORed toether. 67 enum class SIAtomicAddrSpace { 68 NONE = 0u, 69 GLOBAL = 1u << 0, 70 LDS = 1u << 1, 71 SCRATCH = 1u << 2, 72 GDS = 1u << 3, 73 OTHER = 1u << 4, 74 75 /// The address spaces that can be accessed by a FLAT instruction. 76 FLAT = GLOBAL | LDS | SCRATCH, 77 78 /// The address spaces that support atomic instructions. 79 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 80 81 /// All address spaces. 
82 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 83 84 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 85 }; 86 87 class SIMemOpInfo final { 88 private: 89 90 friend class SIMemOpAccess; 91 92 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 93 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 94 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 95 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 96 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 97 bool IsCrossAddressSpaceOrdering = false; 98 bool IsVolatile = false; 99 bool IsNonTemporal = false; 100 101 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 102 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 103 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 104 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 105 bool IsCrossAddressSpaceOrdering = true, 106 AtomicOrdering FailureOrdering = 107 AtomicOrdering::SequentiallyConsistent, 108 bool IsVolatile = false, 109 bool IsNonTemporal = false) 110 : Ordering(Ordering), FailureOrdering(FailureOrdering), 111 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), 112 InstrAddrSpace(InstrAddrSpace), 113 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 114 IsVolatile(IsVolatile), 115 IsNonTemporal(IsNonTemporal) { 116 117 if (Ordering == AtomicOrdering::NotAtomic) { 118 assert(Scope == SIAtomicScope::NONE && 119 OrderingAddrSpace == SIAtomicAddrSpace::NONE && 120 !IsCrossAddressSpaceOrdering && 121 FailureOrdering == AtomicOrdering::NotAtomic); 122 return; 123 } 124 125 assert(Scope != SIAtomicScope::NONE && 126 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != 127 SIAtomicAddrSpace::NONE && 128 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != 129 SIAtomicAddrSpace::NONE && 130 !isStrongerThan(FailureOrdering, Ordering)); 131 132 // There is also no cross address space ordering if the ordering 133 // address space is the same as the instruction address space and 134 // only contains a single address space. 135 if ((OrderingAddrSpace == InstrAddrSpace) && 136 isPowerOf2_32(uint32_t(InstrAddrSpace))) 137 this->IsCrossAddressSpaceOrdering = false; 138 139 // Limit the scope to the maximum supported by the instruction's address 140 // spaces. 141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == 142 SIAtomicAddrSpace::NONE) { 143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); 144 } else if ((InstrAddrSpace & 145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == 146 SIAtomicAddrSpace::NONE) { 147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); 148 } else if ((InstrAddrSpace & 149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | 150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { 151 this->Scope = std::min(Scope, SIAtomicScope::AGENT); 152 } 153 } 154 155 public: 156 /// \returns Atomic synchronization scope of the machine instruction used to 157 /// create this SIMemOpInfo. 158 SIAtomicScope getScope() const { 159 return Scope; 160 } 161 162 /// \returns Ordering constraint of the machine instruction used to 163 /// create this SIMemOpInfo. 164 AtomicOrdering getOrdering() const { 165 return Ordering; 166 } 167 168 /// \returns Failure ordering constraint of the machine instruction used to 169 /// create this SIMemOpInfo. 170 AtomicOrdering getFailureOrdering() const { 171 return FailureOrdering; 172 } 173 174 /// \returns The address spaces be accessed by the machine 175 /// instruction used to create this SiMemOpInfo. 
176 SIAtomicAddrSpace getInstrAddrSpace() const { 177 return InstrAddrSpace; 178 } 179 180 /// \returns The address spaces that must be ordered by the machine 181 /// instruction used to create this SiMemOpInfo. 182 SIAtomicAddrSpace getOrderingAddrSpace() const { 183 return OrderingAddrSpace; 184 } 185 186 /// \returns Return true iff memory ordering of operations on 187 /// different address spaces is required. 188 bool getIsCrossAddressSpaceOrdering() const { 189 return IsCrossAddressSpaceOrdering; 190 } 191 192 /// \returns True if memory access of the machine instruction used to 193 /// create this SIMemOpInfo is volatile, false otherwise. 194 bool isVolatile() const { 195 return IsVolatile; 196 } 197 198 /// \returns True if memory access of the machine instruction used to 199 /// create this SIMemOpInfo is nontemporal, false otherwise. 200 bool isNonTemporal() const { 201 return IsNonTemporal; 202 } 203 204 /// \returns True if ordering constraint of the machine instruction used to 205 /// create this SIMemOpInfo is unordered or higher, false otherwise. 206 bool isAtomic() const { 207 return Ordering != AtomicOrdering::NotAtomic; 208 } 209 210 }; 211 212 class SIMemOpAccess final { 213 private: 214 AMDGPUMachineModuleInfo *MMI = nullptr; 215 216 /// Reports unsupported message \p Msg for \p MI to LLVM context. 217 void reportUnsupported(const MachineBasicBlock::iterator &MI, 218 const char *Msg) const; 219 220 /// Inspects the target synchronization scope \p SSID and determines 221 /// the SI atomic scope it corresponds to, the address spaces it 222 /// covers, and whether the memory ordering applies between address 223 /// spaces. 224 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; 226 227 /// \return Return a bit set of the address spaces accessed by \p AS. 228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 229 230 /// \returns Info constructed from \p MI, which has at least machine memory 231 /// operand. 232 Optional<SIMemOpInfo> constructFromMIWithMMO( 233 const MachineBasicBlock::iterator &MI) const; 234 235 public: 236 /// Construct class to support accessing the machine memory operands 237 /// of instructions in the machine function \p MF. 238 SIMemOpAccess(MachineFunction &MF); 239 240 /// \returns Load info if \p MI is a load operation, "None" otherwise. 241 Optional<SIMemOpInfo> getLoadInfo( 242 const MachineBasicBlock::iterator &MI) const; 243 244 /// \returns Store info if \p MI is a store operation, "None" otherwise. 245 Optional<SIMemOpInfo> getStoreInfo( 246 const MachineBasicBlock::iterator &MI) const; 247 248 /// \returns Atomic fence info if \p MI is an atomic fence operation, 249 /// "None" otherwise. 250 Optional<SIMemOpInfo> getAtomicFenceInfo( 251 const MachineBasicBlock::iterator &MI) const; 252 253 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 254 /// rmw operation, "None" otherwise. 255 Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( 256 const MachineBasicBlock::iterator &MI) const; 257 }; 258 259 class SICacheControl { 260 protected: 261 262 /// AMDGPU subtarget info. 263 const GCNSubtarget &ST; 264 265 /// Instruction info. 266 const SIInstrInfo *TII = nullptr; 267 268 IsaVersion IV; 269 270 /// Whether to insert cache invalidating instructions. 271 bool InsertCacheInv; 272 273 SICacheControl(const GCNSubtarget &ST); 274 275 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 
276 /// \returns Returns true if \p MI is modified, false otherwise. 277 bool enableNamedBit(const MachineBasicBlock::iterator MI, 278 AMDGPU::CPol::CPol Bit) const; 279 280 public: 281 282 /// Create a cache control for the subtarget \p ST. 283 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 284 285 /// Update \p MI memory load instruction to bypass any caches up to 286 /// the \p Scope memory scope for address spaces \p 287 /// AddrSpace. Return true iff the instruction was modified. 288 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 289 SIAtomicScope Scope, 290 SIAtomicAddrSpace AddrSpace) const = 0; 291 292 /// Update \p MI memory store instruction to bypass any caches up to 293 /// the \p Scope memory scope for address spaces \p 294 /// AddrSpace. Return true iff the instruction was modified. 295 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 296 SIAtomicScope Scope, 297 SIAtomicAddrSpace AddrSpace) const = 0; 298 299 /// Update \p MI memory read-modify-write instruction to bypass any caches up 300 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true 301 /// iff the instruction was modified. 302 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 303 SIAtomicScope Scope, 304 SIAtomicAddrSpace AddrSpace) const = 0; 305 306 /// Update \p MI memory instruction of kind \p Op associated with address 307 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return 308 /// true iff the instruction was modified. 309 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 310 SIAtomicAddrSpace AddrSpace, 311 SIMemOp Op, bool IsVolatile, 312 bool IsNonTemporal) const = 0; 313 314 /// Inserts any necessary instructions at position \p Pos relative 315 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 316 /// \p Op associated with address spaces \p AddrSpace have completed. Used 317 /// between memory instructions to enforce the order they become visible as 318 /// observed by other memory instructions executing in memory scope \p Scope. 319 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 320 /// address spaces. Returns true iff any instructions inserted. 321 virtual bool insertWait(MachineBasicBlock::iterator &MI, 322 SIAtomicScope Scope, 323 SIAtomicAddrSpace AddrSpace, 324 SIMemOp Op, 325 bool IsCrossAddrSpaceOrdering, 326 Position Pos) const = 0; 327 328 /// Inserts any necessary instructions at position \p Pos relative to 329 /// instruction \p MI to ensure any subsequent memory instructions of this 330 /// thread with address spaces \p AddrSpace will observe the previous memory 331 /// operations by any thread for memory scopes up to memory scope \p Scope . 332 /// Returns true iff any instructions inserted. 333 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 334 SIAtomicScope Scope, 335 SIAtomicAddrSpace AddrSpace, 336 Position Pos) const = 0; 337 338 /// Inserts any necessary instructions at position \p Pos relative to 339 /// instruction \p MI to ensure previous memory instructions by this thread 340 /// with address spaces \p AddrSpace have completed and can be observed by 341 /// subsequent memory instructions by any thread executing in memory scope \p 342 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is 343 /// between address spaces. Returns true iff any instructions inserted. 
344 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 345 SIAtomicScope Scope, 346 SIAtomicAddrSpace AddrSpace, 347 bool IsCrossAddrSpaceOrdering, 348 Position Pos) const = 0; 349 350 /// Virtual destructor to allow derivations to be deleted. 351 virtual ~SICacheControl() = default; 352 353 }; 354 355 class SIGfx6CacheControl : public SICacheControl { 356 protected: 357 358 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 359 /// is modified, false otherwise. 360 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 361 return enableNamedBit(MI, AMDGPU::CPol::GLC); 362 } 363 364 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 365 /// is modified, false otherwise. 366 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 367 return enableNamedBit(MI, AMDGPU::CPol::SLC); 368 } 369 370 public: 371 372 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}; 373 374 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 375 SIAtomicScope Scope, 376 SIAtomicAddrSpace AddrSpace) const override; 377 378 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 379 SIAtomicScope Scope, 380 SIAtomicAddrSpace AddrSpace) const override; 381 382 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 383 SIAtomicScope Scope, 384 SIAtomicAddrSpace AddrSpace) const override; 385 386 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 387 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 388 bool IsVolatile, 389 bool IsNonTemporal) const override; 390 391 bool insertWait(MachineBasicBlock::iterator &MI, 392 SIAtomicScope Scope, 393 SIAtomicAddrSpace AddrSpace, 394 SIMemOp Op, 395 bool IsCrossAddrSpaceOrdering, 396 Position Pos) const override; 397 398 bool insertAcquire(MachineBasicBlock::iterator &MI, 399 SIAtomicScope Scope, 400 SIAtomicAddrSpace AddrSpace, 401 Position Pos) const override; 402 403 bool insertRelease(MachineBasicBlock::iterator &MI, 404 SIAtomicScope Scope, 405 SIAtomicAddrSpace AddrSpace, 406 bool IsCrossAddrSpaceOrdering, 407 Position Pos) const override; 408 }; 409 410 class SIGfx7CacheControl : public SIGfx6CacheControl { 411 public: 412 413 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; 414 415 bool insertAcquire(MachineBasicBlock::iterator &MI, 416 SIAtomicScope Scope, 417 SIAtomicAddrSpace AddrSpace, 418 Position Pos) const override; 419 420 }; 421 422 class SIGfx90ACacheControl : public SIGfx7CacheControl { 423 protected: 424 425 /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI 426 /// is modified, false otherwise. 
427 bool enableSCCBit(const MachineBasicBlock::iterator &MI) const { 428 return enableNamedBit(MI, AMDGPU::CPol::SCC);; 429 } 430 431 public: 432 433 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; 434 435 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 436 SIAtomicScope Scope, 437 SIAtomicAddrSpace AddrSpace) const override; 438 439 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, 440 SIAtomicScope Scope, 441 SIAtomicAddrSpace AddrSpace) const override; 442 443 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, 444 SIAtomicScope Scope, 445 SIAtomicAddrSpace AddrSpace) const override; 446 447 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 448 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 449 bool IsVolatile, 450 bool IsNonTemporal) const override; 451 452 bool insertWait(MachineBasicBlock::iterator &MI, 453 SIAtomicScope Scope, 454 SIAtomicAddrSpace AddrSpace, 455 SIMemOp Op, 456 bool IsCrossAddrSpaceOrdering, 457 Position Pos) const override; 458 459 bool insertAcquire(MachineBasicBlock::iterator &MI, 460 SIAtomicScope Scope, 461 SIAtomicAddrSpace AddrSpace, 462 Position Pos) const override; 463 464 bool insertRelease(MachineBasicBlock::iterator &MI, 465 SIAtomicScope Scope, 466 SIAtomicAddrSpace AddrSpace, 467 bool IsCrossAddrSpaceOrdering, 468 Position Pos) const override; 469 }; 470 471 class SIGfx10CacheControl : public SIGfx7CacheControl { 472 protected: 473 474 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 475 /// is modified, false otherwise. 476 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { 477 return enableNamedBit(MI, AMDGPU::CPol::DLC); 478 } 479 480 public: 481 482 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; 483 484 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 485 SIAtomicScope Scope, 486 SIAtomicAddrSpace AddrSpace) const override; 487 488 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, 489 SIAtomicAddrSpace AddrSpace, SIMemOp Op, 490 bool IsVolatile, 491 bool IsNonTemporal) const override; 492 493 bool insertWait(MachineBasicBlock::iterator &MI, 494 SIAtomicScope Scope, 495 SIAtomicAddrSpace AddrSpace, 496 SIMemOp Op, 497 bool IsCrossAddrSpaceOrdering, 498 Position Pos) const override; 499 500 bool insertAcquire(MachineBasicBlock::iterator &MI, 501 SIAtomicScope Scope, 502 SIAtomicAddrSpace AddrSpace, 503 Position Pos) const override; 504 }; 505 506 class SIMemoryLegalizer final : public MachineFunctionPass { 507 private: 508 509 /// Cache Control. 510 std::unique_ptr<SICacheControl> CC = nullptr; 511 512 /// List of atomic pseudo instructions. 513 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; 514 515 /// Return true iff instruction \p MI is a atomic instruction that 516 /// returns a result. 517 bool isAtomicRet(const MachineInstr &MI) const { 518 return SIInstrInfo::isAtomicRet(MI); 519 } 520 521 /// Removes all processed atomic pseudo instructions from the current 522 /// function. Returns true if current function is modified, false otherwise. 523 bool removeAtomicPseudoMIs(); 524 525 /// Expands load operation \p MI. Returns true if instructions are 526 /// added/deleted or \p MI is modified, false otherwise. 527 bool expandLoad(const SIMemOpInfo &MOI, 528 MachineBasicBlock::iterator &MI); 529 /// Expands store operation \p MI. Returns true if instructions are 530 /// added/deleted or \p MI is modified, false otherwise. 
531 bool expandStore(const SIMemOpInfo &MOI, 532 MachineBasicBlock::iterator &MI); 533 /// Expands atomic fence operation \p MI. Returns true if 534 /// instructions are added/deleted or \p MI is modified, false otherwise. 535 bool expandAtomicFence(const SIMemOpInfo &MOI, 536 MachineBasicBlock::iterator &MI); 537 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if 538 /// instructions are added/deleted or \p MI is modified, false otherwise. 539 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 540 MachineBasicBlock::iterator &MI); 541 542 public: 543 static char ID; 544 545 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 546 547 void getAnalysisUsage(AnalysisUsage &AU) const override { 548 AU.setPreservesCFG(); 549 MachineFunctionPass::getAnalysisUsage(AU); 550 } 551 552 StringRef getPassName() const override { 553 return PASS_NAME; 554 } 555 556 bool runOnMachineFunction(MachineFunction &MF) override; 557 }; 558 559 } // end namespace anonymous 560 561 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 562 const char *Msg) const { 563 const Function &Func = MI->getParent()->getParent()->getFunction(); 564 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 565 Func.getContext().diagnose(Diag); 566 } 567 568 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 569 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 570 SIAtomicAddrSpace InstrAddrSpace) const { 571 if (SSID == SyncScope::System) 572 return std::make_tuple(SIAtomicScope::SYSTEM, 573 SIAtomicAddrSpace::ATOMIC, 574 true); 575 if (SSID == MMI->getAgentSSID()) 576 return std::make_tuple(SIAtomicScope::AGENT, 577 SIAtomicAddrSpace::ATOMIC, 578 true); 579 if (SSID == MMI->getWorkgroupSSID()) 580 return std::make_tuple(SIAtomicScope::WORKGROUP, 581 SIAtomicAddrSpace::ATOMIC, 582 true); 583 if (SSID == MMI->getWavefrontSSID()) 584 return std::make_tuple(SIAtomicScope::WAVEFRONT, 585 SIAtomicAddrSpace::ATOMIC, 586 true); 587 if (SSID == SyncScope::SingleThread) 588 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 589 SIAtomicAddrSpace::ATOMIC, 590 true); 591 if (SSID == MMI->getSystemOneAddressSpaceSSID()) 592 return std::make_tuple(SIAtomicScope::SYSTEM, 593 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 594 false); 595 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 596 return std::make_tuple(SIAtomicScope::AGENT, 597 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 598 false); 599 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 600 return std::make_tuple(SIAtomicScope::WORKGROUP, 601 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 602 false); 603 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 604 return std::make_tuple(SIAtomicScope::WAVEFRONT, 605 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 606 false); 607 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 608 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 609 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, 610 false); 611 return None; 612 } 613 614 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 615 if (AS == AMDGPUAS::FLAT_ADDRESS) 616 return SIAtomicAddrSpace::FLAT; 617 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 618 return SIAtomicAddrSpace::GLOBAL; 619 if (AS == AMDGPUAS::LOCAL_ADDRESS) 620 return SIAtomicAddrSpace::LDS; 621 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 622 return SIAtomicAddrSpace::SCRATCH; 623 if (AS == AMDGPUAS::REGION_ADDRESS) 624 return SIAtomicAddrSpace::GDS; 625 626 return SIAtomicAddrSpace::OTHER; 627 } 628 629 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 630 MMI = 
&MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 631 } 632 633 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 634 const MachineBasicBlock::iterator &MI) const { 635 assert(MI->getNumMemOperands() > 0); 636 637 SyncScope::ID SSID = SyncScope::SingleThread; 638 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 639 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 640 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 641 bool IsNonTemporal = true; 642 bool IsVolatile = false; 643 644 // Validator should check whether or not MMOs cover the entire set of 645 // locations accessed by the memory instruction. 646 for (const auto &MMO : MI->memoperands()) { 647 IsNonTemporal &= MMO->isNonTemporal(); 648 IsVolatile |= MMO->isVolatile(); 649 InstrAddrSpace |= 650 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 651 AtomicOrdering OpOrdering = MMO->getOrdering(); 652 if (OpOrdering != AtomicOrdering::NotAtomic) { 653 const auto &IsSyncScopeInclusion = 654 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 655 if (!IsSyncScopeInclusion) { 656 reportUnsupported(MI, 657 "Unsupported non-inclusive atomic synchronization scope"); 658 return None; 659 } 660 661 SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); 662 Ordering = 663 isStrongerThan(Ordering, OpOrdering) ? 664 Ordering : MMO->getOrdering(); 665 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 666 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 667 FailureOrdering = 668 isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? 669 FailureOrdering : MMO->getFailureOrdering(); 670 } 671 } 672 673 SIAtomicScope Scope = SIAtomicScope::NONE; 674 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 675 bool IsCrossAddressSpaceOrdering = false; 676 if (Ordering != AtomicOrdering::NotAtomic) { 677 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 678 if (!ScopeOrNone) { 679 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 680 return None; 681 } 682 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 683 ScopeOrNone.getValue(); 684 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 685 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || 686 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { 687 reportUnsupported(MI, "Unsupported atomic address space"); 688 return None; 689 } 690 } 691 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 692 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, 693 IsNonTemporal); 694 } 695 696 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( 697 const MachineBasicBlock::iterator &MI) const { 698 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 699 700 if (!(MI->mayLoad() && !MI->mayStore())) 701 return None; 702 703 // Be conservative if there are no memory operands. 704 if (MI->getNumMemOperands() == 0) 705 return SIMemOpInfo(); 706 707 return constructFromMIWithMMO(MI); 708 } 709 710 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( 711 const MachineBasicBlock::iterator &MI) const { 712 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 713 714 if (!(!MI->mayLoad() && MI->mayStore())) 715 return None; 716 717 // Be conservative if there are no memory operands. 
718 if (MI->getNumMemOperands() == 0) 719 return SIMemOpInfo(); 720 721 return constructFromMIWithMMO(MI); 722 } 723 724 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( 725 const MachineBasicBlock::iterator &MI) const { 726 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 727 728 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 729 return None; 730 731 AtomicOrdering Ordering = 732 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 733 734 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 735 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 736 if (!ScopeOrNone) { 737 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 738 return None; 739 } 740 741 SIAtomicScope Scope = SIAtomicScope::NONE; 742 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 743 bool IsCrossAddressSpaceOrdering = false; 744 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 745 ScopeOrNone.getValue(); 746 747 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 748 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 749 reportUnsupported(MI, "Unsupported atomic address space"); 750 return None; 751 } 752 753 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 754 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); 755 } 756 757 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 758 const MachineBasicBlock::iterator &MI) const { 759 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 760 761 if (!(MI->mayLoad() && MI->mayStore())) 762 return None; 763 764 // Be conservative if there are no memory operands. 765 if (MI->getNumMemOperands() == 0) 766 return SIMemOpInfo(); 767 768 return constructFromMIWithMMO(MI); 769 } 770 771 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 772 TII = ST.getInstrInfo(); 773 IV = getIsaVersion(ST.getCPU()); 774 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 775 } 776 777 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 778 AMDGPU::CPol::CPol Bit) const { 779 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 780 if (!CPol) 781 return false; 782 783 CPol->setImm(CPol->getImm() | Bit); 784 return true; 785 } 786 787 /* static */ 788 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 789 GCNSubtarget::Generation Generation = ST.getGeneration(); 790 if (ST.hasGFX90AInsts()) 791 return std::make_unique<SIGfx90ACacheControl>(ST); 792 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 793 return std::make_unique<SIGfx6CacheControl>(ST); 794 if (Generation < AMDGPUSubtarget::GFX10) 795 return std::make_unique<SIGfx7CacheControl>(ST); 796 return std::make_unique<SIGfx10CacheControl>(ST); 797 } 798 799 bool SIGfx6CacheControl::enableLoadCacheBypass( 800 const MachineBasicBlock::iterator &MI, 801 SIAtomicScope Scope, 802 SIAtomicAddrSpace AddrSpace) const { 803 assert(MI->mayLoad() && !MI->mayStore()); 804 bool Changed = false; 805 806 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 807 switch (Scope) { 808 case SIAtomicScope::SYSTEM: 809 case SIAtomicScope::AGENT: 810 Changed |= enableGLCBit(MI); 811 break; 812 case SIAtomicScope::WORKGROUP: 813 case SIAtomicScope::WAVEFRONT: 814 case SIAtomicScope::SINGLETHREAD: 815 // No cache to bypass. 
816 break; 817 default: 818 llvm_unreachable("Unsupported synchronization scope"); 819 } 820 } 821 822 /// The scratch address space does not need the global memory caches 823 /// to be bypassed as all memory operations by the same thread are 824 /// sequentially consistent, and no other thread can access scratch 825 /// memory. 826 827 /// Other address spaces do not have a cache. 828 829 return Changed; 830 } 831 832 bool SIGfx6CacheControl::enableStoreCacheBypass( 833 const MachineBasicBlock::iterator &MI, 834 SIAtomicScope Scope, 835 SIAtomicAddrSpace AddrSpace) const { 836 assert(!MI->mayLoad() && MI->mayStore()); 837 bool Changed = false; 838 839 /// The L1 cache is write through so does not need to be bypassed. There is no 840 /// bypass control for the L2 cache at the isa level. 841 842 return Changed; 843 } 844 845 bool SIGfx6CacheControl::enableRMWCacheBypass( 846 const MachineBasicBlock::iterator &MI, 847 SIAtomicScope Scope, 848 SIAtomicAddrSpace AddrSpace) const { 849 assert(MI->mayLoad() && MI->mayStore()); 850 bool Changed = false; 851 852 /// The L1 cache is write through so does not need to be bypassed. There is no 853 /// bypass control for the L2 cache at the isa level. 854 855 return Changed; 856 } 857 858 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( 859 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 860 bool IsVolatile, bool IsNonTemporal) const { 861 // Only handle load and store, not atomic read-modify-write insructions. The 862 // latter use glc to indicate if the atomic returns a result and so must not 863 // be used for cache control. 864 assert(MI->mayLoad() ^ MI->mayStore()); 865 866 // Only update load and store, not LLVM IR atomic read-modify-write 867 // instructions. The latter are always marked as volatile so cannot sensibly 868 // handle it as do not want to pessimize all atomics. Also they do not support 869 // the nontemporal attribute. 870 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 871 872 bool Changed = false; 873 874 if (IsVolatile) { 875 if (Op == SIMemOp::LOAD) 876 Changed |= enableGLCBit(MI); 877 878 // Ensure operation has completed at system scope to cause all volatile 879 // operations to be visible outside the program in a global order. Do not 880 // request cross address space as only the global address space can be 881 // observable outside the program, so no need to cause a waitcnt for LDS 882 // address space operations. 883 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 884 Position::AFTER); 885 886 return Changed; 887 } 888 889 if (IsNonTemporal) { 890 // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. 
891 Changed |= enableGLCBit(MI); 892 Changed |= enableSLCBit(MI); 893 return Changed; 894 } 895 896 return Changed; 897 } 898 899 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 900 SIAtomicScope Scope, 901 SIAtomicAddrSpace AddrSpace, 902 SIMemOp Op, 903 bool IsCrossAddrSpaceOrdering, 904 Position Pos) const { 905 bool Changed = false; 906 907 MachineBasicBlock &MBB = *MI->getParent(); 908 DebugLoc DL = MI->getDebugLoc(); 909 910 if (Pos == Position::AFTER) 911 ++MI; 912 913 bool VMCnt = false; 914 bool LGKMCnt = false; 915 916 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 917 SIAtomicAddrSpace::NONE) { 918 switch (Scope) { 919 case SIAtomicScope::SYSTEM: 920 case SIAtomicScope::AGENT: 921 VMCnt |= true; 922 break; 923 case SIAtomicScope::WORKGROUP: 924 case SIAtomicScope::WAVEFRONT: 925 case SIAtomicScope::SINGLETHREAD: 926 // The L1 cache keeps all memory operations in order for 927 // wavefronts in the same work-group. 928 break; 929 default: 930 llvm_unreachable("Unsupported synchronization scope"); 931 } 932 } 933 934 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 935 switch (Scope) { 936 case SIAtomicScope::SYSTEM: 937 case SIAtomicScope::AGENT: 938 case SIAtomicScope::WORKGROUP: 939 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 940 // not needed as LDS operations for all waves are executed in a total 941 // global ordering as observed by all waves. Required if also 942 // synchronizing with global/GDS memory as LDS operations could be 943 // reordered with respect to later global/GDS memory operations of the 944 // same wave. 945 LGKMCnt |= IsCrossAddrSpaceOrdering; 946 break; 947 case SIAtomicScope::WAVEFRONT: 948 case SIAtomicScope::SINGLETHREAD: 949 // The LDS keeps all memory operations in order for 950 // the same wavesfront. 951 break; 952 default: 953 llvm_unreachable("Unsupported synchronization scope"); 954 } 955 } 956 957 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 958 switch (Scope) { 959 case SIAtomicScope::SYSTEM: 960 case SIAtomicScope::AGENT: 961 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 962 // is not needed as GDS operations for all waves are executed in a total 963 // global ordering as observed by all waves. Required if also 964 // synchronizing with global/LDS memory as GDS operations could be 965 // reordered with respect to later global/LDS memory operations of the 966 // same wave. 967 LGKMCnt |= IsCrossAddrSpaceOrdering; 968 break; 969 case SIAtomicScope::WORKGROUP: 970 case SIAtomicScope::WAVEFRONT: 971 case SIAtomicScope::SINGLETHREAD: 972 // The GDS keeps all memory operations in order for 973 // the same work-group. 974 break; 975 default: 976 llvm_unreachable("Unsupported synchronization scope"); 977 } 978 } 979 980 if (VMCnt || LGKMCnt) { 981 unsigned WaitCntImmediate = 982 AMDGPU::encodeWaitcnt(IV, 983 VMCnt ? 0 : getVmcntBitMask(IV), 984 getExpcntBitMask(IV), 985 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 986 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 987 Changed = true; 988 } 989 990 if (Pos == Position::AFTER) 991 --MI; 992 993 return Changed; 994 } 995 996 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 997 SIAtomicScope Scope, 998 SIAtomicAddrSpace AddrSpace, 999 Position Pos) const { 1000 if (!InsertCacheInv) 1001 return false; 1002 1003 bool Changed = false; 1004 1005 MachineBasicBlock &MBB = *MI->getParent(); 1006 DebugLoc DL = MI->getDebugLoc(); 1007 1008 if (Pos == Position::AFTER) 1009 ++MI; 1010 1011 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1012 switch (Scope) { 1013 case SIAtomicScope::SYSTEM: 1014 case SIAtomicScope::AGENT: 1015 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 1016 Changed = true; 1017 break; 1018 case SIAtomicScope::WORKGROUP: 1019 case SIAtomicScope::WAVEFRONT: 1020 case SIAtomicScope::SINGLETHREAD: 1021 // No cache to invalidate. 1022 break; 1023 default: 1024 llvm_unreachable("Unsupported synchronization scope"); 1025 } 1026 } 1027 1028 /// The scratch address space does not need the global memory cache 1029 /// to be flushed as all memory operations by the same thread are 1030 /// sequentially consistent, and no other thread can access scratch 1031 /// memory. 1032 1033 /// Other address spaces do not have a cache. 1034 1035 if (Pos == Position::AFTER) 1036 --MI; 1037 1038 return Changed; 1039 } 1040 1041 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1042 SIAtomicScope Scope, 1043 SIAtomicAddrSpace AddrSpace, 1044 bool IsCrossAddrSpaceOrdering, 1045 Position Pos) const { 1046 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1047 IsCrossAddrSpaceOrdering, Pos); 1048 } 1049 1050 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1051 SIAtomicScope Scope, 1052 SIAtomicAddrSpace AddrSpace, 1053 Position Pos) const { 1054 if (!InsertCacheInv) 1055 return false; 1056 1057 bool Changed = false; 1058 1059 MachineBasicBlock &MBB = *MI->getParent(); 1060 DebugLoc DL = MI->getDebugLoc(); 1061 1062 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 1063 1064 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 1065 ? AMDGPU::BUFFER_WBINVL1 1066 : AMDGPU::BUFFER_WBINVL1_VOL; 1067 1068 if (Pos == Position::AFTER) 1069 ++MI; 1070 1071 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1072 switch (Scope) { 1073 case SIAtomicScope::SYSTEM: 1074 case SIAtomicScope::AGENT: 1075 BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 1076 Changed = true; 1077 break; 1078 case SIAtomicScope::WORKGROUP: 1079 case SIAtomicScope::WAVEFRONT: 1080 case SIAtomicScope::SINGLETHREAD: 1081 // No cache to invalidate. 1082 break; 1083 default: 1084 llvm_unreachable("Unsupported synchronization scope"); 1085 } 1086 } 1087 1088 /// The scratch address space does not need the global memory cache 1089 /// to be flushed as all memory operations by the same thread are 1090 /// sequentially consistent, and no other thread can access scratch 1091 /// memory. 1092 1093 /// Other address spaces do not have a cache. 
1094 1095 if (Pos == Position::AFTER) 1096 --MI; 1097 1098 return Changed; 1099 } 1100 1101 bool SIGfx90ACacheControl::enableLoadCacheBypass( 1102 const MachineBasicBlock::iterator &MI, 1103 SIAtomicScope Scope, 1104 SIAtomicAddrSpace AddrSpace) const { 1105 assert(MI->mayLoad() && !MI->mayStore()); 1106 bool Changed = false; 1107 1108 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1109 switch (Scope) { 1110 case SIAtomicScope::SYSTEM: 1111 Changed |= enableSCCBit(MI); 1112 Changed |= enableGLCBit(MI); 1113 break; 1114 case SIAtomicScope::AGENT: 1115 Changed |= enableGLCBit(MI); 1116 break; 1117 case SIAtomicScope::WORKGROUP: 1118 // In threadgroup split mode the waves of a work-group can be executing on 1119 // different CUs. Therefore need to bypass the L1 which is per CU. 1120 // Otherwise in non-threadgroup split mode all waves of a work-group are 1121 // on the same CU, and so the L1 does not need to be bypassed. 1122 if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI); 1123 break; 1124 case SIAtomicScope::WAVEFRONT: 1125 case SIAtomicScope::SINGLETHREAD: 1126 // No cache to bypass. 1127 break; 1128 default: 1129 llvm_unreachable("Unsupported synchronization scope"); 1130 } 1131 } 1132 1133 /// The scratch address space does not need the global memory caches 1134 /// to be bypassed as all memory operations by the same thread are 1135 /// sequentially consistent, and no other thread can access scratch 1136 /// memory. 1137 1138 /// Other address spaces do not have a cache. 1139 1140 return Changed; 1141 } 1142 1143 bool SIGfx90ACacheControl::enableStoreCacheBypass( 1144 const MachineBasicBlock::iterator &MI, 1145 SIAtomicScope Scope, 1146 SIAtomicAddrSpace AddrSpace) const { 1147 assert(!MI->mayLoad() && MI->mayStore()); 1148 bool Changed = false; 1149 1150 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1151 switch (Scope) { 1152 case SIAtomicScope::SYSTEM: 1153 Changed |= enableSCCBit(MI); 1154 LLVM_FALLTHROUGH; 1155 case SIAtomicScope::AGENT: 1156 /// Do not set glc for store atomic operations as they implicitly write 1157 /// through the L1 cache. 1158 break; 1159 case SIAtomicScope::WORKGROUP: 1160 case SIAtomicScope::WAVEFRONT: 1161 case SIAtomicScope::SINGLETHREAD: 1162 // No cache to bypass. Store atomics implicitly write through the L1 1163 // cache. 1164 break; 1165 default: 1166 llvm_unreachable("Unsupported synchronization scope"); 1167 } 1168 } 1169 1170 /// The scratch address space does not need the global memory caches 1171 /// to be bypassed as all memory operations by the same thread are 1172 /// sequentially consistent, and no other thread can access scratch 1173 /// memory. 1174 1175 /// Other address spaces do not have a cache. 1176 1177 return Changed; 1178 } 1179 1180 bool SIGfx90ACacheControl::enableRMWCacheBypass( 1181 const MachineBasicBlock::iterator &MI, 1182 SIAtomicScope Scope, 1183 SIAtomicAddrSpace AddrSpace) const { 1184 assert(MI->mayLoad() && MI->mayStore()); 1185 bool Changed = false; 1186 1187 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1188 switch (Scope) { 1189 case SIAtomicScope::SYSTEM: 1190 Changed |= enableSCCBit(MI); 1191 LLVM_FALLTHROUGH; 1192 case SIAtomicScope::AGENT: 1193 /// Do not set glc for RMW atomic operations as they implicitly bypass 1194 /// the L1 cache, and the glc bit is instead used to indicate if they are 1195 /// return or no-return. 
1196 break; 1197 case SIAtomicScope::WORKGROUP: 1198 case SIAtomicScope::WAVEFRONT: 1199 case SIAtomicScope::SINGLETHREAD: 1200 // No cache to bypass. RMW atomics implicitly bypass the L1 cache. 1201 break; 1202 default: 1203 llvm_unreachable("Unsupported synchronization scope"); 1204 } 1205 } 1206 1207 return Changed; 1208 } 1209 1210 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( 1211 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1212 bool IsVolatile, bool IsNonTemporal) const { 1213 // Only handle load and store, not atomic read-modify-write insructions. The 1214 // latter use glc to indicate if the atomic returns a result and so must not 1215 // be used for cache control. 1216 assert(MI->mayLoad() ^ MI->mayStore()); 1217 1218 // Only update load and store, not LLVM IR atomic read-modify-write 1219 // instructions. The latter are always marked as volatile so cannot sensibly 1220 // handle it as do not want to pessimize all atomics. Also they do not support 1221 // the nontemporal attribute. 1222 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1223 1224 bool Changed = false; 1225 1226 if (IsVolatile) { 1227 if (Op == SIMemOp::LOAD) { 1228 Changed |= enableGLCBit(MI); 1229 } 1230 Changed |= enableSCCBit(MI); 1231 1232 // Ensure operation has completed at system scope to cause all volatile 1233 // operations to be visible outside the program in a global order. Do not 1234 // request cross address space as only the global address space can be 1235 // observable outside the program, so no need to cause a waitcnt for LDS 1236 // address space operations. 1237 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1238 Position::AFTER); 1239 1240 return Changed; 1241 } 1242 1243 if (IsNonTemporal) { 1244 // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. 1245 Changed |= enableGLCBit(MI); 1246 Changed |= enableSLCBit(MI); 1247 return Changed; 1248 } 1249 1250 return Changed; 1251 } 1252 1253 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, 1254 SIAtomicScope Scope, 1255 SIAtomicAddrSpace AddrSpace, 1256 SIMemOp Op, 1257 bool IsCrossAddrSpaceOrdering, 1258 Position Pos) const { 1259 if (ST.isTgSplitEnabled()) { 1260 // In threadgroup split mode the waves of a work-group can be executing on 1261 // different CUs. Therefore need to wait for global or GDS memory operations 1262 // to complete to ensure they are visible to waves in the other CUs. 1263 // Otherwise in non-threadgroup split mode all waves of a work-group are on 1264 // the same CU, so no need to wait for global memory as all waves in the 1265 // work-group access the same the L1, nor wait for GDS as access are ordered 1266 // on a CU. 1267 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | 1268 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && 1269 (Scope == SIAtomicScope::WORKGROUP)) { 1270 // Same as GFX7 using agent scope. 1271 Scope = SIAtomicScope::AGENT; 1272 } 1273 // In threadgroup split mode LDS cannot be allocated so no need to wait for 1274 // LDS memory operations. 
1275 AddrSpace &= ~SIAtomicAddrSpace::LDS; 1276 } 1277 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1278 IsCrossAddrSpaceOrdering, Pos); 1279 } 1280 1281 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1282 SIAtomicScope Scope, 1283 SIAtomicAddrSpace AddrSpace, 1284 Position Pos) const { 1285 if (!InsertCacheInv) 1286 return false; 1287 1288 bool Changed = false; 1289 1290 MachineBasicBlock &MBB = *MI->getParent(); 1291 DebugLoc DL = MI->getDebugLoc(); 1292 1293 if (Pos == Position::AFTER) 1294 ++MI; 1295 1296 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1297 switch (Scope) { 1298 case SIAtomicScope::SYSTEM: 1299 // Ensures that following loads will not see stale remote VMEM data or 1300 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1301 // CC will never be stale due to the local memory probes. 1302 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1303 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1304 // hardware does not reorder memory operations by the same wave with 1305 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1306 // remove any cache lines of earlier writes by the same wave and ensures 1307 // later reads by the same wave will refetch the cache lines. 1308 Changed = true; 1309 break; 1310 case SIAtomicScope::AGENT: 1311 // Same as GFX7. 1312 break; 1313 case SIAtomicScope::WORKGROUP: 1314 // In threadgroup split mode the waves of a work-group can be executing on 1315 // different CUs. Therefore need to invalidate the L1 which is per CU. 1316 // Otherwise in non-threadgroup split mode all waves of a work-group are 1317 // on the same CU, and so the L1 does not need to be invalidated. 1318 if (ST.isTgSplitEnabled()) { 1319 // Same as GFX7 using agent scope. 1320 Scope = SIAtomicScope::AGENT; 1321 } 1322 break; 1323 case SIAtomicScope::WAVEFRONT: 1324 case SIAtomicScope::SINGLETHREAD: 1325 // Same as GFX7. 1326 break; 1327 default: 1328 llvm_unreachable("Unsupported synchronization scope"); 1329 } 1330 } 1331 1332 /// The scratch address space does not need the global memory cache 1333 /// to be flushed as all memory operations by the same thread are 1334 /// sequentially consistent, and no other thread can access scratch 1335 /// memory. 1336 1337 /// Other address spaces do not have a cache. 1338 1339 if (Pos == Position::AFTER) 1340 --MI; 1341 1342 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1343 1344 return Changed; 1345 } 1346 1347 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1348 SIAtomicScope Scope, 1349 SIAtomicAddrSpace AddrSpace, 1350 bool IsCrossAddrSpaceOrdering, 1351 Position Pos) const { 1352 bool Changed = false; 1353 1354 MachineBasicBlock &MBB = *MI->getParent(); 1355 DebugLoc DL = MI->getDebugLoc(); 1356 1357 if (Pos == Position::AFTER) 1358 ++MI; 1359 1360 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1361 switch (Scope) { 1362 case SIAtomicScope::SYSTEM: 1363 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1364 // hardware does not reorder memory operations by the same wave with 1365 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1366 // to initiate writeback of any dirty cache lines of earlier writes by the 1367 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1368 // writeback has completed. 
1369 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); 1370 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1371 // vmcnt(0)" needed by the "BUFFER_WBL2". 1372 Changed = true; 1373 break; 1374 case SIAtomicScope::AGENT: 1375 case SIAtomicScope::WORKGROUP: 1376 case SIAtomicScope::WAVEFRONT: 1377 case SIAtomicScope::SINGLETHREAD: 1378 // Same as GFX7. 1379 break; 1380 default: 1381 llvm_unreachable("Unsupported synchronization scope"); 1382 } 1383 } 1384 1385 if (Pos == Position::AFTER) 1386 --MI; 1387 1388 Changed |= 1389 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1390 IsCrossAddrSpaceOrdering, Pos); 1391 1392 return Changed; 1393 } 1394 1395 bool SIGfx10CacheControl::enableLoadCacheBypass( 1396 const MachineBasicBlock::iterator &MI, 1397 SIAtomicScope Scope, 1398 SIAtomicAddrSpace AddrSpace) const { 1399 assert(MI->mayLoad() && !MI->mayStore()); 1400 bool Changed = false; 1401 1402 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1403 /// TODO Do not set glc for rmw atomic operations as they 1404 /// implicitly bypass the L0/L1 caches. 1405 1406 switch (Scope) { 1407 case SIAtomicScope::SYSTEM: 1408 case SIAtomicScope::AGENT: 1409 Changed |= enableGLCBit(MI); 1410 Changed |= enableDLCBit(MI); 1411 break; 1412 case SIAtomicScope::WORKGROUP: 1413 // In WGP mode the waves of a work-group can be executing on either CU of 1414 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1415 // CU mode all waves of a work-group are on the same CU, and so the L0 1416 // does not need to be bypassed. 1417 if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI); 1418 break; 1419 case SIAtomicScope::WAVEFRONT: 1420 case SIAtomicScope::SINGLETHREAD: 1421 // No cache to bypass. 1422 break; 1423 default: 1424 llvm_unreachable("Unsupported synchronization scope"); 1425 } 1426 } 1427 1428 /// The scratch address space does not need the global memory caches 1429 /// to be bypassed as all memory operations by the same thread are 1430 /// sequentially consistent, and no other thread can access scratch 1431 /// memory. 1432 1433 /// Other address spaces do not have a cache. 1434 1435 return Changed; 1436 } 1437 1438 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1439 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1440 bool IsVolatile, bool IsNonTemporal) const { 1441 1442 // Only handle load and store, not atomic read-modify-write insructions. The 1443 // latter use glc to indicate if the atomic returns a result and so must not 1444 // be used for cache control. 1445 assert(MI->mayLoad() ^ MI->mayStore()); 1446 1447 // Only update load and store, not LLVM IR atomic read-modify-write 1448 // instructions. The latter are always marked as volatile so cannot sensibly 1449 // handle it as do not want to pessimize all atomics. Also they do not support 1450 // the nontemporal attribute. 1451 assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1452 1453 bool Changed = false; 1454 1455 if (IsVolatile) { 1456 1457 if (Op == SIMemOp::LOAD) { 1458 Changed |= enableGLCBit(MI); 1459 Changed |= enableDLCBit(MI); 1460 } 1461 1462 // Ensure operation has completed at system scope to cause all volatile 1463 // operations to be visible outside the program in a global order. Do not 1464 // request cross address space as only the global address space can be 1465 // observable outside the program, so no need to cause a waitcnt for LDS 1466 // address space operations. 
1467 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1468 Position::AFTER); 1469 return Changed; 1470 } 1471 1472 if (IsNonTemporal) { 1473 // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions. 1474 Changed |= enableSLCBit(MI); 1475 return Changed; 1476 } 1477 1478 return Changed; 1479 } 1480 1481 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1482 SIAtomicScope Scope, 1483 SIAtomicAddrSpace AddrSpace, 1484 SIMemOp Op, 1485 bool IsCrossAddrSpaceOrdering, 1486 Position Pos) const { 1487 bool Changed = false; 1488 1489 MachineBasicBlock &MBB = *MI->getParent(); 1490 DebugLoc DL = MI->getDebugLoc(); 1491 1492 if (Pos == Position::AFTER) 1493 ++MI; 1494 1495 bool VMCnt = false; 1496 bool VSCnt = false; 1497 bool LGKMCnt = false; 1498 1499 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1500 SIAtomicAddrSpace::NONE) { 1501 switch (Scope) { 1502 case SIAtomicScope::SYSTEM: 1503 case SIAtomicScope::AGENT: 1504 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1505 VMCnt |= true; 1506 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1507 VSCnt |= true; 1508 break; 1509 case SIAtomicScope::WORKGROUP: 1510 // In WGP mode the waves of a work-group can be executing on either CU of 1511 // the WGP. Therefore need to wait for operations to complete to ensure 1512 // they are visible to waves in the other CU as the L0 is per CU. 1513 // Otherwise in CU mode and all waves of a work-group are on the same CU 1514 // which shares the same L0. 1515 if (!ST.isCuModeEnabled()) { 1516 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 1517 VMCnt |= true; 1518 if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 1519 VSCnt |= true; 1520 } 1521 break; 1522 case SIAtomicScope::WAVEFRONT: 1523 case SIAtomicScope::SINGLETHREAD: 1524 // The L0 cache keeps all memory operations in order for 1525 // work-items in the same wavefront. 1526 break; 1527 default: 1528 llvm_unreachable("Unsupported synchronization scope"); 1529 } 1530 } 1531 1532 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 1533 switch (Scope) { 1534 case SIAtomicScope::SYSTEM: 1535 case SIAtomicScope::AGENT: 1536 case SIAtomicScope::WORKGROUP: 1537 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 1538 // not needed as LDS operations for all waves are executed in a total 1539 // global ordering as observed by all waves. Required if also 1540 // synchronizing with global/GDS memory as LDS operations could be 1541 // reordered with respect to later global/GDS memory operations of the 1542 // same wave. 1543 LGKMCnt |= IsCrossAddrSpaceOrdering; 1544 break; 1545 case SIAtomicScope::WAVEFRONT: 1546 case SIAtomicScope::SINGLETHREAD: 1547 // The LDS keeps all memory operations in order for 1548 // the same wavesfront. 1549 break; 1550 default: 1551 llvm_unreachable("Unsupported synchronization scope"); 1552 } 1553 } 1554 1555 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 1556 switch (Scope) { 1557 case SIAtomicScope::SYSTEM: 1558 case SIAtomicScope::AGENT: 1559 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)" 1560 // is not needed as GDS operations for all waves are executed in a total 1561 // global ordering as observed by all waves. Required if also 1562 // synchronizing with global/LDS memory as GDS operations could be 1563 // reordered with respect to later global/LDS memory operations of the 1564 // same wave. 
1565 LGKMCnt |= IsCrossAddrSpaceOrdering; 1566 break; 1567 case SIAtomicScope::WORKGROUP: 1568 case SIAtomicScope::WAVEFRONT: 1569 case SIAtomicScope::SINGLETHREAD: 1570 // The GDS keeps all memory operations in order for 1571 // the same work-group. 1572 break; 1573 default: 1574 llvm_unreachable("Unsupported synchronization scope"); 1575 } 1576 } 1577 1578 if (VMCnt || LGKMCnt) { 1579 unsigned WaitCntImmediate = 1580 AMDGPU::encodeWaitcnt(IV, 1581 VMCnt ? 0 : getVmcntBitMask(IV), 1582 getExpcntBitMask(IV), 1583 LGKMCnt ? 0 : getLgkmcntBitMask(IV)); 1584 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 1585 Changed = true; 1586 } 1587 1588 if (VSCnt) { 1589 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1590 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1591 .addImm(0); 1592 Changed = true; 1593 } 1594 1595 if (Pos == Position::AFTER) 1596 --MI; 1597 1598 return Changed; 1599 } 1600 1601 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1602 SIAtomicScope Scope, 1603 SIAtomicAddrSpace AddrSpace, 1604 Position Pos) const { 1605 if (!InsertCacheInv) 1606 return false; 1607 1608 bool Changed = false; 1609 1610 MachineBasicBlock &MBB = *MI->getParent(); 1611 DebugLoc DL = MI->getDebugLoc(); 1612 1613 if (Pos == Position::AFTER) 1614 ++MI; 1615 1616 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1617 switch (Scope) { 1618 case SIAtomicScope::SYSTEM: 1619 case SIAtomicScope::AGENT: 1620 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1621 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 1622 Changed = true; 1623 break; 1624 case SIAtomicScope::WORKGROUP: 1625 // In WGP mode the waves of a work-group can be executing on either CU of 1626 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 1627 // in CU mode and all waves of a work-group are on the same CU, and so the 1628 // L0 does not need to be invalidated. 1629 if (!ST.isCuModeEnabled()) { 1630 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 1631 Changed = true; 1632 } 1633 break; 1634 case SIAtomicScope::WAVEFRONT: 1635 case SIAtomicScope::SINGLETHREAD: 1636 // No cache to invalidate. 1637 break; 1638 default: 1639 llvm_unreachable("Unsupported synchronization scope"); 1640 } 1641 } 1642 1643 /// The scratch address space does not need the global memory cache 1644 /// to be flushed as all memory operations by the same thread are 1645 /// sequentially consistent, and no other thread can access scratch 1646 /// memory. 1647 1648 /// Other address spaces do not have a cache. 

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
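
// Worked example for expandLoad above (a sketch assuming the gfx10 cache
// control defined earlier in this file; other subtargets differ): for an
// agent-scope acquire atomic load, enableLoadCacheBypass marks the load to
// bypass the per-CU caches, insertWait then adds a wait on the load's
// completion immediately after it, and insertAcquire follows with the
// BUFFER_GL0_INV/BUFFER_GL1_INV pair. A seq_cst load additionally gets a
// full LOAD|STORE wait inserted before it.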

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      // TODO: This relies on a barrier always generating a waitcnt
      // for LDS to ensure it is not reordered with the completion of
      // the preceding LDS operations. If the barrier had a memory
      // ordering and memory scope, then the library would not need to
      // generate a fence. Could add support in this file for
      // barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      // adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening, they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
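
// Worked example for expandAtomicFence above (a sketch, not a guarantee of
// the exact instructions on every subtarget): an agent-scope acq_rel fence
// first gets a release via insertRelease (on gfx10 this is expected to amount
// to waiting on the outstanding memory counters) and then an acquire via
// insertAcquire (BUFFER_GL0_INV/BUFFER_GL1_INV on gfx10), both inserted
// before the ATOMIC_FENCE pseudo. The pseudo itself is queued in
// AtomicPseudoMIs and deleted by removeAtomicPseudoMIs at the end of the
// pass.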

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
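
// Usage note (illustrative; the pipeline wiring lives in the target machine,
// not in this file): the GCN target is expected to add this pass late, after
// the post-RA scheduler, along the lines of
//   addPass(createSIMemoryLegalizerPass());
// so that the bundle handling in runOnMachineFunction sees post-RA bundles.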