1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Memory legalizer - implements memory model. More information can be 11 /// found here: 12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUMachineModuleInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "llvm/ADT/BitmaskEnum.h" 20 #include "llvm/CodeGen/MachineBasicBlock.h" 21 #include "llvm/IR/DiagnosticInfo.h" 22 #include "llvm/Support/AtomicOrdering.h" 23 #include "llvm/Support/TargetParser.h" 24 25 using namespace llvm; 26 using namespace llvm::AMDGPU; 27 28 #define DEBUG_TYPE "si-memory-legalizer" 29 #define PASS_NAME "SI Memory Legalizer" 30 31 static cl::opt<bool> AmdgcnSkipCacheInvalidations( 32 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 33 cl::desc("Use this to skip inserting cache invalidating instructions.")); 34 35 namespace { 36 37 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 38 39 /// Memory operation flags. Can be ORed together. 40 enum class SIMemOp { 41 NONE = 0u, 42 LOAD = 1u << 0, 43 STORE = 1u << 1, 44 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) 45 }; 46 47 /// Position to insert a new instruction relative to an existing 48 /// instruction. 49 enum class Position { 50 BEFORE, 51 AFTER 52 }; 53 54 /// The atomic synchronization scopes supported by the AMDGPU target. 55 enum class SIAtomicScope { 56 NONE, 57 SINGLETHREAD, 58 WAVEFRONT, 59 WORKGROUP, 60 AGENT, 61 SYSTEM 62 }; 63 64 /// The distinct address spaces supported by the AMDGPU target for 65 /// atomic memory operation. 
Can be ORed together. 66 enum class SIAtomicAddrSpace { 67 NONE = 0u, 68 GLOBAL = 1u << 0, 69 LDS = 1u << 1, 70 SCRATCH = 1u << 2, 71 GDS = 1u << 3, 72 OTHER = 1u << 4, 73 74 /// The address spaces that can be accessed by a FLAT instruction. 75 FLAT = GLOBAL | LDS | SCRATCH, 76 77 /// The address spaces that support atomic instructions. 78 ATOMIC = GLOBAL | LDS | SCRATCH | GDS, 79 80 /// All address spaces. 81 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, 82 83 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) 84 }; 85 86 /// Sets named bit \p BitName to "true" if present in instruction \p MI. 87 /// \returns Returns true if \p MI is modified, false otherwise. 88 template <uint16_t BitName> 89 bool enableNamedBit(const MachineBasicBlock::iterator &MI) { 90 int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); 91 if (BitIdx == -1) 92 return false; 93 94 MachineOperand &Bit = MI->getOperand(BitIdx); 95 if (Bit.getImm() != 0) 96 return false; 97 98 Bit.setImm(1); 99 return true; 100 } 101 102 class SIMemOpInfo final { 103 private: 104 105 friend class SIMemOpAccess; 106 107 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 108 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 109 SIAtomicScope Scope = SIAtomicScope::SYSTEM; 110 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 111 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 112 bool IsCrossAddressSpaceOrdering = false; 113 bool IsNonTemporal = false; 114 115 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, 116 SIAtomicScope Scope = SIAtomicScope::SYSTEM, 117 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, 118 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, 119 bool IsCrossAddressSpaceOrdering = true, 120 AtomicOrdering FailureOrdering = 121 AtomicOrdering::SequentiallyConsistent, 122 bool IsNonTemporal = false) 123 : Ordering(Ordering), FailureOrdering(FailureOrdering), 124 Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), 125 InstrAddrSpace(InstrAddrSpace), 126 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), 127 IsNonTemporal(IsNonTemporal) { 128 // There is also no cross address space ordering if the ordering 129 // address space is the same as the instruction address space and 130 // only contains a single address space. 131 if ((OrderingAddrSpace == InstrAddrSpace) && 132 isPowerOf2_32(uint32_t(InstrAddrSpace))) 133 this->IsCrossAddressSpaceOrdering = false; 134 } 135 136 public: 137 /// \returns Atomic synchronization scope of the machine instruction used to 138 /// create this SIMemOpInfo. 139 SIAtomicScope getScope() const { 140 return Scope; 141 } 142 143 /// \returns Ordering constraint of the machine instruction used to 144 /// create this SIMemOpInfo. 145 AtomicOrdering getOrdering() const { 146 return Ordering; 147 } 148 149 /// \returns Failure ordering constraint of the machine instruction used to 150 /// create this SIMemOpInfo. 151 AtomicOrdering getFailureOrdering() const { 152 return FailureOrdering; 153 } 154 155 /// \returns The address spaces accessed by the machine 156 /// instruction used to create this SIMemOpInfo. 157 SIAtomicAddrSpace getInstrAddrSpace() const { 158 return InstrAddrSpace; 159 } 160 161 /// \returns The address spaces that must be ordered by the machine 162 /// instruction used to create this SIMemOpInfo. 163 SIAtomicAddrSpace getOrderingAddrSpace() const { 164 return OrderingAddrSpace; 165 } 166 167 /// \returns Return true iff memory ordering of operations on 168 /// different address spaces is required. 169 bool getIsCrossAddressSpaceOrdering() const { 170 return IsCrossAddressSpaceOrdering; 171 } 172 173 /// \returns True if memory access of the machine instruction used to 174 /// create this SIMemOpInfo is non-temporal, false otherwise.
175 bool isNonTemporal() const { 176 return IsNonTemporal; 177 } 178 179 /// \returns True if ordering constraint of the machine instruction used to 180 /// create this SIMemOpInfo is unordered or higher, false otherwise. 181 bool isAtomic() const { 182 return Ordering != AtomicOrdering::NotAtomic; 183 } 184 185 }; 186 187 class SIMemOpAccess final { 188 private: 189 AMDGPUMachineModuleInfo *MMI = nullptr; 190 191 /// Reports unsupported message \p Msg for \p MI to LLVM context. 192 void reportUnsupported(const MachineBasicBlock::iterator &MI, 193 const char *Msg) const; 194 195 /// Inspects the target synchronization scope \p SSID and determines 196 /// the SI atomic scope it corresponds to, the address spaces it 197 /// covers, and whether the memory ordering applies between address 198 /// spaces. 199 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 200 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const; 201 202 /// \return Return a bit set of the address spaces accessed by \p AS. 203 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; 204 205 /// \returns Info constructed from \p MI, which has at least machine memory 206 /// operand. 207 Optional<SIMemOpInfo> constructFromMIWithMMO( 208 const MachineBasicBlock::iterator &MI) const; 209 210 public: 211 /// Construct class to support accessing the machine memory operands 212 /// of instructions in the machine function \p MF. 213 SIMemOpAccess(MachineFunction &MF); 214 215 /// \returns Load info if \p MI is a load operation, "None" otherwise. 216 Optional<SIMemOpInfo> getLoadInfo( 217 const MachineBasicBlock::iterator &MI) const; 218 219 /// \returns Store info if \p MI is a store operation, "None" otherwise. 220 Optional<SIMemOpInfo> getStoreInfo( 221 const MachineBasicBlock::iterator &MI) const; 222 223 /// \returns Atomic fence info if \p MI is an atomic fence operation, 224 /// "None" otherwise.
225 Optional<SIMemOpInfo> getAtomicFenceInfo( 226 const MachineBasicBlock::iterator &MI) const; 227 228 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or 229 /// rmw operation, "None" otherwise. 230 Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( 231 const MachineBasicBlock::iterator &MI) const; 232 }; 233 234 class SICacheControl { 235 protected: 236 237 /// AMDGPU subtarget info. 238 const GCNSubtarget &ST; 239 240 /// Instruction info. 241 const SIInstrInfo *TII = nullptr; 242 243 IsaVersion IV; 244 245 /// Whether to insert cache invalidating instructions. 246 bool InsertCacheInv; 247 248 SICacheControl(const GCNSubtarget &ST); 249 250 public: 251 252 /// Create a cache control for the subtarget \p ST. 253 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); 254 255 /// Update \p MI memory load instruction to bypass any caches up to 256 /// the \p Scope memory scope for address spaces \p 257 /// AddrSpace. Return true iff the instruction was modified. 258 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 259 SIAtomicScope Scope, 260 SIAtomicAddrSpace AddrSpace) const = 0; 261 262 /// Update \p MI memory instruction to indicate it is 263 /// nontemporal. Return true iff the instruction was modified. 264 virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) 265 const = 0; 266 267 /// Inserts any necessary instructions at position \p Pos relative 268 /// to instruction \p MI to ensure memory instructions before \p Pos of kind 269 /// \p Op associated with address spaces \p AddrSpace have completed. Used 270 /// between memory instructions to enforce the order they become visible as 271 /// observed by other memory instructions executing in memory scope \p Scope. 272 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between 273 /// address spaces. Returns true iff any instructions inserted. 
274 virtual bool insertWait(MachineBasicBlock::iterator &MI, 275 SIAtomicScope Scope, 276 SIAtomicAddrSpace AddrSpace, 277 SIMemOp Op, 278 bool IsCrossAddrSpaceOrdering, 279 Position Pos) const = 0; 280 281 /// Inserts any necessary instructions at position \p Pos relative to 282 /// instruction \p MI to ensure any subsequent memory instructions of this 283 /// thread with address spaces \p AddrSpace will observe the previous memory 284 /// operations by any thread for memory scopes up to memory scope \p Scope . 285 /// Returns true iff any instructions inserted. 286 virtual bool insertAcquire(MachineBasicBlock::iterator &MI, 287 SIAtomicScope Scope, 288 SIAtomicAddrSpace AddrSpace, 289 Position Pos) const = 0; 290 291 /// Inserts any necessary instructions at position \p Pos relative to 292 /// instruction \p MI to ensure previous memory instructions by this thread 293 /// with address spaces \p AddrSpace have completed and can be observed by 294 /// subsequent memory instructions by any thread executing in memory scope \p 295 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is 296 /// between address spaces. Returns true iff any instructions inserted. 297 virtual bool insertRelease(MachineBasicBlock::iterator &MI, 298 SIAtomicScope Scope, 299 SIAtomicAddrSpace AddrSpace, 300 bool IsCrossAddrSpaceOrdering, 301 Position Pos) const = 0; 302 303 /// Virtual destructor to allow derivations to be deleted. 304 virtual ~SICacheControl() = default; 305 306 }; 307 308 class SIGfx6CacheControl : public SICacheControl { 309 protected: 310 311 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI 312 /// is modified, false otherwise. 313 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { 314 return enableNamedBit<AMDGPU::OpName::glc>(MI); 315 } 316 317 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI 318 /// is modified, false otherwise. 
319 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { 320 return enableNamedBit<AMDGPU::OpName::slc>(MI); 321 } 322 323 public: 324 325 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}; 326 327 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 328 SIAtomicScope Scope, 329 SIAtomicAddrSpace AddrSpace) const override; 330 331 bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; 332 333 bool insertWait(MachineBasicBlock::iterator &MI, 334 SIAtomicScope Scope, 335 SIAtomicAddrSpace AddrSpace, 336 SIMemOp Op, 337 bool IsCrossAddrSpaceOrdering, 338 Position Pos) const override; 339 340 bool insertAcquire(MachineBasicBlock::iterator &MI, 341 SIAtomicScope Scope, 342 SIAtomicAddrSpace AddrSpace, 343 Position Pos) const override; 344 345 bool insertRelease(MachineBasicBlock::iterator &MI, 346 SIAtomicScope Scope, 347 SIAtomicAddrSpace AddrSpace, 348 bool IsCrossAddrSpaceOrdering, 349 Position Pos) const override; 350 }; 351 352 class SIGfx7CacheControl : public SIGfx6CacheControl { 353 public: 354 355 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; 356 357 bool insertAcquire(MachineBasicBlock::iterator &MI, 358 SIAtomicScope Scope, 359 SIAtomicAddrSpace AddrSpace, 360 Position Pos) const override; 361 362 }; 363 364 class SIGfx10CacheControl : public SIGfx7CacheControl { 365 protected: 366 367 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI 368 /// is modified, false otherwise. 
369 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { 370 return enableNamedBit<AMDGPU::OpName::dlc>(MI); 371 } 372 373 public: 374 375 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; 376 377 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, 378 SIAtomicScope Scope, 379 SIAtomicAddrSpace AddrSpace) const override; 380 381 bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; 382 383 bool insertWait(MachineBasicBlock::iterator &MI, 384 SIAtomicScope Scope, 385 SIAtomicAddrSpace AddrSpace, 386 SIMemOp Op, 387 bool IsCrossAddrSpaceOrdering, 388 Position Pos) const override; 389 390 bool insertAcquire(MachineBasicBlock::iterator &MI, 391 SIAtomicScope Scope, 392 SIAtomicAddrSpace AddrSpace, 393 Position Pos) const override; 394 }; 395 396 class SIMemoryLegalizer final : public MachineFunctionPass { 397 private: 398 399 /// Cache Control. 400 std::unique_ptr<SICacheControl> CC = nullptr; 401 402 /// List of atomic pseudo instructions. 403 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; 404 405 /// Return true iff instruction \p MI is an atomic instruction that 406 /// returns a result. 407 bool isAtomicRet(const MachineInstr &MI) const { 408 return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1; 409 } 410 411 /// Removes all processed atomic pseudo instructions from the current 412 /// function. Returns true if current function is modified, false otherwise. 413 bool removeAtomicPseudoMIs(); 414 415 /// Expands load operation \p MI. Returns true if instructions are 416 /// added/deleted or \p MI is modified, false otherwise. 417 bool expandLoad(const SIMemOpInfo &MOI, 418 MachineBasicBlock::iterator &MI); 419 /// Expands store operation \p MI. Returns true if instructions are 420 /// added/deleted or \p MI is modified, false otherwise. 421 bool expandStore(const SIMemOpInfo &MOI, 422 MachineBasicBlock::iterator &MI); 423 /// Expands atomic fence operation \p MI.
Returns true if 424 /// instructions are added/deleted or \p MI is modified, false otherwise. 425 bool expandAtomicFence(const SIMemOpInfo &MOI, 426 MachineBasicBlock::iterator &MI); 427 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if 428 /// instructions are added/deleted or \p MI is modified, false otherwise. 429 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 430 MachineBasicBlock::iterator &MI); 431 432 public: 433 static char ID; 434 435 SIMemoryLegalizer() : MachineFunctionPass(ID) {} 436 437 void getAnalysisUsage(AnalysisUsage &AU) const override { 438 AU.setPreservesCFG(); 439 MachineFunctionPass::getAnalysisUsage(AU); 440 } 441 442 StringRef getPassName() const override { 443 return PASS_NAME; 444 } 445 446 bool runOnMachineFunction(MachineFunction &MF) override; 447 }; 448 449 } // end anonymous namespace 450 451 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, 452 const char *Msg) const { 453 const Function &Func = MI->getParent()->getParent()->getFunction(); 454 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); 455 Func.getContext().diagnose(Diag); 456 } 457 458 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> 459 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, 460 SIAtomicAddrSpace InstrScope) const { 461 if (SSID == SyncScope::System) 462 return std::make_tuple(SIAtomicScope::SYSTEM, 463 SIAtomicAddrSpace::ATOMIC, 464 true); 465 if (SSID == MMI->getAgentSSID()) 466 return std::make_tuple(SIAtomicScope::AGENT, 467 SIAtomicAddrSpace::ATOMIC, 468 true); 469 if (SSID == MMI->getWorkgroupSSID()) 470 return std::make_tuple(SIAtomicScope::WORKGROUP, 471 SIAtomicAddrSpace::ATOMIC, 472 true); 473 if (SSID == MMI->getWavefrontSSID()) 474 return std::make_tuple(SIAtomicScope::WAVEFRONT, 475 SIAtomicAddrSpace::ATOMIC, 476 true); 477 if (SSID == SyncScope::SingleThread) 478 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 479 SIAtomicAddrSpace::ATOMIC, 480 true); 481 if (SSID == 
MMI->getSystemOneAddressSpaceSSID()) 482 return std::make_tuple(SIAtomicScope::SYSTEM, 483 SIAtomicAddrSpace::ATOMIC & InstrScope, 484 false); 485 if (SSID == MMI->getAgentOneAddressSpaceSSID()) 486 return std::make_tuple(SIAtomicScope::AGENT, 487 SIAtomicAddrSpace::ATOMIC & InstrScope, 488 false); 489 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) 490 return std::make_tuple(SIAtomicScope::WORKGROUP, 491 SIAtomicAddrSpace::ATOMIC & InstrScope, 492 false); 493 if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) 494 return std::make_tuple(SIAtomicScope::WAVEFRONT, 495 SIAtomicAddrSpace::ATOMIC & InstrScope, 496 false); 497 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) 498 return std::make_tuple(SIAtomicScope::SINGLETHREAD, 499 SIAtomicAddrSpace::ATOMIC & InstrScope, 500 false); 501 return None; 502 } 503 504 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { 505 if (AS == AMDGPUAS::FLAT_ADDRESS) 506 return SIAtomicAddrSpace::FLAT; 507 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 508 return SIAtomicAddrSpace::GLOBAL; 509 if (AS == AMDGPUAS::LOCAL_ADDRESS) 510 return SIAtomicAddrSpace::LDS; 511 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 512 return SIAtomicAddrSpace::SCRATCH; 513 if (AS == AMDGPUAS::REGION_ADDRESS) 514 return SIAtomicAddrSpace::GDS; 515 516 return SIAtomicAddrSpace::OTHER; 517 } 518 519 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { 520 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); 521 } 522 523 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( 524 const MachineBasicBlock::iterator &MI) const { 525 assert(MI->getNumMemOperands() > 0); 526 527 SyncScope::ID SSID = SyncScope::SingleThread; 528 AtomicOrdering Ordering = AtomicOrdering::NotAtomic; 529 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; 530 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; 531 bool IsNonTemporal = true; 532 533 // Validator should check whether or not MMOs cover the entire set of 534 // locations 
accessed by the memory instruction. 535 for (const auto &MMO : MI->memoperands()) { 536 IsNonTemporal &= MMO->isNonTemporal(); 537 InstrAddrSpace |= 538 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); 539 AtomicOrdering OpOrdering = MMO->getOrdering(); 540 if (OpOrdering != AtomicOrdering::NotAtomic) { 541 const auto &IsSyncScopeInclusion = 542 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); 543 if (!IsSyncScopeInclusion) { 544 reportUnsupported(MI, 545 "Unsupported non-inclusive atomic synchronization scope"); 546 return None; 547 } 548 549 SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); 550 Ordering = 551 isStrongerThan(Ordering, OpOrdering) ? 552 Ordering : MMO->getOrdering(); 553 assert(MMO->getFailureOrdering() != AtomicOrdering::Release && 554 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); 555 FailureOrdering = 556 isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? 557 FailureOrdering : MMO->getFailureOrdering(); 558 } 559 } 560 561 SIAtomicScope Scope = SIAtomicScope::NONE; 562 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 563 bool IsCrossAddressSpaceOrdering = false; 564 if (Ordering != AtomicOrdering::NotAtomic) { 565 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); 566 if (!ScopeOrNone) { 567 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 568 return None; 569 } 570 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 571 ScopeOrNone.getValue(); 572 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 573 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 574 reportUnsupported(MI, "Unsupported atomic address space"); 575 return None; 576 } 577 } 578 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, 579 IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal); 580 } 581 582 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( 583 const MachineBasicBlock::iterator &MI) const { 584 
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 585 586 if (!(MI->mayLoad() && !MI->mayStore())) 587 return None; 588 589 // Be conservative if there are no memory operands. 590 if (MI->getNumMemOperands() == 0) 591 return SIMemOpInfo(); 592 593 return constructFromMIWithMMO(MI); 594 } 595 596 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( 597 const MachineBasicBlock::iterator &MI) const { 598 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 599 600 if (!(!MI->mayLoad() && MI->mayStore())) 601 return None; 602 603 // Be conservative if there are no memory operands. 604 if (MI->getNumMemOperands() == 0) 605 return SIMemOpInfo(); 606 607 return constructFromMIWithMMO(MI); 608 } 609 610 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( 611 const MachineBasicBlock::iterator &MI) const { 612 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 613 614 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) 615 return None; 616 617 AtomicOrdering Ordering = 618 static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); 619 620 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); 621 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); 622 if (!ScopeOrNone) { 623 reportUnsupported(MI, "Unsupported atomic synchronization scope"); 624 return None; 625 } 626 627 SIAtomicScope Scope = SIAtomicScope::NONE; 628 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; 629 bool IsCrossAddressSpaceOrdering = false; 630 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = 631 ScopeOrNone.getValue(); 632 633 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || 634 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { 635 reportUnsupported(MI, "Unsupported atomic address space"); 636 return None; 637 } 638 639 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, 640 IsCrossAddressSpaceOrdering); 641 } 642 643 Optional<SIMemOpInfo> 
SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( 644 const MachineBasicBlock::iterator &MI) const { 645 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); 646 647 if (!(MI->mayLoad() && MI->mayStore())) 648 return None; 649 650 // Be conservative if there are no memory operands. 651 if (MI->getNumMemOperands() == 0) 652 return SIMemOpInfo(); 653 654 return constructFromMIWithMMO(MI); 655 } 656 657 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 658 TII = ST.getInstrInfo(); 659 IV = getIsaVersion(ST.getCPU()); 660 InsertCacheInv = !AmdgcnSkipCacheInvalidations; 661 } 662 663 /* static */ 664 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 665 GCNSubtarget::Generation Generation = ST.getGeneration(); 666 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 667 return std::make_unique<SIGfx6CacheControl>(ST); 668 if (Generation < AMDGPUSubtarget::GFX10) 669 return std::make_unique<SIGfx7CacheControl>(ST); 670 return std::make_unique<SIGfx10CacheControl>(ST); 671 } 672 673 bool SIGfx6CacheControl::enableLoadCacheBypass( 674 const MachineBasicBlock::iterator &MI, 675 SIAtomicScope Scope, 676 SIAtomicAddrSpace AddrSpace) const { 677 assert(MI->mayLoad() && !MI->mayStore()); 678 bool Changed = false; 679 680 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 681 switch (Scope) { 682 case SIAtomicScope::SYSTEM: 683 case SIAtomicScope::AGENT: 684 Changed |= enableGLCBit(MI); 685 break; 686 case SIAtomicScope::WORKGROUP: 687 case SIAtomicScope::WAVEFRONT: 688 case SIAtomicScope::SINGLETHREAD: 689 // No cache to bypass. 690 break; 691 default: 692 llvm_unreachable("Unsupported synchronization scope"); 693 } 694 } 695 696 /// The scratch address space does not need the global memory caches 697 /// to be bypassed as all memory operations by the same thread are 698 /// sequentially consistent, and no other thread can access scratch 699 /// memory. 700 701 /// Other address spaces do not have a cache. 
702 703 return Changed; 704 } 705 706 bool SIGfx6CacheControl::enableNonTemporal( 707 const MachineBasicBlock::iterator &MI) const { 708 assert(MI->mayLoad() ^ MI->mayStore()); 709 bool Changed = false; 710 711 /// TODO: Do not enableGLCBit if rmw atomic. 712 Changed |= enableGLCBit(MI); 713 Changed |= enableSLCBit(MI); 714 715 return Changed; 716 } 717 718 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 719 SIAtomicScope Scope, 720 SIAtomicAddrSpace AddrSpace, 721 SIMemOp Op, 722 bool IsCrossAddrSpaceOrdering, 723 Position Pos) const { 724 bool Changed = false; 725 726 MachineBasicBlock &MBB = *MI->getParent(); 727 DebugLoc DL = MI->getDebugLoc(); 728 729 if (Pos == Position::AFTER) 730 ++MI; 731 732 bool VMCnt = false; 733 bool LGKMCnt = false; 734 735 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 736 switch (Scope) { 737 case SIAtomicScope::SYSTEM: 738 case SIAtomicScope::AGENT: 739 VMCnt |= true; 740 break; 741 case SIAtomicScope::WORKGROUP: 742 case SIAtomicScope::WAVEFRONT: 743 case SIAtomicScope::SINGLETHREAD: 744 // The L1 cache keeps all memory operations in order for 745 // wavefronts in the same work-group. 746 break; 747 default: 748 llvm_unreachable("Unsupported synchronization scope"); 749 } 750 } 751 752 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 753 switch (Scope) { 754 case SIAtomicScope::SYSTEM: 755 case SIAtomicScope::AGENT: 756 case SIAtomicScope::WORKGROUP: 757 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 758 // not needed as LDS operations for all waves are executed in a total 759 // global ordering as observed by all waves. Required if also 760 // synchronizing with global/GDS memory as LDS operations could be 761 // reordered with respect to later global/GDS memory operations of the 762 // same wave. 
763 LGKMCnt |= IsCrossAddrSpaceOrdering; 764 break; 765 case SIAtomicScope::WAVEFRONT: 766 case SIAtomicScope::SINGLETHREAD: 767 // The LDS keeps all memory operations in order for 768 // the same wavefront. 769 break; 770 default: 771 llvm_unreachable("Unsupported synchronization scope"); 772 } 773 } 774 775 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 776 switch (Scope) { 777 case SIAtomicScope::SYSTEM: 778 case SIAtomicScope::AGENT: 779 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" 780 // is not needed as GDS operations for all waves are executed in a total 781 // global ordering as observed by all waves. Required if also 782 // synchronizing with global/LDS memory as GDS operations could be 783 // reordered with respect to later global/LDS memory operations of the 784 // same wave. 785 LGKMCnt |= IsCrossAddrSpaceOrdering; 786 break; 787 case SIAtomicScope::WORKGROUP: 788 case SIAtomicScope::WAVEFRONT: 789 case SIAtomicScope::SINGLETHREAD: 790 // The GDS keeps all memory operations in order for 791 // the same work-group. 792 break; 793 default: 794 llvm_unreachable("Unsupported synchronization scope"); 795 } 796 } 797 798 if (VMCnt || LGKMCnt) { 799 unsigned WaitCntImmediate = 800 AMDGPU::encodeWaitcnt(IV, 801 VMCnt ? 0 : getVmcntBitMask(IV), 802 getExpcntBitMask(IV), 803 LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); 804 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); 805 Changed = true; 806 } 807 808 if (Pos == Position::AFTER) 809 --MI; 810 811 return Changed; 812 } 813 814 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 815 SIAtomicScope Scope, 816 SIAtomicAddrSpace AddrSpace, 817 Position Pos) const { 818 if (!InsertCacheInv) 819 return false; 820 821 bool Changed = false; 822 823 MachineBasicBlock &MBB = *MI->getParent(); 824 DebugLoc DL = MI->getDebugLoc(); 825 826 if (Pos == Position::AFTER) 827 ++MI; 828 829 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 830 switch (Scope) { 831 case SIAtomicScope::SYSTEM: 832 case SIAtomicScope::AGENT: 833 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 834 Changed = true; 835 break; 836 case SIAtomicScope::WORKGROUP: 837 case SIAtomicScope::WAVEFRONT: 838 case SIAtomicScope::SINGLETHREAD: 839 // No cache to invalidate. 840 break; 841 default: 842 llvm_unreachable("Unsupported synchronization scope"); 843 } 844 } 845 846 /// The scratch address space does not need the global memory cache 847 /// to be flushed as all memory operations by the same thread are 848 /// sequentially consistent, and no other thread can access scratch 849 /// memory. 850 851 /// Other address spaces do not have a cache. 
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

/// On gfx6 a release is implemented purely as a wait: flush outstanding
/// loads and stores so prior memory operations are visible before the
/// releasing operation.
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

/// Insert the cache invalidation needed for an acquire on gfx7: a
/// BUFFER_WBINVL1(_VOL) for system/agent scope when global memory is in the
/// ordering address space. Returns true if any instruction was inserted.
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  // Graphics OSes use the non-volatile variant of the L1 write-back
  // invalidate; compute uses the _VOL form.
  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

/// Make an atomic load at \p MI bypass the gfx10 caches as required by its
/// \p Scope by setting the GLC (and for system/agent also DLC) bits.
/// Returns true if \p MI was modified.
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

/// Mark a non-atomic load or store at \p MI as non-temporal by setting its
/// SLC bit. Returns true if \p MI was modified.
bool SIGfx10CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  Changed |= enableSLCBit(MI);
  /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)

  return Changed;
}

/// Insert the waits required on gfx10 to order memory operations in
/// \p AddrSpace of kind \p Op at scope \p Scope, either before or after
/// \p MI per \p Pos. On gfx10 loads and stores retire through separate
/// counters (vmcnt vs vscnt), so both may be needed. Returns true if any
/// instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // Encode a waitcnt of zero for each required counter; counters that are
    // not waited on keep their maximum (no-wait) mask value.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    // Stores retire through the separate vscnt counter on gfx10.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

/// Insert the cache invalidations needed for an acquire on gfx10: GL0 and
/// GL1 invalidates for system/agent scope, and GL0 alone for workgroup scope
/// in WGP mode. Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

/// Erase all collected atomic pseudo instructions (e.g. ATOMIC_FENCE) once
/// their memory-model effects have been expanded. Returns true if anything
/// was removed.
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

/// Expand the memory-model requirements of an atomic (or non-temporal) load
/// at \p MI: cache bypass for its scope, a preceding wait for seq_cst, and a
/// trailing wait plus cache invalidate for acquire semantics.
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

/// Expand the memory-model requirements of an atomic (or non-temporal) store
/// at \p MI: a release fence before the store for release/seq_cst orderings.
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

/// Expand an ATOMIC_FENCE pseudo at \p MI into release and/or acquire
/// sequences per its ordering, then queue the pseudo itself for removal.
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the proceeding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBL2" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

/// Expand the memory-model requirements of an atomic cmpxchg or
/// read-modify-write at \p MI: a release before it, and for acquire
/// semantics a wait (on the load part for returning atomics, the store part
/// otherwise) plus a cache invalidate after it.
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

/// Walk every instruction in \p MF, unbundle post-RA bundles, and expand the
/// memory-model requirements of each maybe-atomic instruction using the
/// subtarget-specific cache control. Returns true if the function changed.
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        // Erase the BUNDLE header and continue from the first unbundled
        // instruction.
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}