//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
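  /// Used to conservatively model an instruction whose accessed address
  /// spaces are unknown, e.g. when it carries no memory operands.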
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
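  /// For example, on GFX6 this reduces to inserting an appropriate S_WAITCNT
  /// at position \p Pos.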
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
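  // The instruction is nontemporal only if every memory operand is
  // nontemporal, but volatile if any memory operand is volatile, hence the
  // differing initial values accumulated over the loop below.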
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
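  // The default-constructed SIMemOpInfo is the most conservative choice:
  // sequentially consistent at system scope over all address spaces.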
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics; they also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code or as part of optimizing the
    // SIInsertWaitcnts pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}