//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the AMDGPU memory model. More information
/// can be found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets the named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           !isStrongerThan(FailureOrdering, Ordering));

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
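    // Scratch is private to a work-item, LDS is shared only within a
    // work-group, and GDS is shared only within an agent, so accesses limited
    // to those address spaces can never require a wider scope.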
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
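    // A "no-return" opcode mapping exists only for the form of an atomic that
    // does return a result, so a successful lookup means MI is the returning
    // variant.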
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
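  // Merge the properties of all memory operands: the access is nontemporal
  // only if every operand is nontemporal, volatile if any operand is volatile,
  // and the ordering, failure ordering and synchronization scope are widened
  // to the strongest/most inclusive values seen.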
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use the glc bit to indicate if the atomic returns a result, so that
  // bit must not be reused for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
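    // Passing 0 requests a wait until the corresponding counter drains to
    // zero, while passing the counter's full bit mask leaves it unconstrained;
    // expcnt is never waited on here.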
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
    return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                      IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use the glc bit to indicate if the atomic returns a result, so that
  // bit must not be reused for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // and share the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
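    // gfx10 tracks outstanding VMEM store operations with a separate vscnt
    // counter, so an explicit S_WAITCNT_VSCNT of zero is needed to wait for
    // them; SGPR_NULL provides a zero source register operand.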
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP, so the L0, which is per CU, must be invalidated. Otherwise,
      // in CU mode, all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
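  // The fence pseudo instruction is only a placeholder; record it so
  // removeAtomicPseudoMIs() can erase it once the required waits and cache
  // operations have been inserted around it.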
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBL2" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}