//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {
    // There is no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }
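
  // Illustrative example (not part of the original source): with a
  // one-address-space sync scope where the instruction only accesses LDS,
  // e.g.
  //   SIMemOpInfo Info(AtomicOrdering::Acquire, SIAtomicScope::WORKGROUP,
  //                    /*OrderingAddrSpace=*/SIAtomicAddrSpace::LDS,
  //                    /*InstrAddrSpace=*/SIAtomicAddrSpace::LDS);
  // the ordering address space equals the single instruction address space,
  // so the constructor clears IsCrossAddressSpaceOrdering.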

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}
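
// Illustrative example (not part of the original source): an LLVM IR atomic
// on a global pointer with syncscope("agent-one-as") maps through this
// function to
//   (SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & GLOBAL, false)
// whereas the plain syncscope("agent") yields
//   (SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true).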

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
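
// Illustrative example (not part of the original source, encoding
// approximate): an agent-scope atomic load lowered as
//   buffer_load_dword v0, off, s[0:3], 0
// has its glc bit set by the above, giving
//   buffer_load_dword v0, off, s[0:3], 0 glc
// so the load bypasses the per-CU L1 and is serviced by the shared L2.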

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
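
// Illustrative example (not part of the original source, encoding
// approximate): a nontemporal store
//   buffer_store_dword v0, off, s[0:3], 0
// becomes
//   buffer_store_dword v0, off, s[0:3], 0 glc slc
// requesting the L1 MISS_EVICT and L2 STREAM behavior described above.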

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
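
// Illustrative example (not part of the original source): for a system-scope
// release over GLOBAL | LDS with cross-address-space ordering, both VMCnt and
// LGKMCnt are zeroed above, and the emitted instruction is
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// while expcnt keeps its "no wait" bitmask value from getExpcntBitMask(IV).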

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
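
// Illustrative example (not part of the original source): for an agent-scope
// acquire on gfx6, the above emits
//   buffer_wbinvl1
// after the atomic (when called with Position::AFTER), invalidating the L1 so
// that subsequent loads observe memory written by other CUs.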

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
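
// Illustrative example (not part of the original source, encoding
// approximate): on gfx10, an agent-scope atomic load such as
//   global_load_dword v1, v[0:1], off
// becomes
//   global_load_dword v1, v[0:1], off glc dlc
// bypassing both the per-CU L0 and the L1.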

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // and share the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
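
// Illustrative example (not part of the original source): gfx10 counts store
// completion separately from loads, so an agent-scope release covering a
// store over the global address space additionally emits
//   s_waitcnt_vscnt null, 0
// alongside any s_waitcnt used for loads and LDS/GDS operations.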

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
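
// Illustrative example (not part of the original source, sequence
// approximate): an agent-scope acquire load
//   %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire
// expands on gfx7-style targets roughly to
//   flat_load_dword v2, v[0:1] glc
//   s_waitcnt vmcnt(0)
//   buffer_wbinvl1_vol
// i.e. cache bypass on the load, a wait after it, then an L1 invalidate.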

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}
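
// Illustrative example (not part of the original source, sequence
// approximate): an agent-scope release store
//   store atomic i32 %v, i32 addrspace(1)* %p syncscope("agent") release
// expands roughly to
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   flat_store_dword v[0:1], v2
// where the wait makes prior memory operations visible before the store.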

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBL2" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
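
// Illustrative example (not part of the original source, sequence
// approximate): fence syncscope("agent") acquire expands roughly to
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   buffer_wbinvl1_vol
// with the ATOMIC_FENCE pseudo itself deleted later by
// removeAtomicPseudoMIs().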

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}