1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "llvm/ADT/BitmaskEnum.h"
20 #include "llvm/CodeGen/MachineBasicBlock.h"
21 #include "llvm/IR/DiagnosticInfo.h"
22 #include "llvm/Support/AtomicOrdering.h"
23 #include "llvm/Support/TargetParser.h"
24 
25 using namespace llvm;
26 using namespace llvm::AMDGPU;
27 
28 #define DEBUG_TYPE "si-memory-legalizer"
29 #define PASS_NAME "SI Memory Legalizer"
30 
// Debugging aid: when set, the pass still inserts waits/bit changes but
// omits cache invalidating instructions (e.g. for hardware bring-up or
// isolating invalidation-related issues). Hidden from normal -help output.
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
34 
35 namespace {
36 
37 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
38 
/// Memory operation flags. Can be ORed together to describe instructions
/// that both read and write memory (e.g. atomic read-modify-write).
enum class SIMemOp {
  NONE = 0u,      // No memory operation.
  LOAD = 1u << 0, // Operation reads memory.
  STORE = 1u << 1, // Operation writes memory.
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
46 
/// Position to insert a new instruction relative to an existing
/// instruction (used when inserting waits/invalidations around a
/// memory operation).
enum class Position {
  BEFORE,
  AFTER
};
53 
/// The atomic synchronization scopes supported by the AMDGPU target,
/// listed from narrowest (NONE/SINGLETHREAD) to widest (SYSTEM) so that
/// relative ordering of enumerators reflects scope inclusion.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
63 
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operation. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,     // Workgroup-local data share.
  SCRATCH = 1u << 2, // Per-thread private memory.
  GDS = 1u << 3,     // Global data share.
  OTHER = 1u << 4,   // Any address space not covered above.

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
85 
86 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
87 /// \returns Returns true if \p MI is modified, false otherwise.
88 template <uint16_t BitName>
89 bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
90   int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
91   if (BitIdx == -1)
92     return false;
93 
94   MachineOperand &Bit = MI->getOperand(BitIdx);
95   if (Bit.getImm() != 0)
96     return false;
97 
98   Bit.setImm(1);
99   return true;
100 }
101 
102 class SIMemOpInfo final {
103 private:
104 
105   friend class SIMemOpAccess;
106 
107   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
108   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
109   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
110   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
111   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
112   bool IsCrossAddressSpaceOrdering = false;
113   bool IsNonTemporal = false;
114 
115   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
116               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
117               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
118               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
119               bool IsCrossAddressSpaceOrdering = true,
120               AtomicOrdering FailureOrdering =
121                 AtomicOrdering::SequentiallyConsistent,
122               bool IsNonTemporal = false)
123     : Ordering(Ordering), FailureOrdering(FailureOrdering),
124       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
125       InstrAddrSpace(InstrAddrSpace),
126       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
127       IsNonTemporal(IsNonTemporal) {
128     // There is also no cross address space ordering if the ordering
129     // address space is the same as the instruction address space and
130     // only contains a single address space.
131     if ((OrderingAddrSpace == InstrAddrSpace) &&
132         isPowerOf2_32(uint32_t(InstrAddrSpace)))
133       this->IsCrossAddressSpaceOrdering = false;
134   }
135 
136 public:
137   /// \returns Atomic synchronization scope of the machine instruction used to
138   /// create this SIMemOpInfo.
139   SIAtomicScope getScope() const {
140     return Scope;
141   }
142 
143   /// \returns Ordering constraint of the machine instruction used to
144   /// create this SIMemOpInfo.
145   AtomicOrdering getOrdering() const {
146     return Ordering;
147   }
148 
149   /// \returns Failure ordering constraint of the machine instruction used to
150   /// create this SIMemOpInfo.
151   AtomicOrdering getFailureOrdering() const {
152     return FailureOrdering;
153   }
154 
155   /// \returns The address spaces be accessed by the machine
156   /// instruction used to create this SiMemOpInfo.
157   SIAtomicAddrSpace getInstrAddrSpace() const {
158     return InstrAddrSpace;
159   }
160 
161   /// \returns The address spaces that must be ordered by the machine
162   /// instruction used to create this SiMemOpInfo.
163   SIAtomicAddrSpace getOrderingAddrSpace() const {
164     return OrderingAddrSpace;
165   }
166 
167   /// \returns Return true iff memory ordering of operations on
168   /// different address spaces is required.
169   bool getIsCrossAddressSpaceOrdering() const {
170     return IsCrossAddressSpaceOrdering;
171   }
172 
173   /// \returns True if memory access of the machine instruction used to
174   /// create this SIMemOpInfo is non-temporal, false otherwise.
175   bool isNonTemporal() const {
176     return IsNonTemporal;
177   }
178 
179   /// \returns True if ordering constraint of the machine instruction used to
180   /// create this SIMemOpInfo is unordered or higher, false otherwise.
181   bool isAtomic() const {
182     return Ordering != AtomicOrdering::NotAtomic;
183   }
184 
185 };
186 
/// Classifies machine instructions into memory-model-relevant categories
/// (load, store, fence, cmpxchg/rmw) and extracts their SIMemOpInfo.
class SIMemOpAccess final {
private:
  // Module-level info providing the target's named synchronization-scope IDs.
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces. Returns None for unrecognized scopes.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand. Returns None (after diagnosing) for unsupported
  /// scope/address-space combinations.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};
233 
/// Abstract interface for inserting the cache-control and wait
/// instructions a particular subtarget generation needs to implement the
/// AMDGPU memory model. Concrete subclasses exist per hardware generation
/// and are created via the create() factory.
class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  // ISA version, needed to encode waitcnt immediates correctly.
  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope .
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};
307 
308 class SIGfx6CacheControl : public SICacheControl {
309 protected:
310 
311   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
312   /// is modified, false otherwise.
313   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
314     return enableNamedBit<AMDGPU::OpName::glc>(MI);
315   }
316 
317   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
318   /// is modified, false otherwise.
319   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
320     return enableNamedBit<AMDGPU::OpName::slc>(MI);
321   }
322 
323 public:
324 
325   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
326 
327   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
328                              SIAtomicScope Scope,
329                              SIAtomicAddrSpace AddrSpace) const override;
330 
331   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
332 
333   bool insertWait(MachineBasicBlock::iterator &MI,
334                   SIAtomicScope Scope,
335                   SIAtomicAddrSpace AddrSpace,
336                   SIMemOp Op,
337                   bool IsCrossAddrSpaceOrdering,
338                   Position Pos) const override;
339 
340   bool insertAcquire(MachineBasicBlock::iterator &MI,
341                      SIAtomicScope Scope,
342                      SIAtomicAddrSpace AddrSpace,
343                      Position Pos) const override;
344 
345   bool insertRelease(MachineBasicBlock::iterator &MI,
346                      SIAtomicScope Scope,
347                      SIAtomicAddrSpace AddrSpace,
348                      bool IsCrossAddrSpaceOrdering,
349                      Position Pos) const override;
350 };
351 
352 class SIGfx7CacheControl : public SIGfx6CacheControl {
353 public:
354 
355   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
356 
357   bool insertAcquire(MachineBasicBlock::iterator &MI,
358                      SIAtomicScope Scope,
359                      SIAtomicAddrSpace AddrSpace,
360                      Position Pos) const override;
361 
362 };
363 
364 class SIGfx10CacheControl : public SIGfx7CacheControl {
365 protected:
366 
367   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
368   /// is modified, false otherwise.
369   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
370     return enableNamedBit<AMDGPU::OpName::dlc>(MI);
371   }
372 
373 public:
374 
375   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
376 
377   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
378                              SIAtomicScope Scope,
379                              SIAtomicAddrSpace AddrSpace) const override;
380 
381   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
382 
383   bool insertWait(MachineBasicBlock::iterator &MI,
384                   SIAtomicScope Scope,
385                   SIAtomicAddrSpace AddrSpace,
386                   SIMemOp Op,
387                   bool IsCrossAddrSpaceOrdering,
388                   Position Pos) const override;
389 
390   bool insertAcquire(MachineBasicBlock::iterator &MI,
391                      SIAtomicScope Scope,
392                      SIAtomicAddrSpace AddrSpace,
393                      Position Pos) const override;
394 };
395 
/// Machine function pass that walks every memory instruction and inserts
/// the waits and cache-control changes required by the AMDGPU memory model.
class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    // Opcodes with a no-return counterpart are the returning variants.
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only inserts/modifies instructions within blocks; the CFG is untouched.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};
448 
} // end anonymous namespace
450 
451 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
452                                       const char *Msg) const {
453   const Function &Func = MI->getParent()->getParent()->getFunction();
454   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
455   Func.getContext().diagnose(Diag);
456 }
457 
// Map a target sync-scope ID to (SI scope, ordered address spaces,
// cross-address-space ordering). The plain scopes order all atomic address
// spaces and permit cross-address-space ordering; the "one address space"
// variants restrict ordering to the instruction's own address spaces and
// report no cross-address-space ordering. Unknown scopes yield None.
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  return None;
}
503 
504 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
505   if (AS == AMDGPUAS::FLAT_ADDRESS)
506     return SIAtomicAddrSpace::FLAT;
507   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
508     return SIAtomicAddrSpace::GLOBAL;
509   if (AS == AMDGPUAS::LOCAL_ADDRESS)
510     return SIAtomicAddrSpace::LDS;
511   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
512     return SIAtomicAddrSpace::SCRATCH;
513   if (AS == AMDGPUAS::REGION_ADDRESS)
514     return SIAtomicAddrSpace::GDS;
515 
516   return SIAtomicAddrSpace::OTHER;
517 }
518 
519 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
520   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
521 }
522 
// Build a SIMemOpInfo by merging all of \p MI's memory operands: the union
// of their address spaces, the strongest (success and failure) orderings,
// the most inclusive sync scope, and non-temporal only if every operand is
// non-temporal. Diagnoses and returns None for unsupported combinations.
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  // Start from the weakest values and widen as operands are merged.
  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  // Starts true: a single temporal operand disqualifies the instruction.
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      // Scopes of distinct atomic operands must be comparable: one must
      // include the other, and the larger one is kept.
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      // Release orderings are meaningless as a cmpxchg failure ordering.
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    // Atomic ordering must apply to a non-empty subset of the atomic
    // address spaces.
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
}
581 
582 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
583     const MachineBasicBlock::iterator &MI) const {
584   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
585 
586   if (!(MI->mayLoad() && !MI->mayStore()))
587     return None;
588 
589   // Be conservative if there are no memory operands.
590   if (MI->getNumMemOperands() == 0)
591     return SIMemOpInfo();
592 
593   return constructFromMIWithMMO(MI);
594 }
595 
596 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
597     const MachineBasicBlock::iterator &MI) const {
598   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
599 
600   if (!(!MI->mayLoad() && MI->mayStore()))
601     return None;
602 
603   // Be conservative if there are no memory operands.
604   if (MI->getNumMemOperands() == 0)
605     return SIMemOpInfo();
606 
607   return constructFromMIWithMMO(MI);
608 }
609 
// Extract memory-model info from an ATOMIC_FENCE pseudo: operand 0 holds
// the ordering, operand 1 the sync-scope ID. A fence has no memory
// operands, so the instruction address space is conservatively ATOMIC.
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  // The ordered address spaces must be a non-empty subset of ATOMIC.
  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering);
}
642 
643 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
644     const MachineBasicBlock::iterator &MI) const {
645   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
646 
647   if (!(MI->mayLoad() && MI->mayStore()))
648     return None;
649 
650   // Be conservative if there are no memory operands.
651   if (MI->getNumMemOperands() == 0)
652     return SIMemOpInfo();
653 
654   return constructFromMIWithMMO(MI);
655 }
656 
657 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
658   TII = ST.getInstrInfo();
659   IV = getIsaVersion(ST.getCPU());
660   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
661 }
662 
663 /* static */
664 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
665   GCNSubtarget::Generation Generation = ST.getGeneration();
666   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
667     return std::make_unique<SIGfx6CacheControl>(ST);
668   if (Generation < AMDGPUSubtarget::GFX10)
669     return std::make_unique<SIGfx7CacheControl>(ST);
670   return std::make_unique<SIGfx10CacheControl>(ST);
671 }
672 
// For agent/system-scope global-memory loads, set the GLC bit so the load
// bypasses the cache; narrower scopes need no change on this generation.
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
705 
706 bool SIGfx6CacheControl::enableNonTemporal(
707     const MachineBasicBlock::iterator &MI) const {
708   assert(MI->mayLoad() ^ MI->mayStore());
709   bool Changed = false;
710 
711   /// TODO: Do not enableGLCBit if rmw atomic.
712   Changed |= enableGLCBit(MI);
713   Changed |= enableSLCBit(MI);
714 
715   return Changed;
716 }
717 
// Insert an S_WAITCNT before/after \p MI to drain outstanding memory
// operations: vmcnt for global memory at agent/system scope, lgkmcnt for
// LDS/GDS when cross-address-space ordering is required. The \p Op
// parameter is not consulted by this (gfx6) implementation.
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Advance past MI so BuildMI inserts after it; restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavesfront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // Encode 0 for each counter being waited on, and the counter's maximum
    // (no-wait) value for the rest; expcnt is never waited on here.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  // Restore MI to the instruction the caller passed in.
  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
813 
814 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
815                                        SIAtomicScope Scope,
816                                        SIAtomicAddrSpace AddrSpace,
817                                        Position Pos) const {
818   if (!InsertCacheInv)
819     return false;
820 
821   bool Changed = false;
822 
823   MachineBasicBlock &MBB = *MI->getParent();
824   DebugLoc DL = MI->getDebugLoc();
825 
826   if (Pos == Position::AFTER)
827     ++MI;
828 
829   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
830     switch (Scope) {
831     case SIAtomicScope::SYSTEM:
832     case SIAtomicScope::AGENT:
833       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
834       Changed = true;
835       break;
836     case SIAtomicScope::WORKGROUP:
837     case SIAtomicScope::WAVEFRONT:
838     case SIAtomicScope::SINGLETHREAD:
839       // No cache to invalidate.
840       break;
841     default:
842       llvm_unreachable("Unsupported synchronization scope");
843     }
844   }
845 
846   /// The scratch address space does not need the global memory cache
847   /// to be flushed as all memory operations by the same thread are
848   /// sequentially consistent, and no other thread can access scratch
849   /// memory.
850 
851   /// Other address spaces do not have a cache.
852 
853   if (Pos == Position::AFTER)
854     --MI;
855 
856   return Changed;
857 }
858 
859 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
860                                        SIAtomicScope Scope,
861                                        SIAtomicAddrSpace AddrSpace,
862                                        bool IsCrossAddrSpaceOrdering,
863                                        Position Pos) const {
864     return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
865                       IsCrossAddrSpaceOrdering, Pos);
866 }
867 
868 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
869                                        SIAtomicScope Scope,
870                                        SIAtomicAddrSpace AddrSpace,
871                                        Position Pos) const {
872   if (!InsertCacheInv)
873     return false;
874 
875   bool Changed = false;
876 
877   MachineBasicBlock &MBB = *MI->getParent();
878   DebugLoc DL = MI->getDebugLoc();
879 
880   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
881 
882   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
883                                     ? AMDGPU::BUFFER_WBINVL1
884                                     : AMDGPU::BUFFER_WBINVL1_VOL;
885 
886   if (Pos == Position::AFTER)
887     ++MI;
888 
889   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
890     switch (Scope) {
891     case SIAtomicScope::SYSTEM:
892     case SIAtomicScope::AGENT:
893       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
894       Changed = true;
895       break;
896     case SIAtomicScope::WORKGROUP:
897     case SIAtomicScope::WAVEFRONT:
898     case SIAtomicScope::SINGLETHREAD:
899       // No cache to invalidate.
900       break;
901     default:
902       llvm_unreachable("Unsupported synchronization scope");
903     }
904   }
905 
906   /// The scratch address space does not need the global memory cache
907   /// to be flushed as all memory operations by the same thread are
908   /// sequentially consistent, and no other thread can access scratch
909   /// memory.
910 
911   /// Other address spaces do not have a cache.
912 
913   if (Pos == Position::AFTER)
914     --MI;
915 
916   return Changed;
917 }
918 
919 bool SIGfx10CacheControl::enableLoadCacheBypass(
920     const MachineBasicBlock::iterator &MI,
921     SIAtomicScope Scope,
922     SIAtomicAddrSpace AddrSpace) const {
923   assert(MI->mayLoad() && !MI->mayStore());
924   bool Changed = false;
925 
926   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
927     /// TODO Do not set glc for rmw atomic operations as they
928     /// implicitly bypass the L0/L1 caches.
929 
930     switch (Scope) {
931     case SIAtomicScope::SYSTEM:
932     case SIAtomicScope::AGENT:
933       Changed |= enableGLCBit(MI);
934       Changed |= enableDLCBit(MI);
935       break;
936     case SIAtomicScope::WORKGROUP:
937       // In WGP mode the waves of a work-group can be executing on either CU of
938       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
939       // CU mode all waves of a work-group are on the same CU, and so the L0
940       // does not need to be bypassed.
941       if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
942       break;
943     case SIAtomicScope::WAVEFRONT:
944     case SIAtomicScope::SINGLETHREAD:
945       // No cache to bypass.
946       break;
947     default:
948       llvm_unreachable("Unsupported synchronization scope");
949     }
950   }
951 
952   /// The scratch address space does not need the global memory caches
953   /// to be bypassed as all memory operations by the same thread are
954   /// sequentially consistent, and no other thread can access scratch
955   /// memory.
956 
957   /// Other address spaces do not have a cache.
958 
959   return Changed;
960 }
961 
962 bool SIGfx10CacheControl::enableNonTemporal(
963     const MachineBasicBlock::iterator &MI) const {
964   assert(MI->mayLoad() ^ MI->mayStore());
965   bool Changed = false;
966 
967   Changed |= enableSLCBit(MI);
968   /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
969 
970   return Changed;
971 }
972 
/// Insert waits before or after \p MI (per \p Pos) so that memory operations
/// of kind \p Op to \p AddrSpace are complete as required at scope \p Scope.
/// On GFX10, loads are tracked by the vmcnt counter and stores by the
/// separate vscnt counter, so an S_WAITCNT and/or an S_WAITCNT_VSCNT may be
/// needed. Returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI when inserting after it; undone before
  // returning so the caller's iterator still refers to MI.
  if (Pos == Position::AFTER)
    ++MI;

  // Which counters must be waited to zero.
  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // Encode a single S_WAITCNT covering vmcnt and/or lgkmcnt; counters not
  // being waited on are set to their maximum (no-wait) value.
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  // Stores need a separate S_WAITCNT_VSCNT; SGPR_NULL with immediate 0
  // waits for all outstanding stores.
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1091 
1092 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1093                                         SIAtomicScope Scope,
1094                                         SIAtomicAddrSpace AddrSpace,
1095                                         Position Pos) const {
1096   if (!InsertCacheInv)
1097     return false;
1098 
1099   bool Changed = false;
1100 
1101   MachineBasicBlock &MBB = *MI->getParent();
1102   DebugLoc DL = MI->getDebugLoc();
1103 
1104   if (Pos == Position::AFTER)
1105     ++MI;
1106 
1107   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1108     switch (Scope) {
1109     case SIAtomicScope::SYSTEM:
1110     case SIAtomicScope::AGENT:
1111       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1112       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1113       Changed = true;
1114       break;
1115     case SIAtomicScope::WORKGROUP:
1116       // In WGP mode the waves of a work-group can be executing on either CU of
1117       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1118       // in CU mode and all waves of a work-group are on the same CU, and so the
1119       // L0 does not need to be invalidated.
1120       if (!ST.isCuModeEnabled()) {
1121         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1122         Changed = true;
1123       }
1124       break;
1125     case SIAtomicScope::WAVEFRONT:
1126     case SIAtomicScope::SINGLETHREAD:
1127       // No cache to invalidate.
1128       break;
1129     default:
1130       llvm_unreachable("Unsupported synchronization scope");
1131     }
1132   }
1133 
1134   /// The scratch address space does not need the global memory cache
1135   /// to be flushed as all memory operations by the same thread are
1136   /// sequentially consistent, and no other thread can access scratch
1137   /// memory.
1138 
1139   /// Other address spaces do not have a cache.
1140 
1141   if (Pos == Position::AFTER)
1142     --MI;
1143 
1144   return Changed;
1145 }
1146 
1147 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1148   if (AtomicPseudoMIs.empty())
1149     return false;
1150 
1151   for (auto &MI : AtomicPseudoMIs)
1152     MI->eraseFromParent();
1153 
1154   AtomicPseudoMIs.clear();
1155   return true;
1156 }
1157 
/// Expand load \p MI according to its memory-operand info \p MOI: enable
/// cache bypass and insert the waits/invalidates required by its atomic
/// ordering, or mark it nontemporal. Returns true if anything changed.
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Monotonic and stronger loads bypass caches that are not coherent at
    // the requested scope.
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    // seq_cst additionally orders against all prior memory operations, so
    // wait for them before the load executes.
    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire semantics: wait for the load itself to complete, then
    // invalidate caches so subsequent accesses see up-to-date data.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
1202 
1203 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1204                                     MachineBasicBlock::iterator &MI) {
1205   assert(!MI->mayLoad() && MI->mayStore());
1206 
1207   bool Changed = false;
1208 
1209   if (MOI.isAtomic()) {
1210     if (MOI.getOrdering() == AtomicOrdering::Release ||
1211         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1212       Changed |= CC->insertRelease(MI, MOI.getScope(),
1213                                    MOI.getOrderingAddrSpace(),
1214                                    MOI.getIsCrossAddressSpaceOrdering(),
1215                                    Position::BEFORE);
1216 
1217     return Changed;
1218   }
1219 
1220   // Atomic instructions do not have the nontemporal attribute.
1221   if (MOI.isNonTemporal()) {
1222     Changed |= CC->enableNonTemporal(MI);
1223     return Changed;
1224   }
1225 
1226   return Changed;
1227 }
1228 
/// Expand the ATOMIC_FENCE pseudo \p MI into the release waits and acquire
/// invalidates its ordering requires. The pseudo itself is queued for
/// deletion by removeAtomicPseudoMIs(). Returns true if anything changed.
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  // The fence produces no machine code itself; record it so it can be
  // erased after its effects have been materialized.
  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    // Release-or-stronger fences require waiting for prior operations.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the proceeding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBL2" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    // Acquire-or-stronger fences also invalidate caches so subsequent
    // accesses observe other agents' writes.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
1270 
/// Expand an atomic cmpxchg or read-modify-write \p MI according to \p MOI:
/// insert release waits before and acquire waits/invalidates after, as its
/// success and failure orderings require. Returns true if anything changed.
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    // Release semantics (including a seq_cst failure ordering): prior
    // operations must complete before the atomic executes.
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // Acquire semantics (on either success or failure ordering): wait for
    // the atomic to complete, then invalidate caches.
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      // Returning atomics complete via their load result (vmcnt); non-
      // returning ones are tracked as stores.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
1308 
/// Walk every instruction in \p MF, unbundle post-RA bundles, and expand
/// each potentially-atomic instruction per the memory model. Returns true
/// if the function was modified.
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  // Select the cache-control implementation for this subtarget.
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle()) {
        // ++II leaves II at the first instruction inside the bundle, which
        // becomes the loop's resume point once the BUNDLE header is erased.
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          // Operands of bundled instructions are marked as internal reads;
          // clear that now that they are top-level instructions.
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      // Skip instructions that can never be atomic.
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      // Classify the instruction and expand it accordingly. Exactly one of
      // these queries yields info for a given instruction.
      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  // Delete the pseudo instructions (e.g. ATOMIC_FENCE) queued above.
  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
1350 
// Register the pass with the LLVM pass registry.
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

// Pass identifier; its address uniquely identifies the pass.
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

/// Create an instance of the SI memory legalizer pass.
FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
1359