//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));
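
// Usage sketch (standard cl::opt plumbing, not documented in this file): as a
// hidden option this is reachable from any tool that parses LLVM command-line
// options, e.g.
//   llc -mtriple=amdgcn -amdgcn-skip-cache-invalidations ...
// With the flag set, the insertAcquire() implementations below become no-ops,
// which is presumably only safe for testing or configurations known not to
// need the invalidations.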

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
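
// Illustrative (not from the original source): because these are bitmask
// enums, address-space sets compose and test with the usual operators, e.g.
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE;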

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
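
  // Worked example (illustrative, restating the clamping above): an atomic
  // whose only accessed address space is LDS is clamped from, say, AGENT to
  // WORKGROUP scope, since LDS is only shared within a work-group; likewise a
  // scratch-only access is private to a thread and is clamped to
  // SINGLETHREAD. The std::min calls rely on the SIAtomicScope enumerators
  // being declared in increasing-scope order.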

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }
};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The SIAtomicAddrSpace bit set corresponding to address space
  /// \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};
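
// How these hooks compose (a sketch based on the memory model document
// referenced in the file header, not a definitive lowering): for a global
// atomic load with acquire ordering at agent scope, the legalizer typically
// calls enableLoadCacheBypass() on the load, then insertWait(...,
// Position::AFTER) to wait for the load to complete, then insertAcquire(...,
// Position::AFTER) to invalidate stale cache lines. For release ordering on a
// store, insertRelease(..., Position::BEFORE) makes prior writes visible
// before the store executes.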

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}
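
// Illustrative mapping (restating the code above, with the usual AMDGPU IR
// address-space numbering assumed): an IR pointer in addrspace(3)
// (LOCAL_ADDRESS) maps to SIAtomicAddrSpace::LDS, while a generic
// addrspace(0) (FLAT_ADDRESS) pointer conservatively maps to the FLAT set
// GLOBAL | LDS | SCRATCH, since a flat access may touch any of them. Any
// unrecognized address space falls back to OTHER.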

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
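
// Usage sketch (illustrative, not from the original source): the
// GLC/SLC/DLC/SC helpers in the subclasses all funnel through enableNamedBit.
// For an instruction that carries a cache-policy operand, e.g. a BUFFER_LOAD,
// something like
//   CC->enableLoadCacheBypass(MI, SIAtomicScope::AGENT,
//                             SIAtomicAddrSpace::GLOBAL);
// ORs the GLC bit into its "cpol" immediate; instructions without a cpol
// operand are left untouched and the call reports no change.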

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}
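
// For example (assuming the usual subtarget feature mapping): gfx90a and
// gfx940 take the dedicated paths above regardless of generation; a GFX9 part
// such as gfx906 falls through to SIGfx7CacheControl, since only
// SOUTHERN_ISLANDS uses the GFX6 variant; and anything GFX10 or newer gets
// SIGfx10CacheControl. Note that the feature checks must precede the
// generation checks for the gfx90a/gfx940 variants to be selected.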

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which cannot be
  // sensibly handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
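
// Net effect (a sketch, assuming a global volatile load on GFX6): the load
// gets the glc modifier and is followed by a wait, e.g.
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
// so the load bypasses L1 and completes before any later operation. A
// nontemporal access instead gets "glc slc" with no wait inserted.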

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
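
// Encoding note (restating the code above): a counter participates in the
// wait by being encoded as 0, while a counter to be ignored is encoded as its
// all-ones bit mask. So VMCnt && LGKMCnt produces an
// "s_waitcnt vmcnt(0) lgkmcnt(0)", while only VMCnt produces an
// "s_waitcnt vmcnt(0)"; expcnt is always left unconstrained here.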

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}
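
// On GFX6 a release is therefore just a wait: as a sketch, a global
// store-release at agent scope would be lowered as
//   s_waitcnt vmcnt(0)
//   buffer_store_dword ...
// since the write-through L1 needs no writeback; making prior accesses
// complete is sufficient for them to be visible.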

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
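
// Example lowering (a sketch for a global load-acquire at agent scope on
// GFX7+, consistent with the memory model document referenced above):
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
//   buffer_wbinvl1_vol
// The wait orders the load before the invalidate; the invalidate discards
// potentially stale L1 lines so later loads see other agents' writes.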

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which cannot be
  // sensibly handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor wait for GDS as access is
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
1393 
1394 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1395                                          SIAtomicScope Scope,
1396                                          SIAtomicAddrSpace AddrSpace,
1397                                          bool IsCrossAddrSpaceOrdering,
1398                                          Position Pos) const {
1399   bool Changed = false;
1400 
1401   MachineBasicBlock &MBB = *MI->getParent();
1402   DebugLoc DL = MI->getDebugLoc();
1403 
1404   if (Pos == Position::AFTER)
1405     ++MI;
1406 
1407   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1408     switch (Scope) {
1409     case SIAtomicScope::SYSTEM:
1410       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1411       // hardware does not reorder memory operations by the same wave with
1412       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1413       // to initiate writeback of any dirty cache lines of earlier writes by the
1414       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1415       // writeback has completed.
1416       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1417         // Set SC bits to indicate system scope.
1418         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // This is followed by the same handling as GFX7, which inserts the
      // "S_WAITCNT vmcnt(0)" required by the "BUFFER_WBL2".
1421       Changed = true;
1422       break;
1423     case SIAtomicScope::AGENT:
1424     case SIAtomicScope::WORKGROUP:
1425     case SIAtomicScope::WAVEFRONT:
1426     case SIAtomicScope::SINGLETHREAD:
1427       // Same as GFX7.
1428       break;
1429     default:
1430       llvm_unreachable("Unsupported synchronization scope");
1431     }
1432   }
1433 
1434   if (Pos == Position::AFTER)
1435     --MI;
1436 
1437   Changed |=
1438       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1439                                         IsCrossAddrSpaceOrdering, Pos);
1440 
1441   return Changed;
1442 }
1443 
1444 bool SIGfx940CacheControl::enableLoadCacheBypass(
1445     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1446     SIAtomicAddrSpace AddrSpace) const {
1447   assert(MI->mayLoad() && !MI->mayStore());
1448   bool Changed = false;
1449 
1450   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1451     switch (Scope) {
1452     case SIAtomicScope::SYSTEM:
1453       // Set SC bits to indicate system scope.
1454       Changed |= enableSC0Bit(MI);
1455       Changed |= enableSC1Bit(MI);
1456       break;
1457     case SIAtomicScope::AGENT:
1458       // Set SC bits to indicate agent scope.
1459       Changed |= enableSC1Bit(MI);
1460       break;
1461     case SIAtomicScope::WORKGROUP:
1462       // In threadgroup split mode the waves of a work-group can be executing on
1463       // different CUs. Therefore need to bypass the L1 which is per CU.
1464       // Otherwise in non-threadgroup split mode all waves of a work-group are
1465       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1466       // bits to indicate work-group scope will do this automatically.
1467       Changed |= enableSC0Bit(MI);
1468       break;
1469     case SIAtomicScope::WAVEFRONT:
1470     case SIAtomicScope::SINGLETHREAD:
1471       // Leave SC bits unset to indicate wavefront scope.
1472       break;
1473     default:
1474       llvm_unreachable("Unsupported synchronization scope");
1475     }
1476   }
1477 
1478   /// The scratch address space does not need the global memory caches
1479   /// to be bypassed as all memory operations by the same thread are
1480   /// sequentially consistent, and no other thread can access scratch
1481   /// memory.
1482 
1483   /// Other address spaces do not have a cache.
1484 
1485   return Changed;
1486 }
1487 
1488 bool SIGfx940CacheControl::enableStoreCacheBypass(
1489     const MachineBasicBlock::iterator &MI,
1490     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1491   assert(!MI->mayLoad() && MI->mayStore());
1492   bool Changed = false;
1493 
1494   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1495     switch (Scope) {
1496     case SIAtomicScope::SYSTEM:
1497       // Set SC bits to indicate system scope.
1498       Changed |= enableSC0Bit(MI);
1499       Changed |= enableSC1Bit(MI);
1500       break;
1501     case SIAtomicScope::AGENT:
1502       // Set SC bits to indicate agent scope.
1503       Changed |= enableSC1Bit(MI);
1504       break;
1505     case SIAtomicScope::WORKGROUP:
1506       // Set SC bits to indicate workgroup scope.
1507       Changed |= enableSC0Bit(MI);
1508       break;
1509     case SIAtomicScope::WAVEFRONT:
1510     case SIAtomicScope::SINGLETHREAD:
1511       // Leave SC bits unset to indicate wavefront scope.
1512       break;
1513     default:
1514       llvm_unreachable("Unsupported synchronization scope");
1515     }
1516   }
1517 
1518   /// The scratch address space does not need the global memory caches
1519   /// to be bypassed as all memory operations by the same thread are
1520   /// sequentially consistent, and no other thread can access scratch
1521   /// memory.
1522 
1523   /// Other address spaces do not have a cache.
1524 
1525   return Changed;
1526 }
1527 
1528 bool SIGfx940CacheControl::enableRMWCacheBypass(
1529     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1530     SIAtomicAddrSpace AddrSpace) const {
1531   assert(MI->mayLoad() && MI->mayStore());
1532   bool Changed = false;
1533 
1534   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1535     switch (Scope) {
1536     case SIAtomicScope::SYSTEM:
1537       // Set SC1 bit to indicate system scope.
1538       Changed |= enableSC1Bit(MI);
1539       break;
1540     case SIAtomicScope::AGENT:
1541     case SIAtomicScope::WORKGROUP:
1542     case SIAtomicScope::WAVEFRONT:
1543     case SIAtomicScope::SINGLETHREAD:
1544       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1545       // to indicate system or agent scope. The SC0 bit is used to indicate if
1546       // they are return or no-return. Leave SC1 bit unset to indicate agent
1547       // scope.
1548       break;
1549     default:
1550       llvm_unreachable("Unsupported synchronization scope");
1551     }
1552   }
1553 
1554   return Changed;
1555 }
1556 
1557 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1558     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1559     bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use SC0 to indicate whether the atomic returns a result, so it must
  // not be used for cache control.
1563   assert(MI->mayLoad() ^ MI->mayStore());
1564 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
1569   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1570 
1571   bool Changed = false;
1572 
1573   if (IsVolatile) {
1574     // Set SC bits to indicate system scope.
1575     Changed |= enableSC0Bit(MI);
1576     Changed |= enableSC1Bit(MI);
1577 
1578     // Ensure operation has completed at system scope to cause all volatile
1579     // operations to be visible outside the program in a global order. Do not
1580     // request cross address space as only the global address space can be
1581     // observable outside the program, so no need to cause a waitcnt for LDS
1582     // address space operations.
1583     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1584                           Position::AFTER);
1585 
1586     return Changed;
1587   }
1588 
1589   if (IsNonTemporal) {
1590     Changed |= enableNTBit(MI);
1591     return Changed;
1592   }
1593 
1594   return Changed;
1595 }
1596 
1597 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1598                                          SIAtomicScope Scope,
1599                                          SIAtomicAddrSpace AddrSpace,
1600                                          Position Pos) const {
1601   if (!InsertCacheInv)
1602     return false;
1603 
1604   bool Changed = false;
1605 
1606   MachineBasicBlock &MBB = *MI->getParent();
1607   DebugLoc DL = MI->getDebugLoc();
1608 
1609   if (Pos == Position::AFTER)
1610     ++MI;
1611 
1612   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1613     switch (Scope) {
1614     case SIAtomicScope::SYSTEM:
1615       // Ensures that following loads will not see stale remote VMEM data or
1616       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1617       // CC will never be stale due to the local memory probes.
1618       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1619           // Set SC bits to indicate system scope.
1620           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1621       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1622       // hardware does not reorder memory operations by the same wave with
1623       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1624       // remove any cache lines of earlier writes by the same wave and ensures
1625       // later reads by the same wave will refetch the cache lines.
1626       Changed = true;
1627       break;
1628     case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
1630       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1631       // due to the memory probes.
1632       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1633           // Set SC bits to indicate agent scope.
1634           .addImm(AMDGPU::CPol::SC1);
1635       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1636       // does not reorder memory operations with respect to preceeding buffer
1637       // invalidate. The invalidate is guaranteed to remove any cache lines of
1638       // earlier writes and ensures later writes will refetch the cache lines.
1639       Changed = true;
1640       break;
1641     case SIAtomicScope::WORKGROUP:
1642       // In threadgroup split mode the waves of a work-group can be executing on
1643       // different CUs. Therefore need to invalidate the L1 which is per CU.
1644       // Otherwise in non-threadgroup split mode all waves of a work-group are
1645       // on the same CU, and so the L1 does not need to be invalidated.
1646       if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it when we know we are not in that mode.
1650         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1651             // Set SC bits to indicate work-group scope.
1652             .addImm(AMDGPU::CPol::SC0);
1653         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1654         // does not reorder memory operations with respect to preceeding buffer
1655         // invalidate. The invalidate is guaranteed to remove any cache lines of
1656         // earlier writes and ensures later writes will refetch the cache lines.
1657         Changed = true;
1658       }
1659       break;
1660     case SIAtomicScope::WAVEFRONT:
1661     case SIAtomicScope::SINGLETHREAD:
1662       // Could generate "BUFFER_INV" but it would do nothing as there are no
1663       // caches to invalidate.
1664       break;
1665     default:
1666       llvm_unreachable("Unsupported synchronization scope");
1667     }
1668   }
1669 
1670   /// The scratch address space does not need the global memory cache
1671   /// to be flushed as all memory operations by the same thread are
1672   /// sequentially consistent, and no other thread can access scratch
1673   /// memory.
1674 
1675   /// Other address spaces do not have a cache.
1676 
1677   if (Pos == Position::AFTER)
1678     --MI;
1679 
1680   return Changed;
1681 }
1682 
1683 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1684                                          SIAtomicScope Scope,
1685                                          SIAtomicAddrSpace AddrSpace,
1686                                          bool IsCrossAddrSpaceOrdering,
1687                                          Position Pos) const {
1688   bool Changed = false;
1689 
1690   MachineBasicBlock &MBB = *MI->getParent();
1691   DebugLoc DL = MI->getDebugLoc();
1692 
1693   if (Pos == Position::AFTER)
1694     ++MI;
1695 
1696   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1697     switch (Scope) {
1698     case SIAtomicScope::SYSTEM:
1699       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1700       // hardware does not reorder memory operations by the same wave with
1701       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1702       // to initiate writeback of any dirty cache lines of earlier writes by the
1703       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1704       // writeback has completed.
1705       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1706           // Set SC bits to indicate system scope.
1707           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1708       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1709       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1710       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1711       Changed = true;
1712       break;
1713     case SIAtomicScope::AGENT:
1714       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1715           // Set SC bits to indicate agent scope.
1716           .addImm(AMDGPU::CPol::SC1);
1717 
1718       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1719       // SIAtomicScope::AGENT, the following insertWait will generate the
1720       // required "S_WAITCNT vmcnt(0)".
1721       Changed = true;
1722       break;
1723     case SIAtomicScope::WORKGROUP:
1724     case SIAtomicScope::WAVEFRONT:
1725     case SIAtomicScope::SINGLETHREAD:
1726       // Do not generate "BUFFER_WBL2" as there are no caches it would
1727       // writeback, and would require an otherwise unnecessary
1728       // "S_WAITCNT vmcnt(0)".
1729       break;
1730     default:
1731       llvm_unreachable("Unsupported synchronization scope");
1732     }
1733   }
1734 
1735   if (Pos == Position::AFTER)
1736     --MI;
1737 
  // Insert the "S_WAITCNT vmcnt(0)" required by any "BUFFER_WBL2" above, as
  // well as any other waits that are needed.
1740   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1741                         IsCrossAddrSpaceOrdering, Pos);
1742 
1743   return Changed;
1744 }
1745 
1746 bool SIGfx10CacheControl::enableLoadCacheBypass(
1747     const MachineBasicBlock::iterator &MI,
1748     SIAtomicScope Scope,
1749     SIAtomicAddrSpace AddrSpace) const {
1750   assert(MI->mayLoad() && !MI->mayStore());
1751   bool Changed = false;
1752 
1753   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1754     switch (Scope) {
1755     case SIAtomicScope::SYSTEM:
1756     case SIAtomicScope::AGENT:
1757       // Set the L0 and L1 cache policies to MISS_EVICT.
1758       // Note: there is no L2 cache coherent bypass control at the ISA level.
1759       Changed |= enableGLCBit(MI);
1760       Changed |= enableDLCBit(MI);
1761       break;
1762     case SIAtomicScope::WORKGROUP:
1763       // In WGP mode the waves of a work-group can be executing on either CU of
1764       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1765       // CU mode all waves of a work-group are on the same CU, and so the L0
1766       // does not need to be bypassed.
1767       if (!ST.isCuModeEnabled())
1768         Changed |= enableGLCBit(MI);
1769       break;
1770     case SIAtomicScope::WAVEFRONT:
1771     case SIAtomicScope::SINGLETHREAD:
1772       // No cache to bypass.
1773       break;
1774     default:
1775       llvm_unreachable("Unsupported synchronization scope");
1776     }
1777   }
1778 
1779   /// The scratch address space does not need the global memory caches
1780   /// to be bypassed as all memory operations by the same thread are
1781   /// sequentially consistent, and no other thread can access scratch
1782   /// memory.
1783 
1784   /// Other address spaces do not have a cache.
1785 
1786   return Changed;
1787 }
1788 
1789 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1790     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1791     bool IsVolatile, bool IsNonTemporal) const {
1792 
  // Only handle load and store, not atomic read-modify-write instructions. The
1794   // latter use glc to indicate if the atomic returns a result and so must not
1795   // be used for cache control.
1796   assert(MI->mayLoad() ^ MI->mayStore());
1797 
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot be
  // handled sensibly here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
1802   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1803 
1804   bool Changed = false;
1805 
1806   if (IsVolatile) {
1807     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1808     // and MISS_LRU for store instructions.
1809     // Note: there is no L2 cache coherent bypass control at the ISA level.
1810     if (Op == SIMemOp::LOAD) {
1811       Changed |= enableGLCBit(MI);
1812       Changed |= enableDLCBit(MI);
1813     }
1814 
1815     // Ensure operation has completed at system scope to cause all volatile
1816     // operations to be visible outside the program in a global order. Do not
1817     // request cross address space as only the global address space can be
1818     // observable outside the program, so no need to cause a waitcnt for LDS
1819     // address space operations.
1820     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1821                           Position::AFTER);
1822     return Changed;
1823   }
1824 
1825   if (IsNonTemporal) {
1826     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1827     // and L2 cache policy to STREAM.
1828     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1829     // to MISS_EVICT and the L2 cache policy to STREAM.
1830     if (Op == SIMemOp::STORE)
1831       Changed |= enableGLCBit(MI);
1832     Changed |= enableSLCBit(MI);
1833 
1834     return Changed;
1835   }
1836 
1837   return Changed;
1838 }
1839 
1840 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1841                                      SIAtomicScope Scope,
1842                                      SIAtomicAddrSpace AddrSpace,
1843                                      SIMemOp Op,
1844                                      bool IsCrossAddrSpaceOrdering,
1845                                      Position Pos) const {
1846   bool Changed = false;
1847 
1848   MachineBasicBlock &MBB = *MI->getParent();
1849   DebugLoc DL = MI->getDebugLoc();
1850 
1851   if (Pos == Position::AFTER)
1852     ++MI;
1853 
1854   bool VMCnt = false;
1855   bool VSCnt = false;
1856   bool LGKMCnt = false;
1857 
1858   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1859       SIAtomicAddrSpace::NONE) {
1860     switch (Scope) {
1861     case SIAtomicScope::SYSTEM:
1862     case SIAtomicScope::AGENT:
1863       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1864         VMCnt |= true;
1865       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1866         VSCnt |= true;
1867       break;
1868     case SIAtomicScope::WORKGROUP:
1869       // In WGP mode the waves of a work-group can be executing on either CU of
1870       // the WGP. Therefore need to wait for operations to complete to ensure
1871       // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
1874       if (!ST.isCuModeEnabled()) {
1875         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1876           VMCnt |= true;
1877         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1878           VSCnt |= true;
1879       }
1880       break;
1881     case SIAtomicScope::WAVEFRONT:
1882     case SIAtomicScope::SINGLETHREAD:
1883       // The L0 cache keeps all memory operations in order for
1884       // work-items in the same wavefront.
1885       break;
1886     default:
1887       llvm_unreachable("Unsupported synchronization scope");
1888     }
1889   }
1890 
1891   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1892     switch (Scope) {
1893     case SIAtomicScope::SYSTEM:
1894     case SIAtomicScope::AGENT:
1895     case SIAtomicScope::WORKGROUP:
1896       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1897       // not needed as LDS operations for all waves are executed in a total
1898       // global ordering as observed by all waves. Required if also
1899       // synchronizing with global/GDS memory as LDS operations could be
1900       // reordered with respect to later global/GDS memory operations of the
1901       // same wave.
1902       LGKMCnt |= IsCrossAddrSpaceOrdering;
1903       break;
1904     case SIAtomicScope::WAVEFRONT:
1905     case SIAtomicScope::SINGLETHREAD:
1906       // The LDS keeps all memory operations in order for
1907       // the same wavefront.
1908       break;
1909     default:
1910       llvm_unreachable("Unsupported synchronization scope");
1911     }
1912   }
1913 
1914   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1915     switch (Scope) {
1916     case SIAtomicScope::SYSTEM:
1917     case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1919       // is not needed as GDS operations for all waves are executed in a total
1920       // global ordering as observed by all waves. Required if also
1921       // synchronizing with global/LDS memory as GDS operations could be
1922       // reordered with respect to later global/LDS memory operations of the
1923       // same wave.
1924       LGKMCnt |= IsCrossAddrSpaceOrdering;
1925       break;
1926     case SIAtomicScope::WORKGROUP:
1927     case SIAtomicScope::WAVEFRONT:
1928     case SIAtomicScope::SINGLETHREAD:
1929       // The GDS keeps all memory operations in order for
1930       // the same work-group.
1931       break;
1932     default:
1933       llvm_unreachable("Unsupported synchronization scope");
1934     }
1935   }
1936 
1937   if (VMCnt || LGKMCnt) {
1938     unsigned WaitCntImmediate =
1939       AMDGPU::encodeWaitcnt(IV,
1940                             VMCnt ? 0 : getVmcntBitMask(IV),
1941                             getExpcntBitMask(IV),
1942                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1943     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1944     Changed = true;
1945   }
1946 
1947   if (VSCnt) {
1948     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1949       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1950       .addImm(0);
1951     Changed = true;
1952   }
1953 
1954   if (Pos == Position::AFTER)
1955     --MI;
1956 
1957   return Changed;
1958 }
1959 
1960 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1961                                         SIAtomicScope Scope,
1962                                         SIAtomicAddrSpace AddrSpace,
1963                                         Position Pos) const {
1964   if (!InsertCacheInv)
1965     return false;
1966 
1967   bool Changed = false;
1968 
1969   MachineBasicBlock &MBB = *MI->getParent();
1970   DebugLoc DL = MI->getDebugLoc();
1971 
1972   if (Pos == Position::AFTER)
1973     ++MI;
1974 
1975   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1976     switch (Scope) {
1977     case SIAtomicScope::SYSTEM:
1978     case SIAtomicScope::AGENT:
1979       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1980       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1981       Changed = true;
1982       break;
1983     case SIAtomicScope::WORKGROUP:
1984       // In WGP mode the waves of a work-group can be executing on either CU of
1985       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
1987       // L0 does not need to be invalidated.
1988       if (!ST.isCuModeEnabled()) {
1989         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1990         Changed = true;
1991       }
1992       break;
1993     case SIAtomicScope::WAVEFRONT:
1994     case SIAtomicScope::SINGLETHREAD:
1995       // No cache to invalidate.
1996       break;
1997     default:
1998       llvm_unreachable("Unsupported synchronization scope");
1999     }
2000   }
2001 
2002   /// The scratch address space does not need the global memory cache
2003   /// to be flushed as all memory operations by the same thread are
2004   /// sequentially consistent, and no other thread can access scratch
2005   /// memory.
2006 
2007   /// Other address spaces do not have a cache.
2008 
2009   if (Pos == Position::AFTER)
2010     --MI;
2011 
2012   return Changed;
2013 }
2014 
2015 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2016   if (AtomicPseudoMIs.empty())
2017     return false;
2018 
2019   for (auto &MI : AtomicPseudoMIs)
2020     MI->eraseFromParent();
2021 
2022   AtomicPseudoMIs.clear();
2023   return true;
2024 }
2025 
2026 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2027                                    MachineBasicBlock::iterator &MI) {
2028   assert(MI->mayLoad() && !MI->mayStore());
2029 
2030   bool Changed = false;
2031 
2032   if (MOI.isAtomic()) {
2033     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2034         MOI.getOrdering() == AtomicOrdering::Acquire ||
2035         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2036       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2037                                            MOI.getOrderingAddrSpace());
2038     }
2039 
2040     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2041       Changed |= CC->insertWait(MI, MOI.getScope(),
2042                                 MOI.getOrderingAddrSpace(),
2043                                 SIMemOp::LOAD | SIMemOp::STORE,
2044                                 MOI.getIsCrossAddressSpaceOrdering(),
2045                                 Position::BEFORE);
2046 
2047     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2048         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2049       Changed |= CC->insertWait(MI, MOI.getScope(),
2050                                 MOI.getInstrAddrSpace(),
2051                                 SIMemOp::LOAD,
2052                                 MOI.getIsCrossAddressSpaceOrdering(),
2053                                 Position::AFTER);
2054       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2055                                    MOI.getOrderingAddrSpace(),
2056                                    Position::AFTER);
2057     }
2058 
2059     return Changed;
2060   }
2061 
2062   // Atomic instructions already bypass caches to the scope specified by the
2063   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2064   // need additional treatment.
2065   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2066                                                 SIMemOp::LOAD, MOI.isVolatile(),
2067                                                 MOI.isNonTemporal());
2068   return Changed;
2069 }
2070 
2071 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2072                                     MachineBasicBlock::iterator &MI) {
2073   assert(!MI->mayLoad() && MI->mayStore());
2074 
2075   bool Changed = false;
2076 
2077   if (MOI.isAtomic()) {
2078     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2079         MOI.getOrdering() == AtomicOrdering::Release ||
2080         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2081       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2082                                             MOI.getOrderingAddrSpace());
2083     }
2084 
2085     if (MOI.getOrdering() == AtomicOrdering::Release ||
2086         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2087       Changed |= CC->insertRelease(MI, MOI.getScope(),
2088                                    MOI.getOrderingAddrSpace(),
2089                                    MOI.getIsCrossAddressSpaceOrdering(),
2090                                    Position::BEFORE);
2091 
2092     return Changed;
2093   }
2094 
2095   // Atomic instructions already bypass caches to the scope specified by the
2096   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2097   // need additional treatment.
2098   Changed |= CC->enableVolatileAndOrNonTemporal(
2099       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2100       MOI.isNonTemporal());
2101   return Changed;
2102 }
2103 
2104 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2105                                           MachineBasicBlock::iterator &MI) {
2106   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2107 
2108   AtomicPseudoMIs.push_back(MI);
2109   bool Changed = false;
2110 
2111   if (MOI.isAtomic()) {
2112     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2113         MOI.getOrdering() == AtomicOrdering::Release ||
2114         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2115         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before an S_BARRIER.
2123       Changed |= CC->insertRelease(MI, MOI.getScope(),
2124                                    MOI.getOrderingAddrSpace(),
2125                                    MOI.getIsCrossAddressSpaceOrdering(),
2126                                    Position::BEFORE);
2127 
2128     // TODO: If both release and invalidate are happening they could be combined
2129     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2130     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2131     // track cache invalidate and write back instructions.
2132 
2133     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2134         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2135         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2136       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2137                                    MOI.getOrderingAddrSpace(),
2138                                    Position::BEFORE);
2139 
2140     return Changed;
2141   }
2142 
2143   return Changed;
2144 }
2145 
2146 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2147   MachineBasicBlock::iterator &MI) {
2148   assert(MI->mayLoad() && MI->mayStore());
2149 
2150   bool Changed = false;
2151 
2152   if (MOI.isAtomic()) {
2153     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2154         MOI.getOrdering() == AtomicOrdering::Acquire ||
2155         MOI.getOrdering() == AtomicOrdering::Release ||
2156         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2157         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2158       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2159                                           MOI.getInstrAddrSpace());
2160     }
2161 
2162     if (MOI.getOrdering() == AtomicOrdering::Release ||
2163         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2164         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2165         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2166       Changed |= CC->insertRelease(MI, MOI.getScope(),
2167                                    MOI.getOrderingAddrSpace(),
2168                                    MOI.getIsCrossAddressSpaceOrdering(),
2169                                    Position::BEFORE);
2170 
2171     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2172         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2173         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2174         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2175         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2176       Changed |= CC->insertWait(MI, MOI.getScope(),
2177                                 MOI.getInstrAddrSpace(),
2178                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2179                                                    SIMemOp::STORE,
2180                                 MOI.getIsCrossAddressSpaceOrdering(),
2181                                 Position::AFTER);
2182       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2183                                    MOI.getOrderingAddrSpace(),
2184                                    Position::AFTER);
2185     }
2186 
2187     return Changed;
2188   }
2189 
2190   return Changed;
2191 }
2192 
2193 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2194   bool Changed = false;
2195 
2196   SIMemOpAccess MOA(MF);
2197   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2198 
2199   for (auto &MBB : MF) {
2200     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2201 
2202       // Unbundle instructions after the post-RA scheduler.
2203       if (MI->isBundle() && MI->mayLoadOrStore()) {
2204         MachineBasicBlock::instr_iterator II(MI->getIterator());
2205         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2206              I != E && I->isBundledWithPred(); ++I) {
2207           I->unbundleFromPred();
2208           for (MachineOperand &MO : I->operands())
2209             if (MO.isReg())
2210               MO.setIsInternalRead(false);
2211         }
2212 
2213         MI->eraseFromParent();
2214         MI = II->getIterator();
2215       }
2216 
2217       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2218         continue;
2219 
2220       if (const auto &MOI = MOA.getLoadInfo(MI))
2221         Changed |= expandLoad(MOI.getValue(), MI);
2222       else if (const auto &MOI = MOA.getStoreInfo(MI))
2223         Changed |= expandStore(MOI.getValue(), MI);
2224       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2225         Changed |= expandAtomicFence(MOI.getValue(), MI);
2226       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2227         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
2228     }
2229   }
2230 
2231   Changed |= removeAtomicPseudoMIs();
2232   return Changed;
2233 }
2234 
2235 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2236 
2237 char SIMemoryLegalizer::ID = 0;
2238 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2239 
2240 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2241   return new SIMemoryLegalizer();
2242 }
2243