//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
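
// Illustrative sketch (not part of the pass logic): because these are bitmask
// enums, membership tests throughout this file are written as intersections
// against NONE, e.g.:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesGlobal =
//       (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE; // true
//   bool OnlyFlat =
//       (AS & ~SIAtomicAddrSpace::FLAT) == SIAtomicAddrSpace::NONE;  // true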

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
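
  // Worked example (illustrative): for an LDS-only atomic (InstrAddrSpace ==
  // LDS) requested at SIAtomicScope::SYSTEM, only the SCRATCH/LDS branch
  // above matches, so Scope is clamped to WORKGROUP -- no wave outside the
  // work-group can observe LDS, so a wider scope adds nothing. Similarly, a
  // scratch-only access is clamped to SINGLETHREAD, and a GLOBAL-only atomic
  // with OrderingAddrSpace == InstrAddrSpace == GLOBAL (a single bit) has
  // IsCrossAddressSpaceOrdering forced to false.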

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces accessed by address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};
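
// A sketch (illustrative only) of how SIMemoryLegalizer typically drives this
// interface for a sequentially consistent atomic load; the authoritative
// sequencing lives in the SIMemoryLegalizer::expand* routines:
//
//   CC->insertWait(MI, Scope, AS, SIMemOp::LOAD | SIMemOp::STORE,
//                  IsCrossAS, Position::BEFORE);       // release ordering
//   CC->enableLoadCacheBypass(MI, Scope, AS);          // make load visible
//   CC->insertWait(MI, Scope, AS, SIMemOp::LOAD,
//                  IsCrossAS, Position::AFTER);        // wait for the load
//   CC->insertAcquire(MI, Scope, AS, Position::AFTER); // invalidate caches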

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}
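
// For example (illustrative, assuming the "one-as" sync-scope names
// registered by AMDGPUMachineModuleInfo):
//   syncscope("agent")            -> (AGENT, ATOMIC, cross-AS ordering = true)
//   syncscope("workgroup-one-as") -> (WORKGROUP, ATOMIC & InstrAddrSpace,
//                                     false)
// i.e. the "one-as" variants only order the instruction's own address spaces.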

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}
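
// Illustrative merge (a sketch): for an instruction carrying two memory
// operands, one volatile global access and one nontemporal LDS access, the
// loop above yields IsVolatile = true (ORed across MMOs), IsNonTemporal =
// false (ANDed across MMOs), and InstrAddrSpace = GLOBAL | LDS.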

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
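
// For example (illustrative): for a load whose cpol operand is currently 0,
// enableNamedBit(MI, AMDGPU::CPol::GLC) rewrites the operand to CPol::GLC and
// returns true; for an instruction with no cpol operand (e.g. a DS
// instruction) it returns false and leaves the instruction untouched. Because
// the bit is ORed in, previously set cache-policy bits are preserved.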

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}
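
// Note (an assumption about subtarget feature implications): the gfx940 check
// above must precede the gfx90a check because a gfx940 subtarget also reports
// hasGFX90AInsts(), and both precede the generation-based dispatch since those
// subtargets fall in the pre-GFX10 generations.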

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
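
// Net effect (illustrative): a volatile global load ends up with GLC=1
// followed by an "S_WAITCNT vmcnt(0)", a volatile store gets only the
// trailing wait, and a nontemporal load or store gets GLC=1 SLC=1 with no
// wait inserted.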

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
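
// For example (illustrative): a system-scope wait on the GLOBAL address space
// sets only VMCnt, so encodeWaitcnt() produces an immediate with the vmcnt
// field at 0 and expcnt/lgkmcnt left at their bitmask maxima, which the
// assembler prints as "s_waitcnt vmcnt(0)".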

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}
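
// For example (illustrative): with threadgroup split mode enabled, a
// workgroup-scope wait covering GLOBAL memory is promoted to agent scope (so
// it behaves like the GFX7 agent-scope wait), and any LDS bits are dropped
// because LDS cannot be allocated in that mode.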

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
1392 
1393 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1394                                          SIAtomicScope Scope,
1395                                          SIAtomicAddrSpace AddrSpace,
1396                                          bool IsCrossAddrSpaceOrdering,
1397                                          Position Pos) const {
1398   bool Changed = false;
1399 
1400   MachineBasicBlock &MBB = *MI->getParent();
1401   DebugLoc DL = MI->getDebugLoc();
1402 
1403   if (Pos == Position::AFTER)
1404     ++MI;
1405 
1406   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1407     switch (Scope) {
1408     case SIAtomicScope::SYSTEM:
1409       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1410       // hardware does not reorder memory operations by the same wave with
1411       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1412       // to initiate writeback of any dirty cache lines of earlier writes by the
1413       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1414       // writeback has completed.
1415       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1416         // Set SC bits to indicate system scope.
1417         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1418       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1419       // vmcnt(0)" needed by the "BUFFER_WBL2".
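      // Illustrative overall shape for a system-scope release store (an
      // exposition aid; operands and cache-policy syntax elided):
      //   buffer_wbl2 ...
      //   s_waitcnt vmcnt(0)   ; from the GFX7 handling invoked below
      //   <release store>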
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx940CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;
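  // For reference, the SC[1:0] cache-policy bits set below encode the bypass
  // scope as follows (a summary of the cases, not additional policy):
  //   SC1 SC0   scope
  //    0   0    wavefront
  //    0   1    workgroup
  //    1   0    agent
  //    1   1    system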
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
      // bits to indicate work-group scope will do this automatically.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC bits to indicate system scope.
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
      // Set SC bits to indicate agent scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // Set SC bits to indicate workgroup scope.
      Changed |= enableSC0Bit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Leave SC bits unset to indicate wavefront scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx940CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Set SC1 bit to indicate system scope.
      Changed |= enableSC1Bit(MI);
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
      // to indicate system or agent scope. The SC0 bit is used to indicate if
      // they are return or no-return. Leave SC1 bit unset to indicate agent
      // scope.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set SC bits to indicate system scope.
    Changed |= enableSC0Bit(MI);
    Changed |= enableSC1Bit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
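    // Illustratively (an exposition aid; operands elided), a volatile global
    // load is expected to become:
    //   global_load_dword ... sc0 sc1
    //   s_waitcnt vmcnt(0)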
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    Changed |= enableNTBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);
      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
      // does not reorder memory operations with respect to a preceding buffer
      // invalidate. The invalidate is guaranteed to remove any cache lines of
      // earlier writes and ensures later reads will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it in that case when we know we are not in that mode.
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
            // Set SC bits to indicate work-group scope.
            .addImm(AMDGPU::CPol::SC0);
        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
        // does not reorder memory operations with respect to a preceding buffer
        // invalidate. The invalidate is guaranteed to remove any cache lines of
        // earlier writes and ensures later reads will refetch the cache lines.
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Could generate "BUFFER_INV" but it would do nothing as there are no
      // caches to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }
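
  // Summarizing the cases above (illustrative assembly):
  //   system scope:                buffer_inv sc0 sc1
  //   agent scope:                 buffer_inv sc1
  //   workgroup scope (tgsplit):   buffer_inv sc0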

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate system scope.
          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::SYSTEM, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
          // Set SC bits to indicate agent scope.
          .addImm(AMDGPU::CPol::SC1);

      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
      // SIAtomicScope::AGENT, the following insertWait will generate the
      // required "S_WAITCNT vmcnt(0)".
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Do not generate "BUFFER_WBL2" as there are no caches it would
      // write back, and it would require an otherwise unnecessary
      // "S_WAITCNT vmcnt(0)".
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" is generated,
  // as well as any other S_WAITCNT that is needed.
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);
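
  // The combined effect for an agent-scope release, illustratively (operands
  // elided):
  //   buffer_wbl2 sc1
  //   s_waitcnt vmcnt(0)   ; from the insertWait above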

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;
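  // For reference (summarizing the comments below): on GFX10 the GLC bit
  // selects an L0 MISS_EVICT policy and the DLC bit an L1 MISS_EVICT policy
  // for loads; there is no L2 cache coherent bypass control at the ISA level.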

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
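    // Illustrative result for a nontemporal global store (an exposition aid;
    // operands elided):
    //   global_store_dword ... glc slc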
    if (Op == SIMemOp::STORE)
      Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);

    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }
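
  // GFX10 tracks stores with a separate VSCNT counter, so, illustratively, an
  // agent-scope wait ordering loads, stores, and LDS is expected to emit:
  //   s_waitcnt vmcnt(0) lgkmcnt(0)
  //   s_waitcnt_vscnt null, 0x0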

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }
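
  // Summarizing the cases above (illustrative): system and agent acquires emit
  //   buffer_gl0_inv
  //   buffer_gl1_inv
  // while a workgroup acquire in WGP mode emits only "buffer_gl0_inv".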

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }
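
    // Taken together, a seq_cst atomic load is expected to expand to the
    // following shape (illustrative; the exact instructions are target
    // dependent):
    //   s_waitcnt ...                      ; insertWait BEFORE
    //   <load with cache-bypass bits set>
    //   s_waitcnt ...                      ; insertWait AFTER
    //   <cache invalidate>                 ; insertAcquire AFTER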

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBINV*" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);
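
    // For example (illustrative), an acq_rel fence expands to the release
    // sequence (any writeback plus waits) followed by the acquire cache
    // invalidate; the ATOMIC_FENCE pseudo itself is erased later by
    // removeAtomicPseudoMIs().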

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}