1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34     cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42   NONE = 0u,
43   LOAD = 1u << 0,
44   STORE = 1u << 1,
45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51   BEFORE,
52   AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57   NONE,
58   SINGLETHREAD,
59   WAVEFRONT,
60   WORKGROUP,
61   AGENT,
62   SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operation. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68   NONE = 0u,
69   GLOBAL = 1u << 0,
70   LDS = 1u << 1,
71   SCRATCH = 1u << 2,
72   GDS = 1u << 3,
73   OTHER = 1u << 4,
74 
75   /// The address spaces that can be accessed by a FLAT instruction.
76   FLAT = GLOBAL | LDS | SCRATCH,
77 
78   /// The address spaces that support atomic instructions.
79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81   /// All address spaces.
82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
86 
87 class SIMemOpInfo final {
88 private:
89 
90   friend class SIMemOpAccess;
91 
92   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97   bool IsCrossAddressSpaceOrdering = false;
98   bool IsVolatile = false;
99   bool IsNonTemporal = false;
100 
101   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105               bool IsCrossAddressSpaceOrdering = true,
106               AtomicOrdering FailureOrdering =
107                 AtomicOrdering::SequentiallyConsistent,
108               bool IsVolatile = false,
109               bool IsNonTemporal = false)
110     : Ordering(Ordering), FailureOrdering(FailureOrdering),
111       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112       InstrAddrSpace(InstrAddrSpace),
113       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114       IsVolatile(IsVolatile),
115       IsNonTemporal(IsNonTemporal) {
116 
117     if (Ordering == AtomicOrdering::NotAtomic) {
118       assert(Scope == SIAtomicScope::NONE &&
119              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120              !IsCrossAddressSpaceOrdering &&
121              FailureOrdering == AtomicOrdering::NotAtomic);
122       return;
123     }
124 
125     assert(Scope != SIAtomicScope::NONE &&
126            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127                SIAtomicAddrSpace::NONE &&
128            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129                SIAtomicAddrSpace::NONE);
130 
131     // There is also no cross address space ordering if the ordering
132     // address space is the same as the instruction address space and
133     // only contains a single address space.
134     if ((OrderingAddrSpace == InstrAddrSpace) &&
135         isPowerOf2_32(uint32_t(InstrAddrSpace)))
136       this->IsCrossAddressSpaceOrdering = false;
137 
138     // Limit the scope to the maximum supported by the instruction's address
139     // spaces.
140     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
141         SIAtomicAddrSpace::NONE) {
142       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
143     } else if ((InstrAddrSpace &
144                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
145                SIAtomicAddrSpace::NONE) {
146       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
147     } else if ((InstrAddrSpace &
148                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
149                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
150       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
151     }
152   }
153 
154 public:
155   /// \returns Atomic synchronization scope of the machine instruction used to
156   /// create this SIMemOpInfo.
157   SIAtomicScope getScope() const {
158     return Scope;
159   }
160 
161   /// \returns Ordering constraint of the machine instruction used to
162   /// create this SIMemOpInfo.
163   AtomicOrdering getOrdering() const {
164     return Ordering;
165   }
166 
167   /// \returns Failure ordering constraint of the machine instruction used to
168   /// create this SIMemOpInfo.
169   AtomicOrdering getFailureOrdering() const {
170     return FailureOrdering;
171   }
172 
173   /// \returns The address spaces be accessed by the machine
174   /// instruction used to create this SiMemOpInfo.
175   SIAtomicAddrSpace getInstrAddrSpace() const {
176     return InstrAddrSpace;
177   }
178 
179   /// \returns The address spaces that must be ordered by the machine
180   /// instruction used to create this SiMemOpInfo.
181   SIAtomicAddrSpace getOrderingAddrSpace() const {
182     return OrderingAddrSpace;
183   }
184 
185   /// \returns Return true iff memory ordering of operations on
186   /// different address spaces is required.
187   bool getIsCrossAddressSpaceOrdering() const {
188     return IsCrossAddressSpaceOrdering;
189   }
190 
191   /// \returns True if memory access of the machine instruction used to
192   /// create this SIMemOpInfo is volatile, false otherwise.
193   bool isVolatile() const {
194     return IsVolatile;
195   }
196 
197   /// \returns True if memory access of the machine instruction used to
198   /// create this SIMemOpInfo is nontemporal, false otherwise.
199   bool isNonTemporal() const {
200     return IsNonTemporal;
201   }
202 
203   /// \returns True if ordering constraint of the machine instruction used to
204   /// create this SIMemOpInfo is unordered or higher, false otherwise.
205   bool isAtomic() const {
206     return Ordering != AtomicOrdering::NotAtomic;
207   }
208 
209 };
210 
211 class SIMemOpAccess final {
212 private:
213   AMDGPUMachineModuleInfo *MMI = nullptr;
214 
215   /// Reports unsupported message \p Msg for \p MI to LLVM context.
216   void reportUnsupported(const MachineBasicBlock::iterator &MI,
217                          const char *Msg) const;
218 
219   /// Inspects the target synchronization scope \p SSID and determines
220   /// the SI atomic scope it corresponds to, the address spaces it
221   /// covers, and whether the memory ordering applies between address
222   /// spaces.
223   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
224   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
225 
226   /// \return Return a bit set of the address spaces accessed by \p AS.
227   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
228 
229   /// \returns Info constructed from \p MI, which has at least machine memory
230   /// operand.
231   Optional<SIMemOpInfo> constructFromMIWithMMO(
232       const MachineBasicBlock::iterator &MI) const;
233 
234 public:
235   /// Construct class to support accessing the machine memory operands
236   /// of instructions in the machine function \p MF.
237   SIMemOpAccess(MachineFunction &MF);
238 
239   /// \returns Load info if \p MI is a load operation, "None" otherwise.
240   Optional<SIMemOpInfo> getLoadInfo(
241       const MachineBasicBlock::iterator &MI) const;
242 
243   /// \returns Store info if \p MI is a store operation, "None" otherwise.
244   Optional<SIMemOpInfo> getStoreInfo(
245       const MachineBasicBlock::iterator &MI) const;
246 
247   /// \returns Atomic fence info if \p MI is an atomic fence operation,
248   /// "None" otherwise.
249   Optional<SIMemOpInfo> getAtomicFenceInfo(
250       const MachineBasicBlock::iterator &MI) const;
251 
252   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
253   /// rmw operation, "None" otherwise.
254   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
255       const MachineBasicBlock::iterator &MI) const;
256 };
257 
258 class SICacheControl {
259 protected:
260 
261   /// AMDGPU subtarget info.
262   const GCNSubtarget &ST;
263 
264   /// Instruction info.
265   const SIInstrInfo *TII = nullptr;
266 
267   IsaVersion IV;
268 
269   /// Whether to insert cache invalidating instructions.
270   bool InsertCacheInv;
271 
272   SICacheControl(const GCNSubtarget &ST);
273 
274   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
275   /// \returns Returns true if \p MI is modified, false otherwise.
276   bool enableNamedBit(const MachineBasicBlock::iterator MI,
277                       AMDGPU::CPol::CPol Bit) const;
278 
279 public:
280 
281   /// Create a cache control for the subtarget \p ST.
282   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
283 
284   /// Update \p MI memory load instruction to bypass any caches up to
285   /// the \p Scope memory scope for address spaces \p
286   /// AddrSpace. Return true iff the instruction was modified.
287   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
288                                      SIAtomicScope Scope,
289                                      SIAtomicAddrSpace AddrSpace) const = 0;
290 
291   /// Update \p MI memory store instruction to bypass any caches up to
292   /// the \p Scope memory scope for address spaces \p
293   /// AddrSpace. Return true iff the instruction was modified.
294   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
295                                       SIAtomicScope Scope,
296                                       SIAtomicAddrSpace AddrSpace) const = 0;
297 
298   /// Update \p MI memory read-modify-write instruction to bypass any caches up
299   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
300   /// iff the instruction was modified.
301   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
302                                     SIAtomicScope Scope,
303                                     SIAtomicAddrSpace AddrSpace) const = 0;
304 
305   /// Update \p MI memory instruction of kind \p Op associated with address
306   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
307   /// true iff the instruction was modified.
308   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
309                                               SIAtomicAddrSpace AddrSpace,
310                                               SIMemOp Op, bool IsVolatile,
311                                               bool IsNonTemporal) const = 0;
312 
313   /// Inserts any necessary instructions at position \p Pos relative
314   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
315   /// \p Op associated with address spaces \p AddrSpace have completed. Used
316   /// between memory instructions to enforce the order they become visible as
317   /// observed by other memory instructions executing in memory scope \p Scope.
318   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
319   /// address spaces. Returns true iff any instructions inserted.
320   virtual bool insertWait(MachineBasicBlock::iterator &MI,
321                           SIAtomicScope Scope,
322                           SIAtomicAddrSpace AddrSpace,
323                           SIMemOp Op,
324                           bool IsCrossAddrSpaceOrdering,
325                           Position Pos) const = 0;
326 
327   /// Inserts any necessary instructions at position \p Pos relative to
328   /// instruction \p MI to ensure any subsequent memory instructions of this
329   /// thread with address spaces \p AddrSpace will observe the previous memory
330   /// operations by any thread for memory scopes up to memory scope \p Scope .
331   /// Returns true iff any instructions inserted.
332   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
333                              SIAtomicScope Scope,
334                              SIAtomicAddrSpace AddrSpace,
335                              Position Pos) const = 0;
336 
337   /// Inserts any necessary instructions at position \p Pos relative to
338   /// instruction \p MI to ensure previous memory instructions by this thread
339   /// with address spaces \p AddrSpace have completed and can be observed by
340   /// subsequent memory instructions by any thread executing in memory scope \p
341   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
342   /// between address spaces. Returns true iff any instructions inserted.
343   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
344                              SIAtomicScope Scope,
345                              SIAtomicAddrSpace AddrSpace,
346                              bool IsCrossAddrSpaceOrdering,
347                              Position Pos) const = 0;
348 
349   /// Virtual destructor to allow derivations to be deleted.
350   virtual ~SICacheControl() = default;
351 
352 };
353 
354 class SIGfx6CacheControl : public SICacheControl {
355 protected:
356 
357   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
358   /// is modified, false otherwise.
359   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
360     return enableNamedBit(MI, AMDGPU::CPol::GLC);
361   }
362 
363   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
364   /// is modified, false otherwise.
365   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
366     return enableNamedBit(MI, AMDGPU::CPol::SLC);
367   }
368 
369 public:
370 
371   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
372 
373   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
374                              SIAtomicScope Scope,
375                              SIAtomicAddrSpace AddrSpace) const override;
376 
377   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
378                               SIAtomicScope Scope,
379                               SIAtomicAddrSpace AddrSpace) const override;
380 
381   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
382                             SIAtomicScope Scope,
383                             SIAtomicAddrSpace AddrSpace) const override;
384 
385   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
386                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
387                                       bool IsVolatile,
388                                       bool IsNonTemporal) const override;
389 
390   bool insertWait(MachineBasicBlock::iterator &MI,
391                   SIAtomicScope Scope,
392                   SIAtomicAddrSpace AddrSpace,
393                   SIMemOp Op,
394                   bool IsCrossAddrSpaceOrdering,
395                   Position Pos) const override;
396 
397   bool insertAcquire(MachineBasicBlock::iterator &MI,
398                      SIAtomicScope Scope,
399                      SIAtomicAddrSpace AddrSpace,
400                      Position Pos) const override;
401 
402   bool insertRelease(MachineBasicBlock::iterator &MI,
403                      SIAtomicScope Scope,
404                      SIAtomicAddrSpace AddrSpace,
405                      bool IsCrossAddrSpaceOrdering,
406                      Position Pos) const override;
407 };
408 
409 class SIGfx7CacheControl : public SIGfx6CacheControl {
410 public:
411 
412   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
413 
414   bool insertAcquire(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      Position Pos) const override;
418 
419 };
420 
421 class SIGfx90ACacheControl : public SIGfx7CacheControl {
422 public:
423 
424   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
425 
426   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
427                              SIAtomicScope Scope,
428                              SIAtomicAddrSpace AddrSpace) const override;
429 
430   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
431                               SIAtomicScope Scope,
432                               SIAtomicAddrSpace AddrSpace) const override;
433 
434   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
435                             SIAtomicScope Scope,
436                             SIAtomicAddrSpace AddrSpace) const override;
437 
438   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
439                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
440                                       bool IsVolatile,
441                                       bool IsNonTemporal) const override;
442 
443   bool insertWait(MachineBasicBlock::iterator &MI,
444                   SIAtomicScope Scope,
445                   SIAtomicAddrSpace AddrSpace,
446                   SIMemOp Op,
447                   bool IsCrossAddrSpaceOrdering,
448                   Position Pos) const override;
449 
450   bool insertAcquire(MachineBasicBlock::iterator &MI,
451                      SIAtomicScope Scope,
452                      SIAtomicAddrSpace AddrSpace,
453                      Position Pos) const override;
454 
455   bool insertRelease(MachineBasicBlock::iterator &MI,
456                      SIAtomicScope Scope,
457                      SIAtomicAddrSpace AddrSpace,
458                      bool IsCrossAddrSpaceOrdering,
459                      Position Pos) const override;
460 };
461 
462 class SIGfx10CacheControl : public SIGfx7CacheControl {
463 protected:
464 
465   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
466   /// is modified, false otherwise.
467   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
468     return enableNamedBit(MI, AMDGPU::CPol::DLC);
469   }
470 
471 public:
472 
473   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
474 
475   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
476                              SIAtomicScope Scope,
477                              SIAtomicAddrSpace AddrSpace) const override;
478 
479   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
480                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
481                                       bool IsVolatile,
482                                       bool IsNonTemporal) const override;
483 
484   bool insertWait(MachineBasicBlock::iterator &MI,
485                   SIAtomicScope Scope,
486                   SIAtomicAddrSpace AddrSpace,
487                   SIMemOp Op,
488                   bool IsCrossAddrSpaceOrdering,
489                   Position Pos) const override;
490 
491   bool insertAcquire(MachineBasicBlock::iterator &MI,
492                      SIAtomicScope Scope,
493                      SIAtomicAddrSpace AddrSpace,
494                      Position Pos) const override;
495 };
496 
497 class SIMemoryLegalizer final : public MachineFunctionPass {
498 private:
499 
500   /// Cache Control.
501   std::unique_ptr<SICacheControl> CC = nullptr;
502 
503   /// List of atomic pseudo instructions.
504   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
505 
506   /// Return true iff instruction \p MI is a atomic instruction that
507   /// returns a result.
508   bool isAtomicRet(const MachineInstr &MI) const {
509     return SIInstrInfo::isAtomicRet(MI);
510   }
511 
512   /// Removes all processed atomic pseudo instructions from the current
513   /// function. Returns true if current function is modified, false otherwise.
514   bool removeAtomicPseudoMIs();
515 
516   /// Expands load operation \p MI. Returns true if instructions are
517   /// added/deleted or \p MI is modified, false otherwise.
518   bool expandLoad(const SIMemOpInfo &MOI,
519                   MachineBasicBlock::iterator &MI);
520   /// Expands store operation \p MI. Returns true if instructions are
521   /// added/deleted or \p MI is modified, false otherwise.
522   bool expandStore(const SIMemOpInfo &MOI,
523                    MachineBasicBlock::iterator &MI);
524   /// Expands atomic fence operation \p MI. Returns true if
525   /// instructions are added/deleted or \p MI is modified, false otherwise.
526   bool expandAtomicFence(const SIMemOpInfo &MOI,
527                          MachineBasicBlock::iterator &MI);
528   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
529   /// instructions are added/deleted or \p MI is modified, false otherwise.
530   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
531                                 MachineBasicBlock::iterator &MI);
532 
533 public:
534   static char ID;
535 
536   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
537 
538   void getAnalysisUsage(AnalysisUsage &AU) const override {
539     AU.setPreservesCFG();
540     MachineFunctionPass::getAnalysisUsage(AU);
541   }
542 
543   StringRef getPassName() const override {
544     return PASS_NAME;
545   }
546 
547   bool runOnMachineFunction(MachineFunction &MF) override;
548 };
549 
550 } // end namespace anonymous
551 
552 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
553                                       const char *Msg) const {
554   const Function &Func = MI->getParent()->getParent()->getFunction();
555   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
556   Func.getContext().diagnose(Diag);
557 }
558 
559 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
561                                SIAtomicAddrSpace InstrAddrSpace) const {
562   if (SSID == SyncScope::System)
563     return std::make_tuple(SIAtomicScope::SYSTEM,
564                            SIAtomicAddrSpace::ATOMIC,
565                            true);
566   if (SSID == MMI->getAgentSSID())
567     return std::make_tuple(SIAtomicScope::AGENT,
568                            SIAtomicAddrSpace::ATOMIC,
569                            true);
570   if (SSID == MMI->getWorkgroupSSID())
571     return std::make_tuple(SIAtomicScope::WORKGROUP,
572                            SIAtomicAddrSpace::ATOMIC,
573                            true);
574   if (SSID == MMI->getWavefrontSSID())
575     return std::make_tuple(SIAtomicScope::WAVEFRONT,
576                            SIAtomicAddrSpace::ATOMIC,
577                            true);
578   if (SSID == SyncScope::SingleThread)
579     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
580                            SIAtomicAddrSpace::ATOMIC,
581                            true);
582   if (SSID == MMI->getSystemOneAddressSpaceSSID())
583     return std::make_tuple(SIAtomicScope::SYSTEM,
584                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
585                            false);
586   if (SSID == MMI->getAgentOneAddressSpaceSSID())
587     return std::make_tuple(SIAtomicScope::AGENT,
588                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
589                            false);
590   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
591     return std::make_tuple(SIAtomicScope::WORKGROUP,
592                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
593                            false);
594   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
595     return std::make_tuple(SIAtomicScope::WAVEFRONT,
596                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
597                            false);
598   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
599     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
600                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
601                            false);
602   return None;
603 }
604 
605 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
606   if (AS == AMDGPUAS::FLAT_ADDRESS)
607     return SIAtomicAddrSpace::FLAT;
608   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
609     return SIAtomicAddrSpace::GLOBAL;
610   if (AS == AMDGPUAS::LOCAL_ADDRESS)
611     return SIAtomicAddrSpace::LDS;
612   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
613     return SIAtomicAddrSpace::SCRATCH;
614   if (AS == AMDGPUAS::REGION_ADDRESS)
615     return SIAtomicAddrSpace::GDS;
616 
617   return SIAtomicAddrSpace::OTHER;
618 }
619 
620 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
621   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
622 }
623 
624 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
625     const MachineBasicBlock::iterator &MI) const {
626   assert(MI->getNumMemOperands() > 0);
627 
628   SyncScope::ID SSID = SyncScope::SingleThread;
629   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
630   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
631   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
632   bool IsNonTemporal = true;
633   bool IsVolatile = false;
634 
635   // Validator should check whether or not MMOs cover the entire set of
636   // locations accessed by the memory instruction.
637   for (const auto &MMO : MI->memoperands()) {
638     IsNonTemporal &= MMO->isNonTemporal();
639     IsVolatile |= MMO->isVolatile();
640     InstrAddrSpace |=
641       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
642     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
643     if (OpOrdering != AtomicOrdering::NotAtomic) {
644       const auto &IsSyncScopeInclusion =
645           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
646       if (!IsSyncScopeInclusion) {
647         reportUnsupported(MI,
648           "Unsupported non-inclusive atomic synchronization scope");
649         return None;
650       }
651 
652       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
653       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
654       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
655              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
656       FailureOrdering =
657           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
658     }
659   }
660 
661   SIAtomicScope Scope = SIAtomicScope::NONE;
662   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
663   bool IsCrossAddressSpaceOrdering = false;
664   if (Ordering != AtomicOrdering::NotAtomic) {
665     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
666     if (!ScopeOrNone) {
667       reportUnsupported(MI, "Unsupported atomic synchronization scope");
668       return None;
669     }
670     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
671       ScopeOrNone.getValue();
672     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
673         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
674         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
675       reportUnsupported(MI, "Unsupported atomic address space");
676       return None;
677     }
678   }
679   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
680                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
681                      IsNonTemporal);
682 }
683 
684 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
685     const MachineBasicBlock::iterator &MI) const {
686   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
687 
688   if (!(MI->mayLoad() && !MI->mayStore()))
689     return None;
690 
691   // Be conservative if there are no memory operands.
692   if (MI->getNumMemOperands() == 0)
693     return SIMemOpInfo();
694 
695   return constructFromMIWithMMO(MI);
696 }
697 
698 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
699     const MachineBasicBlock::iterator &MI) const {
700   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
701 
702   if (!(!MI->mayLoad() && MI->mayStore()))
703     return None;
704 
705   // Be conservative if there are no memory operands.
706   if (MI->getNumMemOperands() == 0)
707     return SIMemOpInfo();
708 
709   return constructFromMIWithMMO(MI);
710 }
711 
712 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
713     const MachineBasicBlock::iterator &MI) const {
714   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
715 
716   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
717     return None;
718 
719   AtomicOrdering Ordering =
720     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
721 
722   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
723   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
724   if (!ScopeOrNone) {
725     reportUnsupported(MI, "Unsupported atomic synchronization scope");
726     return None;
727   }
728 
729   SIAtomicScope Scope = SIAtomicScope::NONE;
730   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
731   bool IsCrossAddressSpaceOrdering = false;
732   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
733     ScopeOrNone.getValue();
734 
735   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
736       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
737     reportUnsupported(MI, "Unsupported atomic address space");
738     return None;
739   }
740 
741   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
742                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
743 }
744 
745 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
746     const MachineBasicBlock::iterator &MI) const {
747   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
748 
749   if (!(MI->mayLoad() && MI->mayStore()))
750     return None;
751 
752   // Be conservative if there are no memory operands.
753   if (MI->getNumMemOperands() == 0)
754     return SIMemOpInfo();
755 
756   return constructFromMIWithMMO(MI);
757 }
758 
759 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
760   TII = ST.getInstrInfo();
761   IV = getIsaVersion(ST.getCPU());
762   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
763 }
764 
765 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
766                                     AMDGPU::CPol::CPol Bit) const {
767   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
768   if (!CPol)
769     return false;
770 
771   CPol->setImm(CPol->getImm() | Bit);
772   return true;
773 }
774 
775 /* static */
776 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
777   GCNSubtarget::Generation Generation = ST.getGeneration();
778   if (ST.hasGFX90AInsts())
779     return std::make_unique<SIGfx90ACacheControl>(ST);
780   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
781     return std::make_unique<SIGfx6CacheControl>(ST);
782   if (Generation < AMDGPUSubtarget::GFX10)
783     return std::make_unique<SIGfx7CacheControl>(ST);
784   return std::make_unique<SIGfx10CacheControl>(ST);
785 }
786 
787 bool SIGfx6CacheControl::enableLoadCacheBypass(
788     const MachineBasicBlock::iterator &MI,
789     SIAtomicScope Scope,
790     SIAtomicAddrSpace AddrSpace) const {
791   assert(MI->mayLoad() && !MI->mayStore());
792   bool Changed = false;
793 
794   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
795     switch (Scope) {
796     case SIAtomicScope::SYSTEM:
797     case SIAtomicScope::AGENT:
798       // Set L1 cache policy to MISS_EVICT.
799       // Note: there is no L2 cache bypass policy at the ISA level.
800       Changed |= enableGLCBit(MI);
801       break;
802     case SIAtomicScope::WORKGROUP:
803     case SIAtomicScope::WAVEFRONT:
804     case SIAtomicScope::SINGLETHREAD:
805       // No cache to bypass.
806       break;
807     default:
808       llvm_unreachable("Unsupported synchronization scope");
809     }
810   }
811 
812   /// The scratch address space does not need the global memory caches
813   /// to be bypassed as all memory operations by the same thread are
814   /// sequentially consistent, and no other thread can access scratch
815   /// memory.
816 
817   /// Other address spaces do not have a cache.
818 
819   return Changed;
820 }
821 
822 bool SIGfx6CacheControl::enableStoreCacheBypass(
823     const MachineBasicBlock::iterator &MI,
824     SIAtomicScope Scope,
825     SIAtomicAddrSpace AddrSpace) const {
826   assert(!MI->mayLoad() && MI->mayStore());
827   bool Changed = false;
828 
829   /// The L1 cache is write through so does not need to be bypassed. There is no
830   /// bypass control for the L2 cache at the isa level.
831 
832   return Changed;
833 }
834 
835 bool SIGfx6CacheControl::enableRMWCacheBypass(
836     const MachineBasicBlock::iterator &MI,
837     SIAtomicScope Scope,
838     SIAtomicAddrSpace AddrSpace) const {
839   assert(MI->mayLoad() && MI->mayStore());
840   bool Changed = false;
841 
842   /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
843   /// bypassed, and the GLC bit is instead used to indicate if they are
844   /// return or no-return.
845   /// Note: there is no L2 cache coherent bypass control at the ISA level.
846 
847   return Changed;
848 }
849 
850 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
851     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
852     bool IsVolatile, bool IsNonTemporal) const {
853   // Only handle load and store, not atomic read-modify-write insructions. The
854   // latter use glc to indicate if the atomic returns a result and so must not
855   // be used for cache control.
856   assert(MI->mayLoad() ^ MI->mayStore());
857 
858   // Only update load and store, not LLVM IR atomic read-modify-write
859   // instructions. The latter are always marked as volatile so cannot sensibly
860   // handle it as do not want to pessimize all atomics. Also they do not support
861   // the nontemporal attribute.
862   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
863 
864   bool Changed = false;
865 
866   if (IsVolatile) {
867     // Set L1 cache policy to be MISS_EVICT for load instructions
868     // and MISS_LRU for store instructions.
869     // Note: there is no L2 cache bypass policy at the ISA level.
870     if (Op == SIMemOp::LOAD)
871       Changed |= enableGLCBit(MI);
872 
873     // Ensure operation has completed at system scope to cause all volatile
874     // operations to be visible outside the program in a global order. Do not
875     // request cross address space as only the global address space can be
876     // observable outside the program, so no need to cause a waitcnt for LDS
877     // address space operations.
878     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
879                           Position::AFTER);
880 
881     return Changed;
882   }
883 
884   if (IsNonTemporal) {
885     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
886     // for both loads and stores, and the L2 cache policy to STREAM.
887     Changed |= enableGLCBit(MI);
888     Changed |= enableSLCBit(MI);
889     return Changed;
890   }
891 
892   return Changed;
893 }
894 
895 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
896                                     SIAtomicScope Scope,
897                                     SIAtomicAddrSpace AddrSpace,
898                                     SIMemOp Op,
899                                     bool IsCrossAddrSpaceOrdering,
900                                     Position Pos) const {
901   bool Changed = false;
902 
903   MachineBasicBlock &MBB = *MI->getParent();
904   DebugLoc DL = MI->getDebugLoc();
905 
906   if (Pos == Position::AFTER)
907     ++MI;
908 
909   bool VMCnt = false;
910   bool LGKMCnt = false;
911 
912   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
913       SIAtomicAddrSpace::NONE) {
914     switch (Scope) {
915     case SIAtomicScope::SYSTEM:
916     case SIAtomicScope::AGENT:
917       VMCnt |= true;
918       break;
919     case SIAtomicScope::WORKGROUP:
920     case SIAtomicScope::WAVEFRONT:
921     case SIAtomicScope::SINGLETHREAD:
922       // The L1 cache keeps all memory operations in order for
923       // wavefronts in the same work-group.
924       break;
925     default:
926       llvm_unreachable("Unsupported synchronization scope");
927     }
928   }
929 
930   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
931     switch (Scope) {
932     case SIAtomicScope::SYSTEM:
933     case SIAtomicScope::AGENT:
934     case SIAtomicScope::WORKGROUP:
935       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
936       // not needed as LDS operations for all waves are executed in a total
937       // global ordering as observed by all waves. Required if also
938       // synchronizing with global/GDS memory as LDS operations could be
939       // reordered with respect to later global/GDS memory operations of the
940       // same wave.
941       LGKMCnt |= IsCrossAddrSpaceOrdering;
942       break;
943     case SIAtomicScope::WAVEFRONT:
944     case SIAtomicScope::SINGLETHREAD:
945       // The LDS keeps all memory operations in order for
946       // the same wavefront.
947       break;
948     default:
949       llvm_unreachable("Unsupported synchronization scope");
950     }
951   }
952 
953   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
954     switch (Scope) {
955     case SIAtomicScope::SYSTEM:
956     case SIAtomicScope::AGENT:
957       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
958       // is not needed as GDS operations for all waves are executed in a total
959       // global ordering as observed by all waves. Required if also
960       // synchronizing with global/LDS memory as GDS operations could be
961       // reordered with respect to later global/LDS memory operations of the
962       // same wave.
963       LGKMCnt |= IsCrossAddrSpaceOrdering;
964       break;
965     case SIAtomicScope::WORKGROUP:
966     case SIAtomicScope::WAVEFRONT:
967     case SIAtomicScope::SINGLETHREAD:
968       // The GDS keeps all memory operations in order for
969       // the same work-group.
970       break;
971     default:
972       llvm_unreachable("Unsupported synchronization scope");
973     }
974   }
975 
976   if (VMCnt || LGKMCnt) {
977     unsigned WaitCntImmediate =
978       AMDGPU::encodeWaitcnt(IV,
979                             VMCnt ? 0 : getVmcntBitMask(IV),
980                             getExpcntBitMask(IV),
981                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
982     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
983     Changed = true;
984   }
985 
986   if (Pos == Position::AFTER)
987     --MI;
988 
989   return Changed;
990 }
991 
992 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
993                                        SIAtomicScope Scope,
994                                        SIAtomicAddrSpace AddrSpace,
995                                        Position Pos) const {
996   if (!InsertCacheInv)
997     return false;
998 
999   bool Changed = false;
1000 
1001   MachineBasicBlock &MBB = *MI->getParent();
1002   DebugLoc DL = MI->getDebugLoc();
1003 
1004   if (Pos == Position::AFTER)
1005     ++MI;
1006 
1007   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1008     switch (Scope) {
1009     case SIAtomicScope::SYSTEM:
1010     case SIAtomicScope::AGENT:
1011       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1012       Changed = true;
1013       break;
1014     case SIAtomicScope::WORKGROUP:
1015     case SIAtomicScope::WAVEFRONT:
1016     case SIAtomicScope::SINGLETHREAD:
1017       // No cache to invalidate.
1018       break;
1019     default:
1020       llvm_unreachable("Unsupported synchronization scope");
1021     }
1022   }
1023 
1024   /// The scratch address space does not need the global memory cache
1025   /// to be flushed as all memory operations by the same thread are
1026   /// sequentially consistent, and no other thread can access scratch
1027   /// memory.
1028 
1029   /// Other address spaces do not have a cache.
1030 
1031   if (Pos == Position::AFTER)
1032     --MI;
1033 
1034   return Changed;
1035 }
1036 
1037 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1038                                        SIAtomicScope Scope,
1039                                        SIAtomicAddrSpace AddrSpace,
1040                                        bool IsCrossAddrSpaceOrdering,
1041                                        Position Pos) const {
1042   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1043                     IsCrossAddrSpaceOrdering, Pos);
1044 }
1045 
1046 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1047                                        SIAtomicScope Scope,
1048                                        SIAtomicAddrSpace AddrSpace,
1049                                        Position Pos) const {
1050   if (!InsertCacheInv)
1051     return false;
1052 
1053   bool Changed = false;
1054 
1055   MachineBasicBlock &MBB = *MI->getParent();
1056   DebugLoc DL = MI->getDebugLoc();
1057 
1058   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1059 
1060   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1061                                     ? AMDGPU::BUFFER_WBINVL1
1062                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1063 
1064   if (Pos == Position::AFTER)
1065     ++MI;
1066 
1067   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1068     switch (Scope) {
1069     case SIAtomicScope::SYSTEM:
1070     case SIAtomicScope::AGENT:
1071       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1072       Changed = true;
1073       break;
1074     case SIAtomicScope::WORKGROUP:
1075     case SIAtomicScope::WAVEFRONT:
1076     case SIAtomicScope::SINGLETHREAD:
1077       // No cache to invalidate.
1078       break;
1079     default:
1080       llvm_unreachable("Unsupported synchronization scope");
1081     }
1082   }
1083 
1084   /// The scratch address space does not need the global memory cache
1085   /// to be flushed as all memory operations by the same thread are
1086   /// sequentially consistent, and no other thread can access scratch
1087   /// memory.
1088 
1089   /// Other address spaces do not have a cache.
1090 
1091   if (Pos == Position::AFTER)
1092     --MI;
1093 
1094   return Changed;
1095 }
1096 
1097 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1098     const MachineBasicBlock::iterator &MI,
1099     SIAtomicScope Scope,
1100     SIAtomicAddrSpace AddrSpace) const {
1101   assert(MI->mayLoad() && !MI->mayStore());
1102   bool Changed = false;
1103 
1104   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1105     switch (Scope) {
1106     case SIAtomicScope::SYSTEM:
1107     case SIAtomicScope::AGENT:
1108       // Set the L1 cache policy to MISS_LRU.
1109       // Note: there is no L2 cache bypass policy at the ISA level.
1110       Changed |= enableGLCBit(MI);
1111       break;
1112     case SIAtomicScope::WORKGROUP:
1113       // In threadgroup split mode the waves of a work-group can be executing on
1114       // different CUs. Therefore need to bypass the L1 which is per CU.
1115       // Otherwise in non-threadgroup split mode all waves of a work-group are
1116       // on the same CU, and so the L1 does not need to be bypassed.
1117       if (ST.isTgSplitEnabled())
1118         Changed |= enableGLCBit(MI);
1119       break;
1120     case SIAtomicScope::WAVEFRONT:
1121     case SIAtomicScope::SINGLETHREAD:
1122       // No cache to bypass.
1123       break;
1124     default:
1125       llvm_unreachable("Unsupported synchronization scope");
1126     }
1127   }
1128 
1129   /// The scratch address space does not need the global memory caches
1130   /// to be bypassed as all memory operations by the same thread are
1131   /// sequentially consistent, and no other thread can access scratch
1132   /// memory.
1133 
1134   /// Other address spaces do not have a cache.
1135 
1136   return Changed;
1137 }
1138 
1139 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1140     const MachineBasicBlock::iterator &MI,
1141     SIAtomicScope Scope,
1142     SIAtomicAddrSpace AddrSpace) const {
1143   assert(!MI->mayLoad() && MI->mayStore());
1144   bool Changed = false;
1145 
1146   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1147     switch (Scope) {
1148     case SIAtomicScope::SYSTEM:
1149     case SIAtomicScope::AGENT:
1150       /// Do not set glc for store atomic operations as they implicitly write
1151       /// through the L1 cache.
1152       break;
1153     case SIAtomicScope::WORKGROUP:
1154     case SIAtomicScope::WAVEFRONT:
1155     case SIAtomicScope::SINGLETHREAD:
1156       // No cache to bypass. Store atomics implicitly write through the L1
1157       // cache.
1158       break;
1159     default:
1160       llvm_unreachable("Unsupported synchronization scope");
1161     }
1162   }
1163 
1164   /// The scratch address space does not need the global memory caches
1165   /// to be bypassed as all memory operations by the same thread are
1166   /// sequentially consistent, and no other thread can access scratch
1167   /// memory.
1168 
1169   /// Other address spaces do not have a cache.
1170 
1171   return Changed;
1172 }
1173 
1174 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1175     const MachineBasicBlock::iterator &MI,
1176     SIAtomicScope Scope,
1177     SIAtomicAddrSpace AddrSpace) const {
1178   assert(MI->mayLoad() && MI->mayStore());
1179   bool Changed = false;
1180 
1181   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1182     switch (Scope) {
1183     case SIAtomicScope::SYSTEM:
1184     case SIAtomicScope::AGENT:
1185       /// Do not set glc for RMW atomic operations as they implicitly bypass
1186       /// the L1 cache, and the glc bit is instead used to indicate if they are
1187       /// return or no-return.
1188       break;
1189     case SIAtomicScope::WORKGROUP:
1190     case SIAtomicScope::WAVEFRONT:
1191     case SIAtomicScope::SINGLETHREAD:
1192       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1193       break;
1194     default:
1195       llvm_unreachable("Unsupported synchronization scope");
1196     }
1197   }
1198 
1199   return Changed;
1200 }
1201 
1202 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1203     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1204     bool IsVolatile, bool IsNonTemporal) const {
1205   // Only handle load and store, not atomic read-modify-write insructions. The
1206   // latter use glc to indicate if the atomic returns a result and so must not
1207   // be used for cache control.
1208   assert(MI->mayLoad() ^ MI->mayStore());
1209 
1210   // Only update load and store, not LLVM IR atomic read-modify-write
1211   // instructions. The latter are always marked as volatile so cannot sensibly
1212   // handle it as do not want to pessimize all atomics. Also they do not support
1213   // the nontemporal attribute.
1214   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1215 
1216   bool Changed = false;
1217 
1218   if (IsVolatile) {
1219     // Set L1 cache policy to be MISS_EVICT for load instructions
1220     // and MISS_LRU for store instructions.
1221     // Note: there is no L2 cache bypass policy at the ISA level.
1222     if (Op == SIMemOp::LOAD)
1223       Changed |= enableGLCBit(MI);
1224 
1225     // Ensure operation has completed at system scope to cause all volatile
1226     // operations to be visible outside the program in a global order. Do not
1227     // request cross address space as only the global address space can be
1228     // observable outside the program, so no need to cause a waitcnt for LDS
1229     // address space operations.
1230     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1231                           Position::AFTER);
1232 
1233     return Changed;
1234   }
1235 
1236   if (IsNonTemporal) {
1237     // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1238     // for both loads and stores, and the L2 cache policy to STREAM.
1239     Changed |= enableGLCBit(MI);
1240     Changed |= enableSLCBit(MI);
1241     return Changed;
1242   }
1243 
1244   return Changed;
1245 }
1246 
1247 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1248                                       SIAtomicScope Scope,
1249                                       SIAtomicAddrSpace AddrSpace,
1250                                       SIMemOp Op,
1251                                       bool IsCrossAddrSpaceOrdering,
1252                                       Position Pos) const {
1253   if (ST.isTgSplitEnabled()) {
1254     // In threadgroup split mode the waves of a work-group can be executing on
1255     // different CUs. Therefore need to wait for global or GDS memory operations
1256     // to complete to ensure they are visible to waves in the other CUs.
1257     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1258     // the same CU, so no need to wait for global memory as all waves in the
1259     // work-group access the same the L1, nor wait for GDS as access are ordered
1260     // on a CU.
1261     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1262                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1263         (Scope == SIAtomicScope::WORKGROUP)) {
1264       // Same as GFX7 using agent scope.
1265       Scope = SIAtomicScope::AGENT;
1266     }
1267     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1268     // LDS memory operations.
1269     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1270   }
1271   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1272                                         IsCrossAddrSpaceOrdering, Pos);
1273 }
1274 
1275 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1276                                          SIAtomicScope Scope,
1277                                          SIAtomicAddrSpace AddrSpace,
1278                                          Position Pos) const {
1279   if (!InsertCacheInv)
1280     return false;
1281 
1282   bool Changed = false;
1283 
1284   MachineBasicBlock &MBB = *MI->getParent();
1285   DebugLoc DL = MI->getDebugLoc();
1286 
1287   if (Pos == Position::AFTER)
1288     ++MI;
1289 
1290   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1291     switch (Scope) {
1292     case SIAtomicScope::SYSTEM:
1293       // Ensures that following loads will not see stale remote VMEM data or
1294       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1295       // CC will never be stale due to the local memory probes.
1296       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1297       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1298       // hardware does not reorder memory operations by the same wave with
1299       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1300       // remove any cache lines of earlier writes by the same wave and ensures
1301       // later reads by the same wave will refetch the cache lines.
1302       Changed = true;
1303       break;
1304     case SIAtomicScope::AGENT:
1305       // Same as GFX7.
1306       break;
1307     case SIAtomicScope::WORKGROUP:
1308       // In threadgroup split mode the waves of a work-group can be executing on
1309       // different CUs. Therefore need to invalidate the L1 which is per CU.
1310       // Otherwise in non-threadgroup split mode all waves of a work-group are
1311       // on the same CU, and so the L1 does not need to be invalidated.
1312       if (ST.isTgSplitEnabled()) {
1313         // Same as GFX7 using agent scope.
1314         Scope = SIAtomicScope::AGENT;
1315       }
1316       break;
1317     case SIAtomicScope::WAVEFRONT:
1318     case SIAtomicScope::SINGLETHREAD:
1319       // Same as GFX7.
1320       break;
1321     default:
1322       llvm_unreachable("Unsupported synchronization scope");
1323     }
1324   }
1325 
1326   /// The scratch address space does not need the global memory cache
1327   /// to be flushed as all memory operations by the same thread are
1328   /// sequentially consistent, and no other thread can access scratch
1329   /// memory.
1330 
1331   /// Other address spaces do not have a cache.
1332 
1333   if (Pos == Position::AFTER)
1334     --MI;
1335 
1336   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1337 
1338   return Changed;
1339 }
1340 
1341 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1342                                          SIAtomicScope Scope,
1343                                          SIAtomicAddrSpace AddrSpace,
1344                                          bool IsCrossAddrSpaceOrdering,
1345                                          Position Pos) const {
1346   bool Changed = false;
1347 
1348   MachineBasicBlock &MBB = *MI->getParent();
1349   DebugLoc DL = MI->getDebugLoc();
1350 
1351   if (Pos == Position::AFTER)
1352     ++MI;
1353 
1354   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1355     switch (Scope) {
1356     case SIAtomicScope::SYSTEM:
1357       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1358       // hardware does not reorder memory operations by the same wave with
1359       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1360       // to initiate writeback of any dirty cache lines of earlier writes by the
1361       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1362       // writeback has completed.
1363       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1364         // Set SC bits to indicate system scope.
1365         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1366       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1367       // vmcnt(0)" needed by the "BUFFER_WBL2".
1368       Changed = true;
1369       break;
1370     case SIAtomicScope::AGENT:
1371     case SIAtomicScope::WORKGROUP:
1372     case SIAtomicScope::WAVEFRONT:
1373     case SIAtomicScope::SINGLETHREAD:
1374       // Same as GFX7.
1375       break;
1376     default:
1377       llvm_unreachable("Unsupported synchronization scope");
1378     }
1379   }
1380 
1381   if (Pos == Position::AFTER)
1382     --MI;
1383 
1384   Changed |=
1385       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1386                                         IsCrossAddrSpaceOrdering, Pos);
1387 
1388   return Changed;
1389 }
1390 
1391 bool SIGfx10CacheControl::enableLoadCacheBypass(
1392     const MachineBasicBlock::iterator &MI,
1393     SIAtomicScope Scope,
1394     SIAtomicAddrSpace AddrSpace) const {
1395   assert(MI->mayLoad() && !MI->mayStore());
1396   bool Changed = false;
1397 
1398   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1399     switch (Scope) {
1400     case SIAtomicScope::SYSTEM:
1401     case SIAtomicScope::AGENT:
1402       // Set the L0 and L1 cache policies to MISS_EVICT.
1403       // Note: there is no L2 cache coherent bypass control at the ISA level.
1404       Changed |= enableGLCBit(MI);
1405       Changed |= enableDLCBit(MI);
1406       break;
1407     case SIAtomicScope::WORKGROUP:
1408       // In WGP mode the waves of a work-group can be executing on either CU of
1409       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1410       // CU mode all waves of a work-group are on the same CU, and so the L0
1411       // does not need to be bypassed.
1412       if (!ST.isCuModeEnabled())
1413         Changed |= enableGLCBit(MI);
1414       break;
1415     case SIAtomicScope::WAVEFRONT:
1416     case SIAtomicScope::SINGLETHREAD:
1417       // No cache to bypass.
1418       break;
1419     default:
1420       llvm_unreachable("Unsupported synchronization scope");
1421     }
1422   }
1423 
1424   /// The scratch address space does not need the global memory caches
1425   /// to be bypassed as all memory operations by the same thread are
1426   /// sequentially consistent, and no other thread can access scratch
1427   /// memory.
1428 
1429   /// Other address spaces do not have a cache.
1430 
1431   return Changed;
1432 }
1433 
1434 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1435     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1436     bool IsVolatile, bool IsNonTemporal) const {
1437 
1438   // Only handle load and store, not atomic read-modify-write insructions. The
1439   // latter use glc to indicate if the atomic returns a result and so must not
1440   // be used for cache control.
1441   assert(MI->mayLoad() ^ MI->mayStore());
1442 
1443   // Only update load and store, not LLVM IR atomic read-modify-write
1444   // instructions. The latter are always marked as volatile so cannot sensibly
1445   // handle it as do not want to pessimize all atomics. Also they do not support
1446   // the nontemporal attribute.
1447   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1448 
1449   bool Changed = false;
1450 
1451   if (IsVolatile) {
1452     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1453     // and MISS_LRU for store instructions.
1454     // Note: there is no L2 cache coherent bypass control at the ISA level.
1455     if (Op == SIMemOp::LOAD) {
1456       Changed |= enableGLCBit(MI);
1457       Changed |= enableDLCBit(MI);
1458     }
1459 
1460     // Ensure operation has completed at system scope to cause all volatile
1461     // operations to be visible outside the program in a global order. Do not
1462     // request cross address space as only the global address space can be
1463     // observable outside the program, so no need to cause a waitcnt for LDS
1464     // address space operations.
1465     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1466                           Position::AFTER);
1467     return Changed;
1468   }
1469 
1470   if (IsNonTemporal) {
1471     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1472     // and L2 cache policy to STREAM.
1473     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1474     // to MISS_EVICT and the L2 cache policy to STREAM.
1475     if (Op == SIMemOp::STORE)
1476       Changed |= enableGLCBit(MI);
1477     Changed |= enableSLCBit(MI);
1478 
1479     return Changed;
1480   }
1481 
1482   return Changed;
1483 }
1484 
1485 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1486                                      SIAtomicScope Scope,
1487                                      SIAtomicAddrSpace AddrSpace,
1488                                      SIMemOp Op,
1489                                      bool IsCrossAddrSpaceOrdering,
1490                                      Position Pos) const {
1491   bool Changed = false;
1492 
1493   MachineBasicBlock &MBB = *MI->getParent();
1494   DebugLoc DL = MI->getDebugLoc();
1495 
1496   if (Pos == Position::AFTER)
1497     ++MI;
1498 
1499   bool VMCnt = false;
1500   bool VSCnt = false;
1501   bool LGKMCnt = false;
1502 
1503   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1504       SIAtomicAddrSpace::NONE) {
1505     switch (Scope) {
1506     case SIAtomicScope::SYSTEM:
1507     case SIAtomicScope::AGENT:
1508       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1509         VMCnt |= true;
1510       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1511         VSCnt |= true;
1512       break;
1513     case SIAtomicScope::WORKGROUP:
1514       // In WGP mode the waves of a work-group can be executing on either CU of
1515       // the WGP. Therefore need to wait for operations to complete to ensure
1516       // they are visible to waves in the other CU as the L0 is per CU.
1517       // Otherwise in CU mode and all waves of a work-group are on the same CU
1518       // which shares the same L0.
1519       if (!ST.isCuModeEnabled()) {
1520         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1521           VMCnt |= true;
1522         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1523           VSCnt |= true;
1524       }
1525       break;
1526     case SIAtomicScope::WAVEFRONT:
1527     case SIAtomicScope::SINGLETHREAD:
1528       // The L0 cache keeps all memory operations in order for
1529       // work-items in the same wavefront.
1530       break;
1531     default:
1532       llvm_unreachable("Unsupported synchronization scope");
1533     }
1534   }
1535 
1536   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1537     switch (Scope) {
1538     case SIAtomicScope::SYSTEM:
1539     case SIAtomicScope::AGENT:
1540     case SIAtomicScope::WORKGROUP:
1541       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1542       // not needed as LDS operations for all waves are executed in a total
1543       // global ordering as observed by all waves. Required if also
1544       // synchronizing with global/GDS memory as LDS operations could be
1545       // reordered with respect to later global/GDS memory operations of the
1546       // same wave.
1547       LGKMCnt |= IsCrossAddrSpaceOrdering;
1548       break;
1549     case SIAtomicScope::WAVEFRONT:
1550     case SIAtomicScope::SINGLETHREAD:
1551       // The LDS keeps all memory operations in order for
1552       // the same wavefront.
1553       break;
1554     default:
1555       llvm_unreachable("Unsupported synchronization scope");
1556     }
1557   }
1558 
1559   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1560     switch (Scope) {
1561     case SIAtomicScope::SYSTEM:
1562     case SIAtomicScope::AGENT:
1563       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1564       // is not needed as GDS operations for all waves are executed in a total
1565       // global ordering as observed by all waves. Required if also
1566       // synchronizing with global/LDS memory as GDS operations could be
1567       // reordered with respect to later global/LDS memory operations of the
1568       // same wave.
1569       LGKMCnt |= IsCrossAddrSpaceOrdering;
1570       break;
1571     case SIAtomicScope::WORKGROUP:
1572     case SIAtomicScope::WAVEFRONT:
1573     case SIAtomicScope::SINGLETHREAD:
1574       // The GDS keeps all memory operations in order for
1575       // the same work-group.
1576       break;
1577     default:
1578       llvm_unreachable("Unsupported synchronization scope");
1579     }
1580   }
1581 
1582   if (VMCnt || LGKMCnt) {
1583     unsigned WaitCntImmediate =
1584       AMDGPU::encodeWaitcnt(IV,
1585                             VMCnt ? 0 : getVmcntBitMask(IV),
1586                             getExpcntBitMask(IV),
1587                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1588     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1589     Changed = true;
1590   }
1591 
1592   if (VSCnt) {
1593     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1594       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1595       .addImm(0);
1596     Changed = true;
1597   }
1598 
1599   if (Pos == Position::AFTER)
1600     --MI;
1601 
1602   return Changed;
1603 }
1604 
1605 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1606                                         SIAtomicScope Scope,
1607                                         SIAtomicAddrSpace AddrSpace,
1608                                         Position Pos) const {
1609   if (!InsertCacheInv)
1610     return false;
1611 
1612   bool Changed = false;
1613 
1614   MachineBasicBlock &MBB = *MI->getParent();
1615   DebugLoc DL = MI->getDebugLoc();
1616 
1617   if (Pos == Position::AFTER)
1618     ++MI;
1619 
1620   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1621     switch (Scope) {
1622     case SIAtomicScope::SYSTEM:
1623     case SIAtomicScope::AGENT:
1624       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1625       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1626       Changed = true;
1627       break;
1628     case SIAtomicScope::WORKGROUP:
1629       // In WGP mode the waves of a work-group can be executing on either CU of
1630       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1631       // in CU mode and all waves of a work-group are on the same CU, and so the
1632       // L0 does not need to be invalidated.
1633       if (!ST.isCuModeEnabled()) {
1634         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1635         Changed = true;
1636       }
1637       break;
1638     case SIAtomicScope::WAVEFRONT:
1639     case SIAtomicScope::SINGLETHREAD:
1640       // No cache to invalidate.
1641       break;
1642     default:
1643       llvm_unreachable("Unsupported synchronization scope");
1644     }
1645   }
1646 
1647   /// The scratch address space does not need the global memory cache
1648   /// to be flushed as all memory operations by the same thread are
1649   /// sequentially consistent, and no other thread can access scratch
1650   /// memory.
1651 
1652   /// Other address spaces do not have a cache.
1653 
1654   if (Pos == Position::AFTER)
1655     --MI;
1656 
1657   return Changed;
1658 }
1659 
1660 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1661   if (AtomicPseudoMIs.empty())
1662     return false;
1663 
1664   for (auto &MI : AtomicPseudoMIs)
1665     MI->eraseFromParent();
1666 
1667   AtomicPseudoMIs.clear();
1668   return true;
1669 }
1670 
1671 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1672                                    MachineBasicBlock::iterator &MI) {
1673   assert(MI->mayLoad() && !MI->mayStore());
1674 
1675   bool Changed = false;
1676 
1677   if (MOI.isAtomic()) {
1678     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1679         MOI.getOrdering() == AtomicOrdering::Acquire ||
1680         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1681       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1682                                            MOI.getOrderingAddrSpace());
1683     }
1684 
1685     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1686       Changed |= CC->insertWait(MI, MOI.getScope(),
1687                                 MOI.getOrderingAddrSpace(),
1688                                 SIMemOp::LOAD | SIMemOp::STORE,
1689                                 MOI.getIsCrossAddressSpaceOrdering(),
1690                                 Position::BEFORE);
1691 
1692     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1693         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1694       Changed |= CC->insertWait(MI, MOI.getScope(),
1695                                 MOI.getInstrAddrSpace(),
1696                                 SIMemOp::LOAD,
1697                                 MOI.getIsCrossAddressSpaceOrdering(),
1698                                 Position::AFTER);
1699       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1700                                    MOI.getOrderingAddrSpace(),
1701                                    Position::AFTER);
1702     }
1703 
1704     return Changed;
1705   }
1706 
1707   // Atomic instructions already bypass caches to the scope specified by the
1708   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1709   // need additional treatment.
1710   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1711                                                 SIMemOp::LOAD, MOI.isVolatile(),
1712                                                 MOI.isNonTemporal());
1713   return Changed;
1714 }
1715 
1716 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1717                                     MachineBasicBlock::iterator &MI) {
1718   assert(!MI->mayLoad() && MI->mayStore());
1719 
1720   bool Changed = false;
1721 
1722   if (MOI.isAtomic()) {
1723     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1724         MOI.getOrdering() == AtomicOrdering::Release ||
1725         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1726       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1727                                             MOI.getOrderingAddrSpace());
1728     }
1729 
1730     if (MOI.getOrdering() == AtomicOrdering::Release ||
1731         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1732       Changed |= CC->insertRelease(MI, MOI.getScope(),
1733                                    MOI.getOrderingAddrSpace(),
1734                                    MOI.getIsCrossAddressSpaceOrdering(),
1735                                    Position::BEFORE);
1736 
1737     return Changed;
1738   }
1739 
1740   // Atomic instructions already bypass caches to the scope specified by the
1741   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1742   // need additional treatment.
1743   Changed |= CC->enableVolatileAndOrNonTemporal(
1744       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1745       MOI.isNonTemporal());
1746   return Changed;
1747 }
1748 
1749 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1750                                           MachineBasicBlock::iterator &MI) {
1751   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1752 
1753   AtomicPseudoMIs.push_back(MI);
1754   bool Changed = false;
1755 
1756   if (MOI.isAtomic()) {
1757     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1758         MOI.getOrdering() == AtomicOrdering::Release ||
1759         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1760         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1761       /// TODO: This relies on a barrier always generating a waitcnt
1762       /// for LDS to ensure it is not reordered with the completion of
1763       /// the proceeding LDS operations. If barrier had a memory
1764       /// ordering and memory scope, then library does not need to
1765       /// generate a fence. Could add support in this file for
1766       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1767       /// adding S_WAITCNT before a S_BARRIER.
1768       Changed |= CC->insertRelease(MI, MOI.getScope(),
1769                                    MOI.getOrderingAddrSpace(),
1770                                    MOI.getIsCrossAddressSpaceOrdering(),
1771                                    Position::BEFORE);
1772 
1773     // TODO: If both release and invalidate are happening they could be combined
1774     // to use the single "BUFFER_WBINV*" instruction. This could be done by
1775     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1776     // track cache invalidate and write back instructions.
1777 
1778     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1779         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1780         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1781       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1782                                    MOI.getOrderingAddrSpace(),
1783                                    Position::BEFORE);
1784 
1785     return Changed;
1786   }
1787 
1788   return Changed;
1789 }
1790 
1791 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1792   MachineBasicBlock::iterator &MI) {
1793   assert(MI->mayLoad() && MI->mayStore());
1794 
1795   bool Changed = false;
1796 
1797   if (MOI.isAtomic()) {
1798     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1799         MOI.getOrdering() == AtomicOrdering::Acquire ||
1800         MOI.getOrdering() == AtomicOrdering::Release ||
1801         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1802         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1803       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1804                                           MOI.getInstrAddrSpace());
1805     }
1806 
1807     if (MOI.getOrdering() == AtomicOrdering::Release ||
1808         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1809         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1810         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1811       Changed |= CC->insertRelease(MI, MOI.getScope(),
1812                                    MOI.getOrderingAddrSpace(),
1813                                    MOI.getIsCrossAddressSpaceOrdering(),
1814                                    Position::BEFORE);
1815 
1816     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1817         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1818         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1819         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1820         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1821       Changed |= CC->insertWait(MI, MOI.getScope(),
1822                                 MOI.getInstrAddrSpace(),
1823                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1824                                                    SIMemOp::STORE,
1825                                 MOI.getIsCrossAddressSpaceOrdering(),
1826                                 Position::AFTER);
1827       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1828                                    MOI.getOrderingAddrSpace(),
1829                                    Position::AFTER);
1830     }
1831 
1832     return Changed;
1833   }
1834 
1835   return Changed;
1836 }
1837 
1838 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1839   bool Changed = false;
1840 
1841   SIMemOpAccess MOA(MF);
1842   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1843 
1844   for (auto &MBB : MF) {
1845     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1846 
1847       // Unbundle instructions after the post-RA scheduler.
1848       if (MI->isBundle() && MI->mayLoadOrStore()) {
1849         MachineBasicBlock::instr_iterator II(MI->getIterator());
1850         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1851              I != E && I->isBundledWithPred(); ++I) {
1852           I->unbundleFromPred();
1853           for (MachineOperand &MO : I->operands())
1854             if (MO.isReg())
1855               MO.setIsInternalRead(false);
1856         }
1857 
1858         MI->eraseFromParent();
1859         MI = II->getIterator();
1860       }
1861 
1862       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1863         continue;
1864 
1865       if (const auto &MOI = MOA.getLoadInfo(MI))
1866         Changed |= expandLoad(MOI.getValue(), MI);
1867       else if (const auto &MOI = MOA.getStoreInfo(MI))
1868         Changed |= expandStore(MOI.getValue(), MI);
1869       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1870         Changed |= expandAtomicFence(MOI.getValue(), MI);
1871       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1872         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1873     }
1874   }
1875 
1876   Changed |= removeAtomicPseudoMIs();
1877   return Changed;
1878 }
1879 
1880 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1881 
1882 char SIMemoryLegalizer::ID = 0;
1883 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1884 
1885 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1886   return new SIMemoryLegalizer();
1887 }
1888