1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the AMDGPU memory model. More information
11 /// can be found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34     cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42   NONE = 0u,
43   LOAD = 1u << 0,
44   STORE = 1u << 1,
45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
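// Illustrative only (not part of the original code): because of the
// LLVM_MARK_AS_BITMASK_ENUM marker above, SIMemOp values compose and test
// like ordinary flag masks. A minimal sketch:
//
//   SIMemOp Both = SIMemOp::LOAD | SIMemOp::STORE;
//   bool IsLoad = (Both & SIMemOp::LOAD) != SIMemOp::NONE;  // true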
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51   BEFORE,
52   AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57   NONE,
58   SINGLETHREAD,
59   WAVEFRONT,
60   WORKGROUP,
61   AGENT,
62   SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operations. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68   NONE = 0u,
69   GLOBAL = 1u << 0,
70   LDS = 1u << 1,
71   SCRATCH = 1u << 2,
72   GDS = 1u << 3,
73   OTHER = 1u << 4,
74 
75   /// The address spaces that can be accessed by a FLAT instruction.
76   FLAT = GLOBAL | LDS | SCRATCH,
77 
78   /// The address spaces that support atomic instructions.
79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81   /// All address spaces.
82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
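// Illustrative only: the aggregate masks above support the "everything
// except X" checks used later in this file, e.g. the scope clamping in the
// SIMemOpInfo constructor. A minimal sketch, assuming a value `AS`:
//
//   // AS touches nothing outside the scratch address space.
//   bool OnlyScratch = (AS & ~SIAtomicAddrSpace::SCRATCH) ==
//                      SIAtomicAddrSpace::NONE;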
86 
87 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
88 /// \returns True if \p MI is modified, false otherwise.
89 template <uint16_t BitName>
90 bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
91   int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
92   if (BitIdx == -1)
93     return false;
94 
95   MachineOperand &Bit = MI->getOperand(BitIdx);
96   if (Bit.getImm() != 0)
97     return false;
98 
99   Bit.setImm(1);
100   return true;
101 }
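// Illustrative usage: the cache controls below instantiate this template
// with a specific operand name, e.g. SIGfx6CacheControl::enableGLCBit does
//
//   enableNamedBit<AMDGPU::OpName::glc>(MI);
//
// which is a no-op (returns false) for opcodes without a glc operand or
// when the bit is already set.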
102 
103 class SIMemOpInfo final {
104 private:
105 
106   friend class SIMemOpAccess;
107 
108   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
109   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
110   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
111   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
112   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
113   bool IsCrossAddressSpaceOrdering = false;
114   bool IsVolatile = false;
115   bool IsNonTemporal = false;
116 
117   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
118               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
119               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
120               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
121               bool IsCrossAddressSpaceOrdering = true,
122               AtomicOrdering FailureOrdering =
123                 AtomicOrdering::SequentiallyConsistent,
124               bool IsVolatile = false,
125               bool IsNonTemporal = false)
126     : Ordering(Ordering), FailureOrdering(FailureOrdering),
127       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
128       InstrAddrSpace(InstrAddrSpace),
129       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
130       IsVolatile(IsVolatile),
131       IsNonTemporal(IsNonTemporal) {
132 
133     if (Ordering == AtomicOrdering::NotAtomic) {
134       assert(Scope == SIAtomicScope::NONE &&
135              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
136              !IsCrossAddressSpaceOrdering &&
137              FailureOrdering == AtomicOrdering::NotAtomic);
138       return;
139     }
140 
141     assert(Scope != SIAtomicScope::NONE &&
142            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
143                SIAtomicAddrSpace::NONE &&
144            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
145                SIAtomicAddrSpace::NONE &&
146            !isStrongerThan(FailureOrdering, Ordering));
147 
148     // There is also no cross address space ordering if the ordering
149     // address space is the same as the instruction address space and
150     // only contains a single address space.
151     if ((OrderingAddrSpace == InstrAddrSpace) &&
152         isPowerOf2_32(uint32_t(InstrAddrSpace)))
153       this->IsCrossAddressSpaceOrdering = false;
154 
155     // Limit the scope to the maximum supported by the instruction's address
156     // spaces.
157     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
158         SIAtomicAddrSpace::NONE) {
159       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
160     } else if ((InstrAddrSpace &
161                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
162                SIAtomicAddrSpace::NONE) {
163       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
164     } else if ((InstrAddrSpace &
165                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
166                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
167       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
168     }
169   }
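  // Illustrative examples of the scope clamping above (assuming a requested
  // SYSTEM scope): an instruction whose address spaces are only {SCRATCH} is
  // clamped to SINGLETHREAD, {LDS} (with or without SCRATCH) to WORKGROUP,
  // and {LDS, GDS} to AGENT, since nothing outside those scopes can observe
  // the access.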
170 
171 public:
172   /// \returns Atomic synchronization scope of the machine instruction used to
173   /// create this SIMemOpInfo.
174   SIAtomicScope getScope() const {
175     return Scope;
176   }
177 
178   /// \returns Ordering constraint of the machine instruction used to
179   /// create this SIMemOpInfo.
180   AtomicOrdering getOrdering() const {
181     return Ordering;
182   }
183 
184   /// \returns Failure ordering constraint of the machine instruction used to
185   /// create this SIMemOpInfo.
186   AtomicOrdering getFailureOrdering() const {
187     return FailureOrdering;
188   }
189 
190   /// \returns The address spaces accessed by the machine
191   /// instruction used to create this SIMemOpInfo.
192   SIAtomicAddrSpace getInstrAddrSpace() const {
193     return InstrAddrSpace;
194   }
195 
196   /// \returns The address spaces that must be ordered by the machine
197   /// instruction used to create this SIMemOpInfo.
198   SIAtomicAddrSpace getOrderingAddrSpace() const {
199     return OrderingAddrSpace;
200   }
201 
202   /// \returns True iff memory ordering of operations on
203   /// different address spaces is required.
204   bool getIsCrossAddressSpaceOrdering() const {
205     return IsCrossAddressSpaceOrdering;
206   }
207 
208   /// \returns True if memory access of the machine instruction used to
209   /// create this SIMemOpInfo is volatile, false otherwise.
210   bool isVolatile() const {
211     return IsVolatile;
212   }
213 
214   /// \returns True if memory access of the machine instruction used to
215   /// create this SIMemOpInfo is nontemporal, false otherwise.
216   bool isNonTemporal() const {
217     return IsNonTemporal;
218   }
219 
220   /// \returns True if ordering constraint of the machine instruction used to
221   /// create this SIMemOpInfo is unordered or higher, false otherwise.
222   bool isAtomic() const {
223     return Ordering != AtomicOrdering::NotAtomic;
224   }
225 
226 };
227 
228 class SIMemOpAccess final {
229 private:
230   AMDGPUMachineModuleInfo *MMI = nullptr;
231 
232   /// Reports unsupported message \p Msg for \p MI to the LLVM context.
233   void reportUnsupported(const MachineBasicBlock::iterator &MI,
234                          const char *Msg) const;
235 
236   /// Inspects the target synchronization scope \p SSID and determines
237   /// the SI atomic scope it corresponds to, the address spaces it
238   /// covers, and whether the memory ordering applies between address
239   /// spaces.
240   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
241   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
242 
243   /// \returns The SI atomic address space(s) for LLVM address space \p AS.
244   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
245 
246   /// \returns Info constructed from \p MI, which has at least one machine
247   /// memory operand.
248   Optional<SIMemOpInfo> constructFromMIWithMMO(
249       const MachineBasicBlock::iterator &MI) const;
250 
251 public:
252   /// Construct class to support accessing the machine memory operands
253   /// of instructions in the machine function \p MF.
254   SIMemOpAccess(MachineFunction &MF);
255 
256   /// \returns Load info if \p MI is a load operation, "None" otherwise.
257   Optional<SIMemOpInfo> getLoadInfo(
258       const MachineBasicBlock::iterator &MI) const;
259 
260   /// \returns Store info if \p MI is a store operation, "None" otherwise.
261   Optional<SIMemOpInfo> getStoreInfo(
262       const MachineBasicBlock::iterator &MI) const;
263 
264   /// \returns Atomic fence info if \p MI is an atomic fence operation,
265   /// "None" otherwise.
266   Optional<SIMemOpInfo> getAtomicFenceInfo(
267       const MachineBasicBlock::iterator &MI) const;
268 
269   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
270   /// rmw operation, "None" otherwise.
271   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
272       const MachineBasicBlock::iterator &MI) const;
273 };
274 
275 class SICacheControl {
276 protected:
277 
278   /// AMDGPU subtarget info.
279   const GCNSubtarget &ST;
280 
281   /// Instruction info.
282   const SIInstrInfo *TII = nullptr;
283 
284   IsaVersion IV;
285 
286   /// Whether to insert cache invalidating instructions.
287   bool InsertCacheInv;
288 
289   SICacheControl(const GCNSubtarget &ST);
290 
291 public:
292 
293   /// Create a cache control for the subtarget \p ST.
294   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
295 
296   /// Update \p MI memory load instruction to bypass any caches up to
297   /// the \p Scope memory scope for address spaces \p
298   /// AddrSpace. Return true iff the instruction was modified.
299   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
300                                      SIAtomicScope Scope,
301                                      SIAtomicAddrSpace AddrSpace) const = 0;
302 
303   /// Update \p MI memory store instruction to bypass any caches up to
304   /// the \p Scope memory scope for address spaces \p
305   /// AddrSpace. Return true iff the instruction was modified.
306   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
307                                       SIAtomicScope Scope,
308                                       SIAtomicAddrSpace AddrSpace) const = 0;
309 
310   /// Update \p MI memory read-modify-write instruction to bypass any caches up
311   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
312   /// iff the instruction was modified.
313   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
314                                     SIAtomicScope Scope,
315                                     SIAtomicAddrSpace AddrSpace) const = 0;
316 
317   /// Update \p MI memory instruction of kind \p Op associated with address
318   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
319   /// true iff the instruction was modified.
320   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
321                                               SIAtomicAddrSpace AddrSpace,
322                                               SIMemOp Op, bool IsVolatile,
323                                               bool IsNonTemporal) const = 0;
324 
325   /// Inserts any necessary instructions at position \p Pos relative
326   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
327   /// \p Op associated with address spaces \p AddrSpace have completed. Used
328   /// between memory instructions to enforce the order they become visible as
329   /// observed by other memory instructions executing in memory scope \p Scope.
330   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
331   /// address spaces. Returns true iff any instructions inserted.
332   virtual bool insertWait(MachineBasicBlock::iterator &MI,
333                           SIAtomicScope Scope,
334                           SIAtomicAddrSpace AddrSpace,
335                           SIMemOp Op,
336                           bool IsCrossAddrSpaceOrdering,
337                           Position Pos) const = 0;
338 
339   /// Inserts any necessary instructions at position \p Pos relative to
340   /// instruction \p MI to ensure any subsequent memory instructions of this
341   /// thread with address spaces \p AddrSpace will observe the previous memory
342   /// operations by any thread for memory scopes up to memory scope \p Scope.
343   /// Returns true iff any instructions inserted.
344   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
345                              SIAtomicScope Scope,
346                              SIAtomicAddrSpace AddrSpace,
347                              Position Pos) const = 0;
348 
349   /// Inserts any necessary instructions at position \p Pos relative to
350   /// instruction \p MI to ensure previous memory instructions by this thread
351   /// with address spaces \p AddrSpace have completed and can be observed by
352   /// subsequent memory instructions by any thread executing in memory scope \p
353   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
354   /// between address spaces. Returns true iff any instructions inserted.
355   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
356                              SIAtomicScope Scope,
357                              SIAtomicAddrSpace AddrSpace,
358                              bool IsCrossAddrSpaceOrdering,
359                              Position Pos) const = 0;
360 
361   /// Virtual destructor to allow derivations to be deleted.
362   virtual ~SICacheControl() = default;
363 
364 };
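// Illustrative only: SIMemoryLegalizer::expandLoad and friends (declared
// below and defined later in this file) drive the hooks above. A plausible
// sketch of the calls made for an agent-scope acquire load is:
//
//   CC->enableLoadCacheBypass(MI, SIAtomicScope::AGENT, AddrSpace);
//   CC->insertWait(MI, SIAtomicScope::AGENT, AddrSpace, SIMemOp::LOAD,
//                  IsCrossAddrSpaceOrdering, Position::AFTER);
//   CC->insertAcquire(MI, SIAtomicScope::AGENT, AddrSpace, Position::AFTER);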
365 
366 class SIGfx6CacheControl : public SICacheControl {
367 protected:
368 
369   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
370   /// is modified, false otherwise.
371   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
372     return enableNamedBit<AMDGPU::OpName::glc>(MI);
373   }
374 
375   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
376   /// is modified, false otherwise.
377   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
378     return enableNamedBit<AMDGPU::OpName::slc>(MI);
379   }
380 
381 public:
382 
383   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
384 
385   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
386                              SIAtomicScope Scope,
387                              SIAtomicAddrSpace AddrSpace) const override;
388 
389   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
390                               SIAtomicScope Scope,
391                               SIAtomicAddrSpace AddrSpace) const override;
392 
393   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
394                             SIAtomicScope Scope,
395                             SIAtomicAddrSpace AddrSpace) const override;
396 
397   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
398                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
399                                       bool IsVolatile,
400                                       bool IsNonTemporal) const override;
401 
402   bool insertWait(MachineBasicBlock::iterator &MI,
403                   SIAtomicScope Scope,
404                   SIAtomicAddrSpace AddrSpace,
405                   SIMemOp Op,
406                   bool IsCrossAddrSpaceOrdering,
407                   Position Pos) const override;
408 
409   bool insertAcquire(MachineBasicBlock::iterator &MI,
410                      SIAtomicScope Scope,
411                      SIAtomicAddrSpace AddrSpace,
412                      Position Pos) const override;
413 
414   bool insertRelease(MachineBasicBlock::iterator &MI,
415                      SIAtomicScope Scope,
416                      SIAtomicAddrSpace AddrSpace,
417                      bool IsCrossAddrSpaceOrdering,
418                      Position Pos) const override;
419 };
420 
421 class SIGfx7CacheControl : public SIGfx6CacheControl {
422 public:
423 
424   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
425 
426   bool insertAcquire(MachineBasicBlock::iterator &MI,
427                      SIAtomicScope Scope,
428                      SIAtomicAddrSpace AddrSpace,
429                      Position Pos) const override;
430 
431 };
432 
433 class SIGfx90ACacheControl : public SIGfx7CacheControl {
434 protected:
435 
436   /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI
437   /// is modified, false otherwise.
438   bool enableSCCBit(const MachineBasicBlock::iterator &MI) const {
439     return enableNamedBit<AMDGPU::OpName::sccb>(MI);
440   }
441 
442 public:
443 
444   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
445 
446   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
447                              SIAtomicScope Scope,
448                              SIAtomicAddrSpace AddrSpace) const override;
449 
450   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
451                               SIAtomicScope Scope,
452                               SIAtomicAddrSpace AddrSpace) const override;
453 
454   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
455                             SIAtomicScope Scope,
456                             SIAtomicAddrSpace AddrSpace) const override;
457 
458   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
459                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
460                                       bool IsVolatile,
461                                       bool IsNonTemporal) const override;
462 
463   bool insertWait(MachineBasicBlock::iterator &MI,
464                   SIAtomicScope Scope,
465                   SIAtomicAddrSpace AddrSpace,
466                   SIMemOp Op,
467                   bool IsCrossAddrSpaceOrdering,
468                   Position Pos) const override;
469 
470   bool insertAcquire(MachineBasicBlock::iterator &MI,
471                      SIAtomicScope Scope,
472                      SIAtomicAddrSpace AddrSpace,
473                      Position Pos) const override;
474 
475   bool insertRelease(MachineBasicBlock::iterator &MI,
476                      SIAtomicScope Scope,
477                      SIAtomicAddrSpace AddrSpace,
478                      bool IsCrossAddrSpaceOrdering,
479                      Position Pos) const override;
480 };
481 
482 class SIGfx10CacheControl : public SIGfx7CacheControl {
483 protected:
484 
485   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
486   /// is modified, false otherwise.
487   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
488     return enableNamedBit<AMDGPU::OpName::dlc>(MI);
489   }
490 
491 public:
492 
493   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
494 
495   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
496                              SIAtomicScope Scope,
497                              SIAtomicAddrSpace AddrSpace) const override;
498 
499   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
500                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
501                                       bool IsVolatile,
502                                       bool IsNonTemporal) const override;
503 
504   bool insertWait(MachineBasicBlock::iterator &MI,
505                   SIAtomicScope Scope,
506                   SIAtomicAddrSpace AddrSpace,
507                   SIMemOp Op,
508                   bool IsCrossAddrSpaceOrdering,
509                   Position Pos) const override;
510 
511   bool insertAcquire(MachineBasicBlock::iterator &MI,
512                      SIAtomicScope Scope,
513                      SIAtomicAddrSpace AddrSpace,
514                      Position Pos) const override;
515 };
516 
517 class SIMemoryLegalizer final : public MachineFunctionPass {
518 private:
519 
520   /// Cache Control.
521   std::unique_ptr<SICacheControl> CC = nullptr;
522 
523   /// List of atomic pseudo instructions.
524   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
525 
526   /// Return true iff instruction \p MI is an atomic instruction that
527   /// returns a result.
528   bool isAtomicRet(const MachineInstr &MI) const {
529     return SIInstrInfo::isAtomicRet(MI);
530   }
531 
532   /// Removes all processed atomic pseudo instructions from the current
533   /// function. Returns true if current function is modified, false otherwise.
534   bool removeAtomicPseudoMIs();
535 
536   /// Expands load operation \p MI. Returns true if instructions are
537   /// added/deleted or \p MI is modified, false otherwise.
538   bool expandLoad(const SIMemOpInfo &MOI,
539                   MachineBasicBlock::iterator &MI);
540   /// Expands store operation \p MI. Returns true if instructions are
541   /// added/deleted or \p MI is modified, false otherwise.
542   bool expandStore(const SIMemOpInfo &MOI,
543                    MachineBasicBlock::iterator &MI);
544   /// Expands atomic fence operation \p MI. Returns true if
545   /// instructions are added/deleted or \p MI is modified, false otherwise.
546   bool expandAtomicFence(const SIMemOpInfo &MOI,
547                          MachineBasicBlock::iterator &MI);
548   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
549   /// instructions are added/deleted or \p MI is modified, false otherwise.
550   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
551                                 MachineBasicBlock::iterator &MI);
552 
553 public:
554   static char ID;
555 
556   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
557 
558   void getAnalysisUsage(AnalysisUsage &AU) const override {
559     AU.setPreservesCFG();
560     MachineFunctionPass::getAnalysisUsage(AU);
561   }
562 
563   StringRef getPassName() const override {
564     return PASS_NAME;
565   }
566 
567   bool runOnMachineFunction(MachineFunction &MF) override;
568 };
569 
570 } // end anonymous namespace
571 
572 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
573                                       const char *Msg) const {
574   const Function &Func = MI->getParent()->getParent()->getFunction();
575   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
576   Func.getContext().diagnose(Diag);
577 }
578 
579 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
580 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
581                                SIAtomicAddrSpace InstrAddrSpace) const {
582   if (SSID == SyncScope::System)
583     return std::make_tuple(SIAtomicScope::SYSTEM,
584                            SIAtomicAddrSpace::ATOMIC,
585                            true);
586   if (SSID == MMI->getAgentSSID())
587     return std::make_tuple(SIAtomicScope::AGENT,
588                            SIAtomicAddrSpace::ATOMIC,
589                            true);
590   if (SSID == MMI->getWorkgroupSSID())
591     return std::make_tuple(SIAtomicScope::WORKGROUP,
592                            SIAtomicAddrSpace::ATOMIC,
593                            true);
594   if (SSID == MMI->getWavefrontSSID())
595     return std::make_tuple(SIAtomicScope::WAVEFRONT,
596                            SIAtomicAddrSpace::ATOMIC,
597                            true);
598   if (SSID == SyncScope::SingleThread)
599     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
600                            SIAtomicAddrSpace::ATOMIC,
601                            true);
602   if (SSID == MMI->getSystemOneAddressSpaceSSID())
603     return std::make_tuple(SIAtomicScope::SYSTEM,
604                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
605                            false);
606   if (SSID == MMI->getAgentOneAddressSpaceSSID())
607     return std::make_tuple(SIAtomicScope::AGENT,
608                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
609                            false);
610   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
611     return std::make_tuple(SIAtomicScope::WORKGROUP,
612                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
613                            false);
614   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
615     return std::make_tuple(SIAtomicScope::WAVEFRONT,
616                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
617                            false);
618   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
619     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
620                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
621                            false);
622   return None;
623 }
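// Illustrative examples of the mapping above: the "agent" synchronization
// scope yields (AGENT, ATOMIC, true), while the one-address-space variant
// "agent-one-as" yields (AGENT, ATOMIC & InstrAddrSpace, false), i.e. the
// ordering is restricted to the address spaces the instruction actually
// accesses and no cross-address-space ordering is requested.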
624 
625 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
626   if (AS == AMDGPUAS::FLAT_ADDRESS)
627     return SIAtomicAddrSpace::FLAT;
628   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
629     return SIAtomicAddrSpace::GLOBAL;
630   if (AS == AMDGPUAS::LOCAL_ADDRESS)
631     return SIAtomicAddrSpace::LDS;
632   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
633     return SIAtomicAddrSpace::SCRATCH;
634   if (AS == AMDGPUAS::REGION_ADDRESS)
635     return SIAtomicAddrSpace::GDS;
636 
637   return SIAtomicAddrSpace::OTHER;
638 }
639 
640 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
641   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
642 }
643 
644 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
645     const MachineBasicBlock::iterator &MI) const {
646   assert(MI->getNumMemOperands() > 0);
647 
648   SyncScope::ID SSID = SyncScope::SingleThread;
649   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
650   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
651   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
652   bool IsNonTemporal = true;
653   bool IsVolatile = false;
654 
655   // Validator should check whether or not MMOs cover the entire set of
656   // locations accessed by the memory instruction.
657   for (const auto &MMO : MI->memoperands()) {
658     IsNonTemporal &= MMO->isNonTemporal();
659     IsVolatile |= MMO->isVolatile();
660     InstrAddrSpace |=
661       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
662     AtomicOrdering OpOrdering = MMO->getOrdering();
663     if (OpOrdering != AtomicOrdering::NotAtomic) {
664       const auto &IsSyncScopeInclusion =
665           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
666       if (!IsSyncScopeInclusion) {
667         reportUnsupported(MI,
668           "Unsupported non-inclusive atomic synchronization scope");
669         return None;
670       }
671 
672       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
673       Ordering =
674           isStrongerThan(Ordering, OpOrdering) ?
675               Ordering : MMO->getOrdering();
676       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
677              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
678       FailureOrdering =
679           isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
680               FailureOrdering : MMO->getFailureOrdering();
681     }
682   }
683 
684   SIAtomicScope Scope = SIAtomicScope::NONE;
685   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
686   bool IsCrossAddressSpaceOrdering = false;
687   if (Ordering != AtomicOrdering::NotAtomic) {
688     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
689     if (!ScopeOrNone) {
690       reportUnsupported(MI, "Unsupported atomic synchronization scope");
691       return None;
692     }
693     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
694       ScopeOrNone.getValue();
695     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
696         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
697         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
698       reportUnsupported(MI, "Unsupported atomic address space");
699       return None;
700     }
701   }
702   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
703                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
704                      IsNonTemporal);
705 }
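// Illustrative only: if \p MI carries two memory operands, one monotonic on
// the "workgroup" scope and one non-atomic, the loop above keeps the
// stronger ordering (monotonic), keeps the more inclusive synchronization
// scope, takes the union of the address spaces, and reports nontemporal
// only if every operand is nontemporal.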
706 
707 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
708     const MachineBasicBlock::iterator &MI) const {
709   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
710 
711   if (!(MI->mayLoad() && !MI->mayStore()))
712     return None;
713 
714   // Be conservative if there are no memory operands.
715   if (MI->getNumMemOperands() == 0)
716     return SIMemOpInfo();
717 
718   return constructFromMIWithMMO(MI);
719 }
720 
721 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
722     const MachineBasicBlock::iterator &MI) const {
723   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
724 
725   if (!(!MI->mayLoad() && MI->mayStore()))
726     return None;
727 
728   // Be conservative if there are no memory operands.
729   if (MI->getNumMemOperands() == 0)
730     return SIMemOpInfo();
731 
732   return constructFromMIWithMMO(MI);
733 }
734 
735 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
736     const MachineBasicBlock::iterator &MI) const {
737   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
738 
739   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
740     return None;
741 
742   AtomicOrdering Ordering =
743     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
744 
745   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
746   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
747   if (!ScopeOrNone) {
748     reportUnsupported(MI, "Unsupported atomic synchronization scope");
749     return None;
750   }
751 
752   SIAtomicScope Scope = SIAtomicScope::NONE;
753   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
754   bool IsCrossAddressSpaceOrdering = false;
755   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
756     ScopeOrNone.getValue();
757 
758   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
759       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
760     reportUnsupported(MI, "Unsupported atomic address space");
761     return None;
762   }
763 
764   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
765                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
766 }
767 
768 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
769     const MachineBasicBlock::iterator &MI) const {
770   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
771 
772   if (!(MI->mayLoad() && MI->mayStore()))
773     return None;
774 
775   // Be conservative if there are no memory operands.
776   if (MI->getNumMemOperands() == 0)
777     return SIMemOpInfo();
778 
779   return constructFromMIWithMMO(MI);
780 }
781 
782 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
783   TII = ST.getInstrInfo();
784   IV = getIsaVersion(ST.getCPU());
785   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
786 }
787 
788 /* static */
789 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
790   GCNSubtarget::Generation Generation = ST.getGeneration();
791   if (ST.hasGFX90AInsts())
792     return std::make_unique<SIGfx90ACacheControl>(ST);
793   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
794     return std::make_unique<SIGfx6CacheControl>(ST);
795   if (Generation < AMDGPUSubtarget::GFX10)
796     return std::make_unique<SIGfx7CacheControl>(ST);
797   return std::make_unique<SIGfx10CacheControl>(ST);
798 }
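// Illustrative usage (a sketch of what runOnMachineFunction does):
//
//   std::unique_ptr<SICacheControl> CC =
//       SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
//
// Note the gfx90a check is made before the generation checks because gfx90a
// reports a GFX9 generation but needs its own cache control.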
799 
800 bool SIGfx6CacheControl::enableLoadCacheBypass(
801     const MachineBasicBlock::iterator &MI,
802     SIAtomicScope Scope,
803     SIAtomicAddrSpace AddrSpace) const {
804   assert(MI->mayLoad() && !MI->mayStore());
805   bool Changed = false;
806 
807   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
808     switch (Scope) {
809     case SIAtomicScope::SYSTEM:
810     case SIAtomicScope::AGENT:
811       Changed |= enableGLCBit(MI);
812       break;
813     case SIAtomicScope::WORKGROUP:
814     case SIAtomicScope::WAVEFRONT:
815     case SIAtomicScope::SINGLETHREAD:
816       // No cache to bypass.
817       break;
818     default:
819       llvm_unreachable("Unsupported synchronization scope");
820     }
821   }
822 
823   /// The scratch address space does not need the global memory caches
824   /// to be bypassed as all memory operations by the same thread are
825   /// sequentially consistent, and no other thread can access scratch
826   /// memory.
827 
828   /// Other address spaces do not have a cache.
829 
830   return Changed;
831 }
832 
833 bool SIGfx6CacheControl::enableStoreCacheBypass(
834     const MachineBasicBlock::iterator &MI,
835     SIAtomicScope Scope,
836     SIAtomicAddrSpace AddrSpace) const {
837   assert(!MI->mayLoad() && MI->mayStore());
838   bool Changed = false;
839 
840   /// The L1 cache is write-through so does not need to be bypassed. There is
841   /// no bypass control for the L2 cache at the ISA level.
842 
843   return Changed;
844 }
845 
846 bool SIGfx6CacheControl::enableRMWCacheBypass(
847     const MachineBasicBlock::iterator &MI,
848     SIAtomicScope Scope,
849     SIAtomicAddrSpace AddrSpace) const {
850   assert(MI->mayLoad() && MI->mayStore());
851   bool Changed = false;
852 
853   /// The L1 cache is write-through so does not need to be bypassed. There is
854   /// no bypass control for the L2 cache at the ISA level.
855 
856   return Changed;
857 }
858 
859 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
860     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
861     bool IsVolatile, bool IsNonTemporal) const {
862   // Only handle load and store, not atomic read-modify-write instructions.
863   // The latter use glc to indicate if the atomic returns a result, so glc
864   // must not be used for cache control.
865   assert(MI->mayLoad() ^ MI->mayStore());
866 
867   // Only update load and store, not LLVM IR atomic read-modify-write
868   // instructions. The latter are always marked as volatile, which cannot be
869   // honored here without pessimizing all atomics. They also do not support
870   // the nontemporal attribute.
871   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
872 
873   bool Changed = false;
874 
875   if (IsVolatile) {
876     if (Op == SIMemOp::LOAD)
877       Changed |= enableGLCBit(MI);
878 
879     // Ensure operation has completed at system scope to cause all volatile
880     // operations to be visible outside the program in a global order. Do not
881     // request cross address space as only the global address space can be
882     // observable outside the program, so no need to cause a waitcnt for LDS
883     // address space operations.
884     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
885                           Position::AFTER);
886 
887     return Changed;
888   }
889 
890   if (IsNonTemporal) {
891     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
892     Changed |= enableGLCBit(MI);
893     Changed |= enableSLCBit(MI);
894     return Changed;
895   }
896 
897   return Changed;
898 }
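// Illustrative only: on these targets a volatile global load therefore ends
// up as something like
//
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
//
// while a nontemporal access just gets both glc and slc set, with no wait.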
899 
900 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
901                                     SIAtomicScope Scope,
902                                     SIAtomicAddrSpace AddrSpace,
903                                     SIMemOp Op,
904                                     bool IsCrossAddrSpaceOrdering,
905                                     Position Pos) const {
906   bool Changed = false;
907 
908   MachineBasicBlock &MBB = *MI->getParent();
909   DebugLoc DL = MI->getDebugLoc();
910 
911   if (Pos == Position::AFTER)
912     ++MI;
913 
914   bool VMCnt = false;
915   bool LGKMCnt = false;
916 
917   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
918       SIAtomicAddrSpace::NONE) {
919     switch (Scope) {
920     case SIAtomicScope::SYSTEM:
921     case SIAtomicScope::AGENT:
922       VMCnt |= true;
923       break;
924     case SIAtomicScope::WORKGROUP:
925     case SIAtomicScope::WAVEFRONT:
926     case SIAtomicScope::SINGLETHREAD:
927       // The L1 cache keeps all memory operations in order for
928       // wavefronts in the same work-group.
929       break;
930     default:
931       llvm_unreachable("Unsupported synchronization scope");
932     }
933   }
934 
935   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
936     switch (Scope) {
937     case SIAtomicScope::SYSTEM:
938     case SIAtomicScope::AGENT:
939     case SIAtomicScope::WORKGROUP:
940       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
941       // not needed as LDS operations for all waves are executed in a total
942       // global ordering as observed by all waves. Required if also
943       // synchronizing with global/GDS memory as LDS operations could be
944       // reordered with respect to later global/GDS memory operations of the
945       // same wave.
946       LGKMCnt |= IsCrossAddrSpaceOrdering;
947       break;
948     case SIAtomicScope::WAVEFRONT:
949     case SIAtomicScope::SINGLETHREAD:
950       // The LDS keeps all memory operations in order for
951       // the same wavefront.
952       break;
953     default:
954       llvm_unreachable("Unsupported synchronization scope");
955     }
956   }
957 
958   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
959     switch (Scope) {
960     case SIAtomicScope::SYSTEM:
961     case SIAtomicScope::AGENT:
962       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
963       // is not needed as GDS operations for all waves are executed in a total
964       // global ordering as observed by all waves. Required if also
965       // synchronizing with global/LDS memory as GDS operations could be
966       // reordered with respect to later global/LDS memory operations of the
967       // same wave.
968       LGKMCnt |= IsCrossAddrSpaceOrdering;
969       break;
970     case SIAtomicScope::WORKGROUP:
971     case SIAtomicScope::WAVEFRONT:
972     case SIAtomicScope::SINGLETHREAD:
973       // The GDS keeps all memory operations in order for
974       // the same work-group.
975       break;
976     default:
977       llvm_unreachable("Unsupported synchronization scope");
978     }
979   }
980 
981   if (VMCnt || LGKMCnt) {
982     unsigned WaitCntImmediate =
983       AMDGPU::encodeWaitcnt(IV,
984                             VMCnt ? 0 : getVmcntBitMask(IV),
985                             getExpcntBitMask(IV),
986                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
987     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
988     Changed = true;
989   }
990 
991   if (Pos == Position::AFTER)
992     --MI;
993 
994   return Changed;
995 }
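// Illustrative only: for an agent-scope ordering over the GLOBAL and LDS
// address spaces with cross-address-space ordering, the code above emits a
// single
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// (expcnt is left at its maximum, i.e. not waited on), placed before or
// after \p MI depending on \p Pos.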
996 
997 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
998                                        SIAtomicScope Scope,
999                                        SIAtomicAddrSpace AddrSpace,
1000                                        Position Pos) const {
1001   if (!InsertCacheInv)
1002     return false;
1003 
1004   bool Changed = false;
1005 
1006   MachineBasicBlock &MBB = *MI->getParent();
1007   DebugLoc DL = MI->getDebugLoc();
1008 
1009   if (Pos == Position::AFTER)
1010     ++MI;
1011 
1012   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1013     switch (Scope) {
1014     case SIAtomicScope::SYSTEM:
1015     case SIAtomicScope::AGENT:
1016       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1017       Changed = true;
1018       break;
1019     case SIAtomicScope::WORKGROUP:
1020     case SIAtomicScope::WAVEFRONT:
1021     case SIAtomicScope::SINGLETHREAD:
1022       // No cache to invalidate.
1023       break;
1024     default:
1025       llvm_unreachable("Unsupported synchronization scope");
1026     }
1027   }
1028 
1029   /// The scratch address space does not need the global memory cache
1030   /// to be flushed as all memory operations by the same thread are
1031   /// sequentially consistent, and no other thread can access scratch
1032   /// memory.
1033 
1034   /// Other address spaces do not have a cache.
1035 
1036   if (Pos == Position::AFTER)
1037     --MI;
1038 
1039   return Changed;
1040 }
1041 
1042 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1043                                        SIAtomicScope Scope,
1044                                        SIAtomicAddrSpace AddrSpace,
1045                                        bool IsCrossAddrSpaceOrdering,
1046                                        Position Pos) const {
1047   return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1048                     IsCrossAddrSpaceOrdering, Pos);
1049 }
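// Note: on these targets a release reduces to waiting for prior accesses to
// complete; there is no cache writeback to issue at the ISA level. Contrast
// with SIGfx90ACacheControl::insertRelease below, which also emits a
// BUFFER_WBL2 at system scope.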
1050 
1051 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1052                                        SIAtomicScope Scope,
1053                                        SIAtomicAddrSpace AddrSpace,
1054                                        Position Pos) const {
1055   if (!InsertCacheInv)
1056     return false;
1057 
1058   bool Changed = false;
1059 
1060   MachineBasicBlock &MBB = *MI->getParent();
1061   DebugLoc DL = MI->getDebugLoc();
1062 
1063   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1064 
1065   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1066                                     ? AMDGPU::BUFFER_WBINVL1
1067                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1068 
1069   if (Pos == Position::AFTER)
1070     ++MI;
1071 
1072   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1073     switch (Scope) {
1074     case SIAtomicScope::SYSTEM:
1075     case SIAtomicScope::AGENT:
1076       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1077       Changed = true;
1078       break;
1079     case SIAtomicScope::WORKGROUP:
1080     case SIAtomicScope::WAVEFRONT:
1081     case SIAtomicScope::SINGLETHREAD:
1082       // No cache to invalidate.
1083       break;
1084     default:
1085       llvm_unreachable("Unsupported synchronization scope");
1086     }
1087   }
1088 
1089   /// The scratch address space does not need the global memory cache
1090   /// to be flushed as all memory operations by the same thread are
1091   /// sequentially consistent, and no other thread can access scratch
1092   /// memory.
1093 
1094   /// Other address spaces do not have a cache.
1095 
1096   if (Pos == Position::AFTER)
1097     --MI;
1098 
1099   return Changed;
1100 }
1101 
1102 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1103     const MachineBasicBlock::iterator &MI,
1104     SIAtomicScope Scope,
1105     SIAtomicAddrSpace AddrSpace) const {
1106   assert(MI->mayLoad() && !MI->mayStore());
1107   bool Changed = false;
1108 
1109   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1110     switch (Scope) {
1111     case SIAtomicScope::SYSTEM:
1112       Changed |= enableSCCBit(MI);
1113       Changed |= enableGLCBit(MI);
1114       break;
1115     case SIAtomicScope::AGENT:
1116       Changed |= enableGLCBit(MI);
1117       break;
1118     case SIAtomicScope::WORKGROUP:
1119       // In threadgroup split mode the waves of a work-group can be executing on
1120       // different CUs. Therefore need to bypass the L1 which is per CU.
1121       // Otherwise in non-threadgroup split mode all waves of a work-group are
1122       // on the same CU, and so the L1 does not need to be bypassed.
1123       if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
1124       break;
1125     case SIAtomicScope::WAVEFRONT:
1126     case SIAtomicScope::SINGLETHREAD:
1127       // No cache to bypass.
1128       break;
1129     default:
1130       llvm_unreachable("Unsupported synchronization scope");
1131     }
1132   }
1133 
1134   /// The scratch address space does not need the global memory caches
1135   /// to be bypassed as all memory operations by the same thread are
1136   /// sequentially consistent, and no other thread can access scratch
1137   /// memory.
1138 
1139   /// Other address spaces do not have a cache.
1140 
1141   return Changed;
1142 }
1143 
1144 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1145     const MachineBasicBlock::iterator &MI,
1146     SIAtomicScope Scope,
1147     SIAtomicAddrSpace AddrSpace) const {
1148   assert(!MI->mayLoad() && MI->mayStore());
1149   bool Changed = false;
1150 
1151   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1152     switch (Scope) {
1153     case SIAtomicScope::SYSTEM:
1154       Changed |= enableSCCBit(MI);
1155       LLVM_FALLTHROUGH;
1156     case SIAtomicScope::AGENT:
1157       /// Do not set glc for store atomic operations as they implicitly write
1158       /// through the L1 cache.
1159       break;
1160     case SIAtomicScope::WORKGROUP:
1161     case SIAtomicScope::WAVEFRONT:
1162     case SIAtomicScope::SINGLETHREAD:
1163       // No cache to bypass. Store atomics implicitly write through the L1
1164       // cache.
1165       break;
1166     default:
1167       llvm_unreachable("Unsupported synchronization scope");
1168     }
1169   }
1170 
1171   /// The scratch address space does not need the global memory caches
1172   /// to be bypassed as all memory operations by the same thread are
1173   /// sequentially consistent, and no other thread can access scratch
1174   /// memory.
1175 
1176   /// Other address spaces do not have a cache.
1177 
1178   return Changed;
1179 }
1180 
1181 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1182     const MachineBasicBlock::iterator &MI,
1183     SIAtomicScope Scope,
1184     SIAtomicAddrSpace AddrSpace) const {
1185   assert(MI->mayLoad() && MI->mayStore());
1186   bool Changed = false;
1187 
1188   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1189     switch (Scope) {
1190     case SIAtomicScope::SYSTEM:
1191       Changed |= enableSCCBit(MI);
1192       LLVM_FALLTHROUGH;
1193     case SIAtomicScope::AGENT:
1194       /// Do not set glc for RMW atomic operations as they implicitly bypass
1195       /// the L1 cache, and the glc bit is instead used to indicate if they are
1196       /// return or no-return.
1197       break;
1198     case SIAtomicScope::WORKGROUP:
1199     case SIAtomicScope::WAVEFRONT:
1200     case SIAtomicScope::SINGLETHREAD:
1201       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1202       break;
1203     default:
1204       llvm_unreachable("Unsupported synchronization scope");
1205     }
1206   }
1207 
1208   return Changed;
1209 }
1210 
1211 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1212     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1213     bool IsVolatile, bool IsNonTemporal) const {
1214   // Only handle load and store, not atomic read-modify-write instructions.
1215   // The latter use glc to indicate if the atomic returns a result, so glc
1216   // must not be used for cache control.
1217   assert(MI->mayLoad() ^ MI->mayStore());
1218 
1219   // Only update load and store, not LLVM IR atomic read-modify-write
1220   // instructions. The latter are always marked as volatile, which cannot be
1221   // honored here without pessimizing all atomics. They also do not support
1222   // the nontemporal attribute.
1223   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1224 
1225   bool Changed = false;
1226 
1227   if (IsVolatile) {
1228     if (Op == SIMemOp::LOAD) {
1229       Changed |= enableGLCBit(MI);
1230     }
1231     Changed |= enableSCCBit(MI);
1232 
1233     // Ensure operation has completed at system scope to cause all volatile
1234     // operations to be visible outside the program in a global order. Do not
1235     // request cross address space as only the global address space can be
1236     // observable outside the program, so no need to cause a waitcnt for LDS
1237     // address space operations.
1238     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1239                           Position::AFTER);
1240 
1241     return Changed;
1242   }
1243 
1244   if (IsNonTemporal) {
1245     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
1246     Changed |= enableGLCBit(MI);
1247     Changed |= enableSLCBit(MI);
1248     return Changed;
1249   }
1250 
1251   return Changed;
1252 }
1253 
1254 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1255                                       SIAtomicScope Scope,
1256                                       SIAtomicAddrSpace AddrSpace,
1257                                       SIMemOp Op,
1258                                       bool IsCrossAddrSpaceOrdering,
1259                                       Position Pos) const {
1260   if (ST.isTgSplitEnabled()) {
1261     // In threadgroup split mode the waves of a work-group can be executing on
1262     // different CUs. Therefore need to wait for global or GDS memory operations
1263     // to complete to ensure they are visible to waves in the other CUs.
1264     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1265     // the same CU, so no need to wait for global memory as all waves in the
1266     // work-group access the same L1, nor wait for GDS as accesses are ordered
1267     // on a CU.
1268     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1269                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1270         (Scope == SIAtomicScope::WORKGROUP)) {
1271       // Same as GFX7 using agent scope.
1272       Scope = SIAtomicScope::AGENT;
1273     }
1274     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1275     // LDS memory operations.
1276     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1277   }
1278   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1279                                         IsCrossAddrSpaceOrdering, Pos);
1280 }
1281 
1282 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1283                                          SIAtomicScope Scope,
1284                                          SIAtomicAddrSpace AddrSpace,
1285                                          Position Pos) const {
1286   if (!InsertCacheInv)
1287     return false;
1288 
1289   bool Changed = false;
1290 
1291   MachineBasicBlock &MBB = *MI->getParent();
1292   DebugLoc DL = MI->getDebugLoc();
1293 
1294   if (Pos == Position::AFTER)
1295     ++MI;
1296 
1297   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1298     switch (Scope) {
1299     case SIAtomicScope::SYSTEM:
1300       // Ensures that following loads will not see stale remote VMEM data or
1301       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1302       // CC will never be stale due to the local memory probes.
1303       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1304       // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
1305       // hardware does not reorder memory operations by the same wave with
1306       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1307       // remove any cache lines of earlier writes by the same wave and ensures
1308       // later reads by the same wave will refetch the cache lines.
1309       Changed = true;
1310       break;
1311     case SIAtomicScope::AGENT:
1312       // Same as GFX7.
1313       break;
1314     case SIAtomicScope::WORKGROUP:
1315       // In threadgroup split mode the waves of a work-group can be executing on
1316       // different CUs. Therefore need to invalidate the L1 which is per CU.
1317       // Otherwise in non-threadgroup split mode all waves of a work-group are
1318       // on the same CU, and so the L1 does not need to be invalidated.
1319       if (ST.isTgSplitEnabled()) {
1320         // Same as GFX7 using agent scope.
1321         Scope = SIAtomicScope::AGENT;
1322       }
1323       break;
1324     case SIAtomicScope::WAVEFRONT:
1325     case SIAtomicScope::SINGLETHREAD:
1326       // Same as GFX7.
1327       break;
1328     default:
1329       llvm_unreachable("Unsupported synchronization scope");
1330     }
1331   }
1332 
1333   /// The scratch address space does not need the global memory cache
1334   /// to be flushed as all memory operations by the same thread are
1335   /// sequentially consistent, and no other thread can access scratch
1336   /// memory.
1337 
1338   /// Other address spaces do not have a cache.
1339 
1340   if (Pos == Position::AFTER)
1341     --MI;
1342 
1343   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1344 
1345   return Changed;
1346 }
1347 
1348 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1349                                          SIAtomicScope Scope,
1350                                          SIAtomicAddrSpace AddrSpace,
1351                                          bool IsCrossAddrSpaceOrdering,
1352                                          Position Pos) const {
1353   bool Changed = false;
1354 
1355   MachineBasicBlock &MBB = *MI->getParent();
1356   DebugLoc DL = MI->getDebugLoc();
1357 
1358   if (Pos == Position::AFTER)
1359     ++MI;
1360 
1361   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1362     switch (Scope) {
1363     case SIAtomicScope::SYSTEM:
1364       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1365       // hardware does not reorder memory operations by the same wave with
1366       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1367       // to initiate writeback of any dirty cache lines of earlier writes by the
1368       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1369       // writeback has completed.
1370       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
1371       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1372       // vmcnt(0)" needed by the "BUFFER_WBL2".
1373       Changed = true;
1374       break;
1375     case SIAtomicScope::AGENT:
1376     case SIAtomicScope::WORKGROUP:
1377     case SIAtomicScope::WAVEFRONT:
1378     case SIAtomicScope::SINGLETHREAD:
1379       // Same as GFX7.
1380       break;
1381     default:
1382       llvm_unreachable("Unsupported synchronization scope");
1383     }
1384   }
1385 
1386   if (Pos == Position::AFTER)
1387     --MI;
1388 
1389   Changed |=
1390       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1391                                         IsCrossAddrSpaceOrdering, Pos);
1392 
1393   return Changed;
1394 }
1395 
1396 bool SIGfx10CacheControl::enableLoadCacheBypass(
1397     const MachineBasicBlock::iterator &MI,
1398     SIAtomicScope Scope,
1399     SIAtomicAddrSpace AddrSpace) const {
1400   assert(MI->mayLoad() && !MI->mayStore());
1401   bool Changed = false;
1402 
1403   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.
1406 
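    // On GFX10 the glc bit makes a load bypass the per-CU L0 cache, and the
    // dlc bit additionally bypasses the L1 cache, so both are set when the
    // load must be coherent at agent or system scope.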
1407     switch (Scope) {
1408     case SIAtomicScope::SYSTEM:
1409     case SIAtomicScope::AGENT:
1410       Changed |= enableGLCBit(MI);
1411       Changed |= enableDLCBit(MI);
1412       break;
1413     case SIAtomicScope::WORKGROUP:
1414       // In WGP mode the waves of a work-group can be executing on either CU of
1415       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1416       // CU mode all waves of a work-group are on the same CU, and so the L0
1417       // does not need to be bypassed.
1418       if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
1419       break;
1420     case SIAtomicScope::WAVEFRONT:
1421     case SIAtomicScope::SINGLETHREAD:
1422       // No cache to bypass.
1423       break;
1424     default:
1425       llvm_unreachable("Unsupported synchronization scope");
1426     }
1427   }
1428 
1429   /// The scratch address space does not need the global memory caches
1430   /// to be bypassed as all memory operations by the same thread are
1431   /// sequentially consistent, and no other thread can access scratch
1432   /// memory.
1433 
1434   /// Other address spaces do not have a cache.
1435 
1436   return Changed;
1437 }
1438 
1439 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1440     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1441     bool IsVolatile, bool IsNonTemporal) const {
1442 
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1453 
1454   bool Changed = false;
1455 
1456   if (IsVolatile) {
1457 
1458     if (Op == SIMemOp::LOAD) {
1459       Changed |= enableGLCBit(MI);
1460       Changed |= enableDLCBit(MI);
1461     }
1462 
1463     // Ensure operation has completed at system scope to cause all volatile
1464     // operations to be visible outside the program in a global order. Do not
1465     // request cross address space as only the global address space can be
1466     // observable outside the program, so no need to cause a waitcnt for LDS
1467     // address space operations.
1468     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1469                           Position::AFTER);
1470     return Changed;
1471   }
1472 
1473   if (IsNonTemporal) {
1474     // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1475     Changed |= enableSLCBit(MI);
1476     return Changed;
1477   }
1478 
1479   return Changed;
1480 }
1481 
1482 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1483                                      SIAtomicScope Scope,
1484                                      SIAtomicAddrSpace AddrSpace,
1485                                      SIMemOp Op,
1486                                      bool IsCrossAddrSpaceOrdering,
1487                                      Position Pos) const {
1488   bool Changed = false;
1489 
1490   MachineBasicBlock &MBB = *MI->getParent();
1491   DebugLoc DL = MI->getDebugLoc();
1492 
1493   if (Pos == Position::AFTER)
1494     ++MI;
1495 
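  // Track which hardware wait counters must be driven to zero: vmcnt for
  // vector memory loads, vscnt for vector memory stores (a separate counter
  // on GFX10+), and lgkmcnt for the LDS/GDS operations handled below.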
1496   bool VMCnt = false;
1497   bool VSCnt = false;
1498   bool LGKMCnt = false;
1499 
1500   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1501       SIAtomicAddrSpace::NONE) {
1502     switch (Scope) {
1503     case SIAtomicScope::SYSTEM:
1504     case SIAtomicScope::AGENT:
1505       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1506         VMCnt |= true;
1507       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1508         VSCnt |= true;
1509       break;
1510     case SIAtomicScope::WORKGROUP:
1511       // In WGP mode the waves of a work-group can be executing on either CU of
1512       // the WGP. Therefore need to wait for operations to complete to ensure
1513       // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // which shares the same L0.
1516       if (!ST.isCuModeEnabled()) {
1517         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1518           VMCnt |= true;
1519         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1520           VSCnt |= true;
1521       }
1522       break;
1523     case SIAtomicScope::WAVEFRONT:
1524     case SIAtomicScope::SINGLETHREAD:
1525       // The L0 cache keeps all memory operations in order for
1526       // work-items in the same wavefront.
1527       break;
1528     default:
1529       llvm_unreachable("Unsupported synchronization scope");
1530     }
1531   }
1532 
1533   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1534     switch (Scope) {
1535     case SIAtomicScope::SYSTEM:
1536     case SIAtomicScope::AGENT:
1537     case SIAtomicScope::WORKGROUP:
1538       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1539       // not needed as LDS operations for all waves are executed in a total
1540       // global ordering as observed by all waves. Required if also
1541       // synchronizing with global/GDS memory as LDS operations could be
1542       // reordered with respect to later global/GDS memory operations of the
1543       // same wave.
1544       LGKMCnt |= IsCrossAddrSpaceOrdering;
1545       break;
1546     case SIAtomicScope::WAVEFRONT:
1547     case SIAtomicScope::SINGLETHREAD:
1548       // The LDS keeps all memory operations in order for
      // the same wavefront.
1550       break;
1551     default:
1552       llvm_unreachable("Unsupported synchronization scope");
1553     }
1554   }
1555 
1556   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1557     switch (Scope) {
1558     case SIAtomicScope::SYSTEM:
1559     case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1561       // is not needed as GDS operations for all waves are executed in a total
1562       // global ordering as observed by all waves. Required if also
1563       // synchronizing with global/LDS memory as GDS operations could be
1564       // reordered with respect to later global/LDS memory operations of the
1565       // same wave.
1566       LGKMCnt |= IsCrossAddrSpaceOrdering;
1567       break;
1568     case SIAtomicScope::WORKGROUP:
1569     case SIAtomicScope::WAVEFRONT:
1570     case SIAtomicScope::SINGLETHREAD:
1571       // The GDS keeps all memory operations in order for
1572       // the same work-group.
1573       break;
1574     default:
1575       llvm_unreachable("Unsupported synchronization scope");
1576     }
1577   }
1578 
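  // Encode a single S_WAITCNT covering the legacy counters: a field of 0 waits
  // for that counter to drain, while counters not being waited on are encoded
  // with their maximum (no-wait) value.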
1579   if (VMCnt || LGKMCnt) {
1580     unsigned WaitCntImmediate =
1581       AMDGPU::encodeWaitcnt(IV,
1582                             VMCnt ? 0 : getVmcntBitMask(IV),
1583                             getExpcntBitMask(IV),
1584                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1585     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1586     Changed = true;
1587   }
1588 
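  // The vector store counter has no field in S_WAITCNT and is waited on with a
  // separate S_WAITCNT_VSCNT instruction, which takes an SGPR operand (null
  // here) plus an immediate threshold of zero.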
1589   if (VSCnt) {
1590     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1591       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1592       .addImm(0);
1593     Changed = true;
1594   }
1595 
1596   if (Pos == Position::AFTER)
1597     --MI;
1598 
1599   return Changed;
1600 }
1601 
1602 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1603                                         SIAtomicScope Scope,
1604                                         SIAtomicAddrSpace AddrSpace,
1605                                         Position Pos) const {
1606   if (!InsertCacheInv)
1607     return false;
1608 
1609   bool Changed = false;
1610 
1611   MachineBasicBlock &MBB = *MI->getParent();
1612   DebugLoc DL = MI->getDebugLoc();
1613 
1614   if (Pos == Position::AFTER)
1615     ++MI;
1616 
1617   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1618     switch (Scope) {
1619     case SIAtomicScope::SYSTEM:
1620     case SIAtomicScope::AGENT:
1621       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1622       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1623       Changed = true;
1624       break;
1625     case SIAtomicScope::WORKGROUP:
1626       // In WGP mode the waves of a work-group can be executing on either CU of
1627       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the
1629       // L0 does not need to be invalidated.
1630       if (!ST.isCuModeEnabled()) {
1631         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1632         Changed = true;
1633       }
1634       break;
1635     case SIAtomicScope::WAVEFRONT:
1636     case SIAtomicScope::SINGLETHREAD:
1637       // No cache to invalidate.
1638       break;
1639     default:
1640       llvm_unreachable("Unsupported synchronization scope");
1641     }
1642   }
1643 
1644   /// The scratch address space does not need the global memory cache
1645   /// to be flushed as all memory operations by the same thread are
1646   /// sequentially consistent, and no other thread can access scratch
1647   /// memory.
1648 
1649   /// Other address spaces do not have a cache.
1650 
1651   if (Pos == Position::AFTER)
1652     --MI;
1653 
1654   return Changed;
1655 }
1656 
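// Erase the ATOMIC_FENCE pseudo instructions queued during expansion; they
// have served their purpose and have no machine encoding of their own.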
1657 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1658   if (AtomicPseudoMIs.empty())
1659     return false;
1660 
1661   for (auto &MI : AtomicPseudoMIs)
1662     MI->eraseFromParent();
1663 
1664   AtomicPseudoMIs.clear();
1665   return true;
1666 }
1667 
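// Expand an atomic load according to its ordering. As an illustrative sketch
// only (the exact instructions depend on the subtarget, scope, and address
// spaces involved), a seq_cst agent-scope global load on GFX10 becomes
// roughly:
//
//   s_waitcnt vmcnt(0)               ; drain prior vector memory loads
//   s_waitcnt_vscnt null, 0x0        ; drain prior vector memory stores
//   global_load_dword v1, v0, s[0:1] glc dlc
//   s_waitcnt vmcnt(0)               ; wait for the load itself (acquire)
//   buffer_gl0_inv                   ; invalidate caches so later loads
//   buffer_gl1_inv                   ; observe other agents' writes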
1668 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1669                                    MachineBasicBlock::iterator &MI) {
1670   assert(MI->mayLoad() && !MI->mayStore());
1671 
1672   bool Changed = false;
1673 
1674   if (MOI.isAtomic()) {
1675     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1676         MOI.getOrdering() == AtomicOrdering::Acquire ||
1677         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1678       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1679                                            MOI.getOrderingAddrSpace());
1680     }
1681 
1682     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1683       Changed |= CC->insertWait(MI, MOI.getScope(),
1684                                 MOI.getOrderingAddrSpace(),
1685                                 SIMemOp::LOAD | SIMemOp::STORE,
1686                                 MOI.getIsCrossAddressSpaceOrdering(),
1687                                 Position::BEFORE);
1688 
1689     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1690         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1691       Changed |= CC->insertWait(MI, MOI.getScope(),
1692                                 MOI.getInstrAddrSpace(),
1693                                 SIMemOp::LOAD,
1694                                 MOI.getIsCrossAddressSpaceOrdering(),
1695                                 Position::AFTER);
1696       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1697                                    MOI.getOrderingAddrSpace(),
1698                                    Position::AFTER);
1699     }
1700 
1701     return Changed;
1702   }
1703 
1704   // Atomic instructions already bypass caches to the scope specified by the
1705   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1706   // need additional treatment.
1707   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1708                                                 SIMemOp::LOAD, MOI.isVolatile(),
1709                                                 MOI.isNonTemporal());
1710   return Changed;
1711 }
1712 
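// Expand an atomic store according to its ordering. As an illustrative sketch
// only, a release system-scope global store on GFX90A becomes roughly:
//
//   buffer_wbl2                      ; start writeback of dirty L2 lines
//   s_waitcnt vmcnt(0)               ; wait for prior writes and the writeback
//   global_store_dword v0, v1, s[0:1]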
1713 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1714                                     MachineBasicBlock::iterator &MI) {
1715   assert(!MI->mayLoad() && MI->mayStore());
1716 
1717   bool Changed = false;
1718 
1719   if (MOI.isAtomic()) {
1720     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1721         MOI.getOrdering() == AtomicOrdering::Release ||
1722         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1723       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1724                                             MOI.getOrderingAddrSpace());
1725     }
1726 
1727     if (MOI.getOrdering() == AtomicOrdering::Release ||
1728         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1729       Changed |= CC->insertRelease(MI, MOI.getScope(),
1730                                    MOI.getOrderingAddrSpace(),
1731                                    MOI.getIsCrossAddressSpaceOrdering(),
1732                                    Position::BEFORE);
1733 
1734     return Changed;
1735   }
1736 
1737   // Atomic instructions already bypass caches to the scope specified by the
1738   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1739   // need additional treatment.
1740   Changed |= CC->enableVolatileAndOrNonTemporal(
1741       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1742       MOI.isNonTemporal());
1743   return Changed;
1744 }
1745 
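// Expand an ATOMIC_FENCE pseudo instruction. The pseudo carries no encoding of
// its own: it is queued for removal and replaced by the release and/or acquire
// sequences that its ordering requires.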
1746 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1747                                           MachineBasicBlock::iterator &MI) {
1748   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1749 
1750   AtomicPseudoMIs.push_back(MI);
1751   bool Changed = false;
1752 
1753   if (MOI.isAtomic()) {
1754     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1755         MOI.getOrdering() == AtomicOrdering::Release ||
1756         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1757         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Support for barrier could be added in this
      /// file; SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
1765       Changed |= CC->insertRelease(MI, MOI.getScope(),
1766                                    MOI.getOrderingAddrSpace(),
1767                                    MOI.getIsCrossAddressSpaceOrdering(),
1768                                    Position::BEFORE);
1769 
    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBL2" instruction. This could be done
    // by reorganizing this code, or as part of optimizing the SIInsertWaitcnts
    // pass to track cache invalidate and write back instructions.
1774 
1775     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1776         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1777         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1778       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1779                                    MOI.getOrderingAddrSpace(),
1780                                    Position::BEFORE);
1781 
1782     return Changed;
1783   }
1784 
1785   return Changed;
1786 }
1787 
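// Expand an atomic cmpxchg or read-modify-write instruction. For cmpxchg both
// the success and the failure ordering are considered when deciding whether
// release and acquire sequences are needed, and the post-instruction wait uses
// the load counter only if the atomic returns a value, otherwise the store
// counter.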
1788 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1789   MachineBasicBlock::iterator &MI) {
1790   assert(MI->mayLoad() && MI->mayStore());
1791 
1792   bool Changed = false;
1793 
1794   if (MOI.isAtomic()) {
1795     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1796         MOI.getOrdering() == AtomicOrdering::Acquire ||
1797         MOI.getOrdering() == AtomicOrdering::Release ||
1798         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1799         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1800       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1801                                           MOI.getInstrAddrSpace());
1802     }
1803 
1804     if (MOI.getOrdering() == AtomicOrdering::Release ||
1805         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1806         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1807         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1808       Changed |= CC->insertRelease(MI, MOI.getScope(),
1809                                    MOI.getOrderingAddrSpace(),
1810                                    MOI.getIsCrossAddressSpaceOrdering(),
1811                                    Position::BEFORE);
1812 
1813     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1814         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1815         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1816         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1817         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1818       Changed |= CC->insertWait(MI, MOI.getScope(),
1819                                 MOI.getInstrAddrSpace(),
1820                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1821                                                    SIMemOp::STORE,
1822                                 MOI.getIsCrossAddressSpaceOrdering(),
1823                                 Position::AFTER);
1824       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1825                                    MOI.getOrderingAddrSpace(),
1826                                    Position::AFTER);
1827     }
1828 
1829     return Changed;
1830   }
1831 
1832   return Changed;
1833 }
1834 
1835 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1836   bool Changed = false;
1837 
1838   SIMemOpAccess MOA(MF);
1839   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1840 
1841   for (auto &MBB : MF) {
1842     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1843 
1844       // Unbundle instructions after the post-RA scheduler.
1845       if (MI->isBundle()) {
1846         MachineBasicBlock::instr_iterator II(MI->getIterator());
1847         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1848              I != E && I->isBundledWithPred(); ++I) {
1849           I->unbundleFromPred();
1850           for (MachineOperand &MO : I->operands())
1851             if (MO.isReg())
1852               MO.setIsInternalRead(false);
1853         }
1854 
1855         MI->eraseFromParent();
1856         MI = II->getIterator();
1857       }
1858 
1859       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1860         continue;
1861 
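      // Classify the instruction and expand it according to the memory model:
      // loads, stores, ATOMIC_FENCE pseudos, and atomic cmpxchg/RMW operations
      // each get their own expansion.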
1862       if (const auto &MOI = MOA.getLoadInfo(MI))
1863         Changed |= expandLoad(MOI.getValue(), MI);
1864       else if (const auto &MOI = MOA.getStoreInfo(MI))
1865         Changed |= expandStore(MOI.getValue(), MI);
1866       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1867         Changed |= expandAtomicFence(MOI.getValue(), MI);
1868       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1869         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1870     }
1871   }
1872 
1873   Changed |= removeAtomicPseudoMIs();
1874   return Changed;
1875 }
1876 
1877 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1878 
1879 char SIMemoryLegalizer::ID = 0;
1880 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1881 
1882 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1883   return new SIMemoryLegalizer();
1884 }
1885