//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
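
// Illustrative example (not in the original source): these flags compose as a
// bitmask, e.g. a release must order both kinds of access and so passes
// SIMemOp::LOAD | SIMemOp::STORE, as SIGfx6CacheControl::insertRelease()
// does below.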

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
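
// Illustrative example (not in the original source): the aggregate values
// above are plain bitwise compositions, so masking recovers subsets, e.g.
//   (SIAtomicAddrSpace::ALL & ~SIAtomicAddrSpace::OTHER) ==
//       SIAtomicAddrSpace::ATOMIC
// and toSIAtomicScope() below uses "ATOMIC & InstrAddrSpace" to restrict a
// scope to the address spaces an instruction actually accesses.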
86 
87 class SIMemOpInfo final {
88 private:
89 
90   friend class SIMemOpAccess;
91 
92   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97   bool IsCrossAddressSpaceOrdering = false;
98   bool IsVolatile = false;
99   bool IsNonTemporal = false;
100 
101   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105               bool IsCrossAddressSpaceOrdering = true,
106               AtomicOrdering FailureOrdering =
107                 AtomicOrdering::SequentiallyConsistent,
108               bool IsVolatile = false,
109               bool IsNonTemporal = false)
110     : Ordering(Ordering), FailureOrdering(FailureOrdering),
111       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112       InstrAddrSpace(InstrAddrSpace),
113       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114       IsVolatile(IsVolatile),
115       IsNonTemporal(IsNonTemporal) {
116 
117     if (Ordering == AtomicOrdering::NotAtomic) {
118       assert(Scope == SIAtomicScope::NONE &&
119              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120              !IsCrossAddressSpaceOrdering &&
121              FailureOrdering == AtomicOrdering::NotAtomic);
122       return;
123     }
124 
125     assert(Scope != SIAtomicScope::NONE &&
126            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127                SIAtomicAddrSpace::NONE &&
128            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129                SIAtomicAddrSpace::NONE &&
130            !isStrongerThan(FailureOrdering, Ordering));
131 
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138 
139     // Limit the scope to the maximum supported by the instruction's address
140     // spaces.
141     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142         SIAtomicAddrSpace::NONE) {
143       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144     } else if ((InstrAddrSpace &
145                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146                SIAtomicAddrSpace::NONE) {
147       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148     } else if ((InstrAddrSpace &
149                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152     }
153   }
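
  // Illustrative example (not in the original source): an LDS-only
  // instruction (InstrAddrSpace == LDS) requested at SYSTEM scope is clamped
  // to WORKGROUP scope above, since LDS is only shared within a work-group.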

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The set of SI atomic address spaces covered by the target
  /// address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};
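
// Illustrative sketch (not from the original source): for a gfx6 agent-scope
// acquire load, the pass combines these hooks roughly as follows:
//
//   buffer_load_dword ... glc    ; enableLoadCacheBypass() sets GLC
//   s_waitcnt vmcnt(0)           ; insertWait() after the load completes it
//   buffer_wbinvl1               ; insertAcquire() invalidates the L1
//
// See the SIGfx6CacheControl implementations below for the exact conditions.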

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
protected:

  /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSCCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SCC);
  }

public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}
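
// Illustrative note (not from the original source): the "one address space"
// variants above correspond to the relaxed sync scopes such as
//
//   fence syncscope("agent-one-as") release
//
// which only order the address space(s) the instruction actually accesses,
// hence the false returned for the cross-address-space-ordering component.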

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
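
// Illustrative example (not from the original source): on an instruction with
// a cpol operand, setting AMDGPU::CPol::GLC turns, e.g.,
//   buffer_load_dword v0, v1, s[0:3], 0 offen
// into
//   buffer_load_dword v0, v1, s[0:3], 0 offen glc
// Instructions without a cpol operand are left untouched and false is
// returned.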

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
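  // Note (added comment): gfx90a reports a GFX9 generation, so it is matched
  // on its subtarget feature first, before the generation-based checks below.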
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
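
// Illustrative example (not from the original source): for Scope == SYSTEM,
// AddrSpace == FLAT and IsCrossAddrSpaceOrdering == true, both VMCnt and
// LGKMCnt are set above and the emitted instruction is
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// with expcnt left at its "no wait" maximum by encodeWaitcnt().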

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}
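
// Note (added comment): on GFX6/GFX7 a release reduces to the wait above, as
// the L1 is write-through and there is no L2 writeback instruction;
// SIGfx90ACacheControl overrides insertRelease() below to additionally emit
// BUFFER_WBL2 at system scope.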

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      LLVM_FALLTHROUGH;
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      Changed |= enableSCCBit(MI);
      LLVM_FALLTHROUGH;
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }
    Changed |= enableSCCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so no need to wait for global memory as all waves
    // in the work-group access the same L1, nor for GDS as accesses are
    // ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}
1394 
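// On GFX10 an agent- or system-scope atomic load bypasses the L0/L1 caches
// by setting the glc and dlc bits on the load instruction itself (see the
// cases below); no separate cache-control instructions are required.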
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

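// A sketch, assuming standard GFX10 assembly syntax, of what the function
// below does to a volatile global load:
//   global_load_dword v0, v[0:1], off glc dlc  ; bypass L0/L1
//   s_waitcnt vmcnt(0)                         ; complete at system scope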
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

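  // GFX10 splits the vector memory counter: vmcnt tracks outstanding loads
  // while vscnt tracks outstanding stores, so an ordering wait may need both
  // an S_WAITCNT and a separate S_WAITCNT_VSCNT (emitted at the end).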
  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU, and
      // so share the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
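    // SGPR_NULL with an immediate of 0 waits until all outstanding vector
    // stores have completed.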
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

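// A sketch of what an agent- or system-scope acquire produces on GFX10 per
// the cases below:
//   buffer_gl0_inv ; invalidate the per-CU L0
//   buffer_gl1_inv ; invalidate the L1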
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

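// Expand an atomic load according to its ordering: bypass caches for the
// requested scope, wait before and/or after the load, and invalidate caches
// afterwards for acquire semantics. A rough, non-authoritative sketch for a
// seq_cst agent-scope load on GFX10, assuming only the global address space
// is being ordered (other address spaces add lgkmcnt waits):
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
//   buffer_gl0_inv
//   buffer_gl1_inv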
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}

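// Expand an atomic store: release orderings insert a release (waits and, on
// some subtargets, a cache writeback) before the store. No acquire step is
// needed afterwards because a store publishes data rather than reading it.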
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}

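// Expand a fence: the ATOMIC_FENCE pseudo itself emits no machine code. It
// is queued on AtomicPseudoMIs and erased at the end of the pass, with the
// required waits and cache maintenance inserted in its place.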
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be combined
    // to use the single "BUFFER_WBL2" instruction. This could be done by
    // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
    // track cache invalidate and write back instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}

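// Expand an atomic cmpxchg or read-modify-write. The cmpxchg failure
// ordering is honored as well: even a failed exchange performs a load,
// which may require acquire semantics.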
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
    const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

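// Walk every instruction in the function, expanding the memory-model
// semantics of each potentially-atomic operation via the subtarget-specific
// cache control, then erase any collected pseudo instructions.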
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

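        // Erase the BUNDLE header. II was pre-incremented by the loop above,
        // so it now points at the first unbundled instruction, which becomes
        // the new MI for the checks below.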
        MI->eraseFromParent();
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}