1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements the AMDGPU memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/Support/AtomicOrdering.h"
25 #include "llvm/Support/TargetParser.h"
26
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32
33 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
34 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35 cl::desc("Use this to skip inserting cache invalidating instructions."));
36
37 namespace {
38
39 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
40
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43 NONE = 0u,
44 LOAD = 1u << 0,
45 STORE = 1u << 1,
46 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
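// Illustrative example (not part of the original source): LLVM_MARK_AS_BITMASK_ENUM
// enables the bitwise operators on SIMemOp, so callers can combine and test flags:
//   SIMemOp Ops = SIMemOp::LOAD | SIMemOp::STORE;        // covers both kinds
//   if ((Ops & SIMemOp::LOAD) != SIMemOp::NONE) { ... }   // loads included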
48
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52 BEFORE,
53 AFTER
54 };
55
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58 NONE,
59 SINGLETHREAD,
60 WAVEFRONT,
61 WORKGROUP,
62 AGENT,
63 SYSTEM
64 };
65
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operations. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69 NONE = 0u,
70 GLOBAL = 1u << 0,
71 LDS = 1u << 1,
72 SCRATCH = 1u << 2,
73 GDS = 1u << 3,
74 OTHER = 1u << 4,
75
76 /// The address spaces that can be accessed by a FLAT instruction.
77 FLAT = GLOBAL | LDS | SCRATCH,
78
79 /// The address spaces that support atomic instructions.
80 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81
82 /// All address spaces.
83 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84
85 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
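// Illustrative example (not part of the original source): the aggregate values above
// are unions of the single-bit members, so membership is tested with a mask:
//   (SIAtomicAddrSpace::FLAT & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE
// holds because FLAT = GLOBAL | LDS | SCRATCH, whereas GDS is not reachable
// by FLAT instructions.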
87
88 class SIMemOpInfo final {
89 private:
90
91 friend class SIMemOpAccess;
92
93 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
94 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98 bool IsCrossAddressSpaceOrdering = false;
99 bool IsVolatile = false;
100 bool IsNonTemporal = false;
101
102 SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106 bool IsCrossAddressSpaceOrdering = true,
107 AtomicOrdering FailureOrdering =
108 AtomicOrdering::SequentiallyConsistent,
109 bool IsVolatile = false,
110 bool IsNonTemporal = false)
111 : Ordering(Ordering), FailureOrdering(FailureOrdering),
112 Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113 InstrAddrSpace(InstrAddrSpace),
114 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115 IsVolatile(IsVolatile),
116 IsNonTemporal(IsNonTemporal) {
117
118 if (Ordering == AtomicOrdering::NotAtomic) {
119 assert(Scope == SIAtomicScope::NONE &&
120 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121 !IsCrossAddressSpaceOrdering &&
122 FailureOrdering == AtomicOrdering::NotAtomic);
123 return;
124 }
125
126 assert(Scope != SIAtomicScope::NONE &&
127 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
128 SIAtomicAddrSpace::NONE &&
129 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
130 SIAtomicAddrSpace::NONE);
131
132 // There is also no cross address space ordering if the ordering
133 // address space is the same as the instruction address space and
134 // only contains a single address space.
135 if ((OrderingAddrSpace == InstrAddrSpace) &&
136 isPowerOf2_32(uint32_t(InstrAddrSpace)))
137 this->IsCrossAddressSpaceOrdering = false;
138
139 // Limit the scope to the maximum supported by the instruction's address
140 // spaces.
141 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142 SIAtomicAddrSpace::NONE) {
143 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144 } else if ((InstrAddrSpace &
145 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146 SIAtomicAddrSpace::NONE) {
147 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148 } else if ((InstrAddrSpace &
149 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152 }
153 }
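// Illustrative example of the scope-limiting logic above (not part of the original
// source): an atomic whose instruction address space is only LDS (e.g. a DS atomic)
// that requests SIAtomicScope::SYSTEM falls into the second branch, because
// InstrAddrSpace & ~(SCRATCH | LDS) == NONE, and its scope is reduced to
// SIAtomicScope::WORKGROUP since LDS is only shared within a work-group. Likewise a
// scratch-only access never needs more than SINGLETHREAD scope.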
154
155 public:
156 /// \returns Atomic synchronization scope of the machine instruction used to
157 /// create this SIMemOpInfo.
158 SIAtomicScope getScope() const {
159 return Scope;
160 }
161
162 /// \returns Ordering constraint of the machine instruction used to
163 /// create this SIMemOpInfo.
164 AtomicOrdering getOrdering() const {
165 return Ordering;
166 }
167
168 /// \returns Failure ordering constraint of the machine instruction used to
169 /// create this SIMemOpInfo.
170 AtomicOrdering getFailureOrdering() const {
171 return FailureOrdering;
172 }
173
174 /// \returns The address spaces accessed by the machine
175 /// instruction used to create this SIMemOpInfo.
176 SIAtomicAddrSpace getInstrAddrSpace() const {
177 return InstrAddrSpace;
178 }
179
180 /// \returns The address spaces that must be ordered by the machine
181 /// instruction used to create this SIMemOpInfo.
182 SIAtomicAddrSpace getOrderingAddrSpace() const {
183 return OrderingAddrSpace;
184 }
185
186 /// \returns True iff memory ordering of operations on
187 /// different address spaces is required.
188 bool getIsCrossAddressSpaceOrdering() const {
189 return IsCrossAddressSpaceOrdering;
190 }
191
192 /// \returns True if memory access of the machine instruction used to
193 /// create this SIMemOpInfo is volatile, false otherwise.
194 bool isVolatile() const {
195 return IsVolatile;
196 }
197
198 /// \returns True if memory access of the machine instruction used to
199 /// create this SIMemOpInfo is nontemporal, false otherwise.
200 bool isNonTemporal() const {
201 return IsNonTemporal;
202 }
203
204 /// \returns True if ordering constraint of the machine instruction used to
205 /// create this SIMemOpInfo is unordered or higher, false otherwise.
206 bool isAtomic() const {
207 return Ordering != AtomicOrdering::NotAtomic;
208 }
209
210 };
211
212 class SIMemOpAccess final {
213 private:
214 AMDGPUMachineModuleInfo *MMI = nullptr;
215
216 /// Reports unsupported message \p Msg for \p MI to LLVM context.
217 void reportUnsupported(const MachineBasicBlock::iterator &MI,
218 const char *Msg) const;
219
220 /// Inspects the target synchronization scope \p SSID and determines
221 /// the SI atomic scope it corresponds to, the address spaces it
222 /// covers, and whether the memory ordering applies between address
223 /// spaces.
224 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226
227 /// \returns The SI atomic address space(s) corresponding to the LLVM address space \p AS.
228 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229
230 /// \returns Info constructed from \p MI, which has at least one machine memory
231 /// operand.
232 Optional<SIMemOpInfo> constructFromMIWithMMO(
233 const MachineBasicBlock::iterator &MI) const;
234
235 public:
236 /// Construct class to support accessing the machine memory operands
237 /// of instructions in the machine function \p MF.
238 SIMemOpAccess(MachineFunction &MF);
239
240 /// \returns Load info if \p MI is a load operation, "None" otherwise.
241 Optional<SIMemOpInfo> getLoadInfo(
242 const MachineBasicBlock::iterator &MI) const;
243
244 /// \returns Store info if \p MI is a store operation, "None" otherwise.
245 Optional<SIMemOpInfo> getStoreInfo(
246 const MachineBasicBlock::iterator &MI) const;
247
248 /// \returns Atomic fence info if \p MI is an atomic fence operation,
249 /// "None" otherwise.
250 Optional<SIMemOpInfo> getAtomicFenceInfo(
251 const MachineBasicBlock::iterator &MI) const;
252
253 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
254 /// rmw operation, "None" otherwise.
255 Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
256 const MachineBasicBlock::iterator &MI) const;
257 };
258
259 class SICacheControl {
260 protected:
261
262 /// AMDGPU subtarget info.
263 const GCNSubtarget &ST;
264
265 /// Instruction info.
266 const SIInstrInfo *TII = nullptr;
267
268 IsaVersion IV;
269
270 /// Whether to insert cache invalidating instructions.
271 bool InsertCacheInv;
272
273 SICacheControl(const GCNSubtarget &ST);
274
275 /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
276 /// \returns True if \p MI is modified, false otherwise.
277 bool enableNamedBit(const MachineBasicBlock::iterator MI,
278 AMDGPU::CPol::CPol Bit) const;
279
280 public:
281
282 /// Create a cache control for the subtarget \p ST.
283 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
284
285 /// Update \p MI memory load instruction to bypass any caches up to
286 /// the \p Scope memory scope for address spaces \p
287 /// AddrSpace. Return true iff the instruction was modified.
288 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
289 SIAtomicScope Scope,
290 SIAtomicAddrSpace AddrSpace) const = 0;
291
292 /// Update \p MI memory store instruction to bypass any caches up to
293 /// the \p Scope memory scope for address spaces \p
294 /// AddrSpace. Return true iff the instruction was modified.
295 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
296 SIAtomicScope Scope,
297 SIAtomicAddrSpace AddrSpace) const = 0;
298
299 /// Update \p MI memory read-modify-write instruction to bypass any caches up
300 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
301 /// iff the instruction was modified.
302 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
303 SIAtomicScope Scope,
304 SIAtomicAddrSpace AddrSpace) const = 0;
305
306 /// Update \p MI memory instruction of kind \p Op associated with address
307 /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
308 /// true iff the instruction was modified.
309 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
310 SIAtomicAddrSpace AddrSpace,
311 SIMemOp Op, bool IsVolatile,
312 bool IsNonTemporal) const = 0;
313
314 /// Inserts any necessary instructions at position \p Pos relative
315 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
316 /// \p Op associated with address spaces \p AddrSpace have completed. Used
317 /// between memory instructions to enforce the order they become visible as
318 /// observed by other memory instructions executing in memory scope \p Scope.
319 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
320 /// address spaces. Returns true iff any instructions are inserted.
321 virtual bool insertWait(MachineBasicBlock::iterator &MI,
322 SIAtomicScope Scope,
323 SIAtomicAddrSpace AddrSpace,
324 SIMemOp Op,
325 bool IsCrossAddrSpaceOrdering,
326 Position Pos) const = 0;
327
328 /// Inserts any necessary instructions at position \p Pos relative to
329 /// instruction \p MI to ensure any subsequent memory instructions of this
330 /// thread with address spaces \p AddrSpace will observe the previous memory
331 /// operations by any thread for memory scopes up to memory scope \p Scope.
332 /// Returns true iff any instructions are inserted.
333 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
334 SIAtomicScope Scope,
335 SIAtomicAddrSpace AddrSpace,
336 Position Pos) const = 0;
337
338 /// Inserts any necessary instructions at position \p Pos relative to
339 /// instruction \p MI to ensure previous memory instructions by this thread
340 /// with address spaces \p AddrSpace have completed and can be observed by
341 /// subsequent memory instructions by any thread executing in memory scope \p
342 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
343 /// between address spaces. Returns true iff any instructions are inserted.
344 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
345 SIAtomicScope Scope,
346 SIAtomicAddrSpace AddrSpace,
347 bool IsCrossAddrSpaceOrdering,
348 Position Pos) const = 0;
349
350 /// Virtual destructor to allow derivations to be deleted.
351 virtual ~SICacheControl() = default;
352
353 };
354
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357
358 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359 /// is modified, false otherwise.
360 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361 return enableNamedBit(MI, AMDGPU::CPol::GLC);
362 }
363
364 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365 /// is modified, false otherwise.
366 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367 return enableNamedBit(MI, AMDGPU::CPol::SLC);
368 }
369
370 public:
371
372 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
373
374 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375 SIAtomicScope Scope,
376 SIAtomicAddrSpace AddrSpace) const override;
377
378 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379 SIAtomicScope Scope,
380 SIAtomicAddrSpace AddrSpace) const override;
381
382 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383 SIAtomicScope Scope,
384 SIAtomicAddrSpace AddrSpace) const override;
385
386 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388 bool IsVolatile,
389 bool IsNonTemporal) const override;
390
391 bool insertWait(MachineBasicBlock::iterator &MI,
392 SIAtomicScope Scope,
393 SIAtomicAddrSpace AddrSpace,
394 SIMemOp Op,
395 bool IsCrossAddrSpaceOrdering,
396 Position Pos) const override;
397
398 bool insertAcquire(MachineBasicBlock::iterator &MI,
399 SIAtomicScope Scope,
400 SIAtomicAddrSpace AddrSpace,
401 Position Pos) const override;
402
403 bool insertRelease(MachineBasicBlock::iterator &MI,
404 SIAtomicScope Scope,
405 SIAtomicAddrSpace AddrSpace,
406 bool IsCrossAddrSpaceOrdering,
407 Position Pos) const override;
408 };
409
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412
413 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
414
415 bool insertAcquire(MachineBasicBlock::iterator &MI,
416 SIAtomicScope Scope,
417 SIAtomicAddrSpace AddrSpace,
418 Position Pos) const override;
419
420 };
421
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424
425 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
426
427 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428 SIAtomicScope Scope,
429 SIAtomicAddrSpace AddrSpace) const override;
430
431 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432 SIAtomicScope Scope,
433 SIAtomicAddrSpace AddrSpace) const override;
434
435 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436 SIAtomicScope Scope,
437 SIAtomicAddrSpace AddrSpace) const override;
438
439 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441 bool IsVolatile,
442 bool IsNonTemporal) const override;
443
444 bool insertWait(MachineBasicBlock::iterator &MI,
445 SIAtomicScope Scope,
446 SIAtomicAddrSpace AddrSpace,
447 SIMemOp Op,
448 bool IsCrossAddrSpaceOrdering,
449 Position Pos) const override;
450
451 bool insertAcquire(MachineBasicBlock::iterator &MI,
452 SIAtomicScope Scope,
453 SIAtomicAddrSpace AddrSpace,
454 Position Pos) const override;
455
456 bool insertRelease(MachineBasicBlock::iterator &MI,
457 SIAtomicScope Scope,
458 SIAtomicAddrSpace AddrSpace,
459 bool IsCrossAddrSpaceOrdering,
460 Position Pos) const override;
461 };
462
463 class SIGfx940CacheControl : public SIGfx90ACacheControl {
464 protected:
465
466 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
467 /// is modified, false otherwise.
468 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
469 return enableNamedBit(MI, AMDGPU::CPol::SC0);
470 }
471
472 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
473 /// is modified, false otherwise.
474 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
475 return enableNamedBit(MI, AMDGPU::CPol::SC1);
476 }
477
478 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
479 /// is modified, false otherwise.
480 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
481 return enableNamedBit(MI, AMDGPU::CPol::NT);
482 }
483
484 public:
485
486 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}
487
488 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
489 SIAtomicScope Scope,
490 SIAtomicAddrSpace AddrSpace) const override;
491
492 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
493 SIAtomicScope Scope,
494 SIAtomicAddrSpace AddrSpace) const override;
495
496 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
497 SIAtomicScope Scope,
498 SIAtomicAddrSpace AddrSpace) const override;
499
500 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
501 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
502 bool IsVolatile,
503 bool IsNonTemporal) const override;
504
505 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
506 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
507
508 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
509 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
510 Position Pos) const override;
511 };
512
513 class SIGfx10CacheControl : public SIGfx7CacheControl {
514 protected:
515
516 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
517 /// is modified, false otherwise.
518 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
519 return enableNamedBit(MI, AMDGPU::CPol::DLC);
520 }
521
522 public:
523
524 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
525
526 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
527 SIAtomicScope Scope,
528 SIAtomicAddrSpace AddrSpace) const override;
529
530 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
531 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
532 bool IsVolatile,
533 bool IsNonTemporal) const override;
534
535 bool insertWait(MachineBasicBlock::iterator &MI,
536 SIAtomicScope Scope,
537 SIAtomicAddrSpace AddrSpace,
538 SIMemOp Op,
539 bool IsCrossAddrSpaceOrdering,
540 Position Pos) const override;
541
542 bool insertAcquire(MachineBasicBlock::iterator &MI,
543 SIAtomicScope Scope,
544 SIAtomicAddrSpace AddrSpace,
545 Position Pos) const override;
546 };
547
548 class SIGfx11CacheControl : public SIGfx10CacheControl {
549 public:
550 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
551
552 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
553 SIAtomicScope Scope,
554 SIAtomicAddrSpace AddrSpace) const override;
555
556 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
557 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
558 bool IsVolatile,
559 bool IsNonTemporal) const override;
560 };
561
562 class SIMemoryLegalizer final : public MachineFunctionPass {
563 private:
564
565 /// Cache Control.
566 std::unique_ptr<SICacheControl> CC = nullptr;
567
568 /// List of atomic pseudo instructions.
569 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
570
571 /// Return true iff instruction \p MI is an atomic instruction that
572 /// returns a result.
573 bool isAtomicRet(const MachineInstr &MI) const {
574 return SIInstrInfo::isAtomicRet(MI);
575 }
576
577 /// Removes all processed atomic pseudo instructions from the current
578 /// function. Returns true if current function is modified, false otherwise.
579 bool removeAtomicPseudoMIs();
580
581 /// Expands load operation \p MI. Returns true if instructions are
582 /// added/deleted or \p MI is modified, false otherwise.
583 bool expandLoad(const SIMemOpInfo &MOI,
584 MachineBasicBlock::iterator &MI);
585 /// Expands store operation \p MI. Returns true if instructions are
586 /// added/deleted or \p MI is modified, false otherwise.
587 bool expandStore(const SIMemOpInfo &MOI,
588 MachineBasicBlock::iterator &MI);
589 /// Expands atomic fence operation \p MI. Returns true if
590 /// instructions are added/deleted or \p MI is modified, false otherwise.
591 bool expandAtomicFence(const SIMemOpInfo &MOI,
592 MachineBasicBlock::iterator &MI);
593 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
594 /// instructions are added/deleted or \p MI is modified, false otherwise.
595 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
596 MachineBasicBlock::iterator &MI);
597
598 public:
599 static char ID;
600
601 SIMemoryLegalizer() : MachineFunctionPass(ID) {}
602
603 void getAnalysisUsage(AnalysisUsage &AU) const override {
604 AU.setPreservesCFG();
605 MachineFunctionPass::getAnalysisUsage(AU);
606 }
607
608 StringRef getPassName() const override {
609 return PASS_NAME;
610 }
611
612 bool runOnMachineFunction(MachineFunction &MF) override;
613 };
614
615 } // end namespace anonymous
616
617 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
618 const char *Msg) const {
619 const Function &Func = MI->getParent()->getParent()->getFunction();
620 DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
621 Func.getContext().diagnose(Diag);
622 }
623
624 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
625 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
626 SIAtomicAddrSpace InstrAddrSpace) const {
627 if (SSID == SyncScope::System)
628 return std::make_tuple(SIAtomicScope::SYSTEM,
629 SIAtomicAddrSpace::ATOMIC,
630 true);
631 if (SSID == MMI->getAgentSSID())
632 return std::make_tuple(SIAtomicScope::AGENT,
633 SIAtomicAddrSpace::ATOMIC,
634 true);
635 if (SSID == MMI->getWorkgroupSSID())
636 return std::make_tuple(SIAtomicScope::WORKGROUP,
637 SIAtomicAddrSpace::ATOMIC,
638 true);
639 if (SSID == MMI->getWavefrontSSID())
640 return std::make_tuple(SIAtomicScope::WAVEFRONT,
641 SIAtomicAddrSpace::ATOMIC,
642 true);
643 if (SSID == SyncScope::SingleThread)
644 return std::make_tuple(SIAtomicScope::SINGLETHREAD,
645 SIAtomicAddrSpace::ATOMIC,
646 true);
647 if (SSID == MMI->getSystemOneAddressSpaceSSID())
648 return std::make_tuple(SIAtomicScope::SYSTEM,
649 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
650 false);
651 if (SSID == MMI->getAgentOneAddressSpaceSSID())
652 return std::make_tuple(SIAtomicScope::AGENT,
653 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
654 false);
655 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
656 return std::make_tuple(SIAtomicScope::WORKGROUP,
657 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
658 false);
659 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
660 return std::make_tuple(SIAtomicScope::WAVEFRONT,
661 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
662 false);
663 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
664 return std::make_tuple(SIAtomicScope::SINGLETHREAD,
665 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
666 false);
667 return None;
668 }
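// Illustrative examples of the mapping above (not part of the original source):
// SyncScope::System yields (SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true),
// i.e. ordering may apply across address spaces, while the "one address space"
// variants such as MMI->getAgentOneAddressSpaceSSID() restrict the ordering address
// space to the spaces the instruction itself accesses and report no
// cross-address-space ordering (the final tuple element is false).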
669
670 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
671 if (AS == AMDGPUAS::FLAT_ADDRESS)
672 return SIAtomicAddrSpace::FLAT;
673 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
674 return SIAtomicAddrSpace::GLOBAL;
675 if (AS == AMDGPUAS::LOCAL_ADDRESS)
676 return SIAtomicAddrSpace::LDS;
677 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
678 return SIAtomicAddrSpace::SCRATCH;
679 if (AS == AMDGPUAS::REGION_ADDRESS)
680 return SIAtomicAddrSpace::GDS;
681
682 return SIAtomicAddrSpace::OTHER;
683 }
684
685 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
686 MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
687 }
688
689 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
690 const MachineBasicBlock::iterator &MI) const {
691 assert(MI->getNumMemOperands() > 0);
692
693 SyncScope::ID SSID = SyncScope::SingleThread;
694 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
695 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
696 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
697 bool IsNonTemporal = true;
698 bool IsVolatile = false;
699
700 // Validator should check whether or not MMOs cover the entire set of
701 // locations accessed by the memory instruction.
702 for (const auto &MMO : MI->memoperands()) {
703 IsNonTemporal &= MMO->isNonTemporal();
704 IsVolatile |= MMO->isVolatile();
705 InstrAddrSpace |=
706 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
707 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
708 if (OpOrdering != AtomicOrdering::NotAtomic) {
709 const auto &IsSyncScopeInclusion =
710 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
711 if (!IsSyncScopeInclusion) {
712 reportUnsupported(MI,
713 "Unsupported non-inclusive atomic synchronization scope");
714 return None;
715 }
716
717 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
718 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
719 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
720 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
721 FailureOrdering =
722 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
723 }
724 }
725
726 SIAtomicScope Scope = SIAtomicScope::NONE;
727 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
728 bool IsCrossAddressSpaceOrdering = false;
729 if (Ordering != AtomicOrdering::NotAtomic) {
730 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
731 if (!ScopeOrNone) {
732 reportUnsupported(MI, "Unsupported atomic synchronization scope");
733 return None;
734 }
735 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
736 *ScopeOrNone;
737 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
738 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
739 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
740 reportUnsupported(MI, "Unsupported atomic address space");
741 return None;
742 }
743 }
744 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
745 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
746 IsNonTemporal);
747 }
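// Illustrative example of how multiple memory operands are merged above (not part of
// the original source): for an instruction with two MMOs, one monotonic at workgroup
// scope and one acquire at agent scope, the loop keeps the stronger ordering via
// getMergedAtomicOrdering (acquire) and widens SSID to the inclusive scope (agent),
// while InstrAddrSpace accumulates the union of the MMOs' address spaces. A pair of
// scopes that is not inclusive is rejected with a diagnostic.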
748
749 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
750 const MachineBasicBlock::iterator &MI) const {
751 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
752
753 if (!(MI->mayLoad() && !MI->mayStore()))
754 return None;
755
756 // Be conservative if there are no memory operands.
757 if (MI->getNumMemOperands() == 0)
758 return SIMemOpInfo();
759
760 return constructFromMIWithMMO(MI);
761 }
762
763 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
764 const MachineBasicBlock::iterator &MI) const {
765 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
766
767 if (!(!MI->mayLoad() && MI->mayStore()))
768 return None;
769
770 // Be conservative if there are no memory operands.
771 if (MI->getNumMemOperands() == 0)
772 return SIMemOpInfo();
773
774 return constructFromMIWithMMO(MI);
775 }
776
777 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
778 const MachineBasicBlock::iterator &MI) const {
779 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
780
781 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
782 return None;
783
784 AtomicOrdering Ordering =
785 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
786
787 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
788 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
789 if (!ScopeOrNone) {
790 reportUnsupported(MI, "Unsupported atomic synchronization scope");
791 return None;
792 }
793
794 SIAtomicScope Scope = SIAtomicScope::NONE;
795 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
796 bool IsCrossAddressSpaceOrdering = false;
797 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
798 *ScopeOrNone;
799
800 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
801 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
802 reportUnsupported(MI, "Unsupported atomic address space");
803 return None;
804 }
805
806 return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
807 IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
808 }
809
810 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
811 const MachineBasicBlock::iterator &MI) const {
812 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
813
814 if (!(MI->mayLoad() && MI->mayStore()))
815 return None;
816
817 // Be conservative if there are no memory operands.
818 if (MI->getNumMemOperands() == 0)
819 return SIMemOpInfo();
820
821 return constructFromMIWithMMO(MI);
822 }
823
824 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
825 TII = ST.getInstrInfo();
826 IV = getIsaVersion(ST.getCPU());
827 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
828 }
829
830 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
831 AMDGPU::CPol::CPol Bit) const {
832 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
833 if (!CPol)
834 return false;
835
836 CPol->setImm(CPol->getImm() | Bit);
837 return true;
838 }
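// Illustrative note (not part of the original source): enableNamedBit is the
// primitive that the enable*Bit helpers above reduce to. For instance,
// enableGLCBit(MI) simply ORs AMDGPU::CPol::GLC into MI's existing cpol immediate,
// so a load already carrying SLC ends up with a cpol of GLC | SLC; instructions
// without a cpol operand are left untouched and the call returns false.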
839
840 /* static */
841 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
842 GCNSubtarget::Generation Generation = ST.getGeneration();
843 if (ST.hasGFX940Insts())
844 return std::make_unique<SIGfx940CacheControl>(ST);
845 if (ST.hasGFX90AInsts())
846 return std::make_unique<SIGfx90ACacheControl>(ST);
847 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
848 return std::make_unique<SIGfx6CacheControl>(ST);
849 if (Generation < AMDGPUSubtarget::GFX10)
850 return std::make_unique<SIGfx7CacheControl>(ST);
851 if (Generation < AMDGPUSubtarget::GFX11)
852 return std::make_unique<SIGfx10CacheControl>(ST);
853 return std::make_unique<SIGfx11CacheControl>(ST);
854 }
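// Illustrative example of the selection above (not part of the original source): a
// plain GFX9 target (no GFX90A or GFX940 instructions) falls through to the
// generation checks and, since GFX9 < GFX10, gets SIGfx7CacheControl, while a GFX90A
// target is matched by hasGFX90AInsts() before the generation checks and gets
// SIGfx90ACacheControl.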
855
856 bool SIGfx6CacheControl::enableLoadCacheBypass(
857 const MachineBasicBlock::iterator &MI,
858 SIAtomicScope Scope,
859 SIAtomicAddrSpace AddrSpace) const {
860 assert(MI->mayLoad() && !MI->mayStore());
861 bool Changed = false;
862
863 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
864 switch (Scope) {
865 case SIAtomicScope::SYSTEM:
866 case SIAtomicScope::AGENT:
867 // Set L1 cache policy to MISS_EVICT.
868 // Note: there is no L2 cache bypass policy at the ISA level.
869 Changed |= enableGLCBit(MI);
870 break;
871 case SIAtomicScope::WORKGROUP:
872 case SIAtomicScope::WAVEFRONT:
873 case SIAtomicScope::SINGLETHREAD:
874 // No cache to bypass.
875 break;
876 default:
877 llvm_unreachable("Unsupported synchronization scope");
878 }
879 }
880
881 /// The scratch address space does not need the global memory caches
882 /// to be bypassed as all memory operations by the same thread are
883 /// sequentially consistent, and no other thread can access scratch
884 /// memory.
885
886 /// Other address spaces do not have a cache.
887
888 return Changed;
889 }
890
891 bool SIGfx6CacheControl::enableStoreCacheBypass(
892 const MachineBasicBlock::iterator &MI,
893 SIAtomicScope Scope,
894 SIAtomicAddrSpace AddrSpace) const {
895 assert(!MI->mayLoad() && MI->mayStore());
896 bool Changed = false;
897
898 /// The L1 cache is write-through, so it does not need to be bypassed. There is no
899 /// bypass control for the L2 cache at the ISA level.
900
901 return Changed;
902 }
903
904 bool SIGfx6CacheControl::enableRMWCacheBypass(
905 const MachineBasicBlock::iterator &MI,
906 SIAtomicScope Scope,
907 SIAtomicAddrSpace AddrSpace) const {
908 assert(MI->mayLoad() && MI->mayStore());
909 bool Changed = false;
910
911 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
912 /// bypassed, and the GLC bit is instead used to indicate if they are
913 /// return or no-return.
914 /// Note: there is no L2 cache coherent bypass control at the ISA level.
915
916 return Changed;
917 }
918
919 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
920 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
921 bool IsVolatile, bool IsNonTemporal) const {
922 // Only handle load and store, not atomic read-modify-write instructions. The
923 // latter use glc to indicate if the atomic returns a result and so must not
924 // be used for cache control.
925 assert(MI->mayLoad() ^ MI->mayStore());
926
927 // Only update load and store, not LLVM IR atomic read-modify-write
928 // instructions. The latter are always marked as volatile, so they cannot be
929 // handled sensibly here without pessimizing all atomics. They also do not support
930 // the nontemporal attribute.
931 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
932
933 bool Changed = false;
934
935 if (IsVolatile) {
936 // Set L1 cache policy to be MISS_EVICT for load instructions
937 // and MISS_LRU for store instructions.
938 // Note: there is no L2 cache bypass policy at the ISA level.
939 if (Op == SIMemOp::LOAD)
940 Changed |= enableGLCBit(MI);
941
942 // Ensure operation has completed at system scope to cause all volatile
943 // operations to be visible outside the program in a global order. Do not
944 // request cross address space as only the global address space can be
945 // observable outside the program, so no need to cause a waitcnt for LDS
946 // address space operations.
947 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
948 Position::AFTER);
949
950 return Changed;
951 }
952
953 if (IsNonTemporal) {
954 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
955 // for both loads and stores, and the L2 cache policy to STREAM.
956 Changed |= enableGLCBit(MI);
957 Changed |= enableSLCBit(MI);
958 return Changed;
959 }
960
961 return Changed;
962 }
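// Illustrative example of the policy above (not part of the original source): a
// volatile global load gets GLC set and is followed by an S_WAITCNT at system scope
// (inserted by insertWait with Position::AFTER), a volatile global store gets only
// the trailing S_WAITCNT, and a nontemporal access gets GLC | SLC with no extra wait.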
963
964 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
965 SIAtomicScope Scope,
966 SIAtomicAddrSpace AddrSpace,
967 SIMemOp Op,
968 bool IsCrossAddrSpaceOrdering,
969 Position Pos) const {
970 bool Changed = false;
971
972 MachineBasicBlock &MBB = *MI->getParent();
973 DebugLoc DL = MI->getDebugLoc();
974
975 if (Pos == Position::AFTER)
976 ++MI;
977
978 bool VMCnt = false;
979 bool LGKMCnt = false;
980
981 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
982 SIAtomicAddrSpace::NONE) {
983 switch (Scope) {
984 case SIAtomicScope::SYSTEM:
985 case SIAtomicScope::AGENT:
986 VMCnt |= true;
987 break;
988 case SIAtomicScope::WORKGROUP:
989 case SIAtomicScope::WAVEFRONT:
990 case SIAtomicScope::SINGLETHREAD:
991 // The L1 cache keeps all memory operations in order for
992 // wavefronts in the same work-group.
993 break;
994 default:
995 llvm_unreachable("Unsupported synchronization scope");
996 }
997 }
998
999 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1000 switch (Scope) {
1001 case SIAtomicScope::SYSTEM:
1002 case SIAtomicScope::AGENT:
1003 case SIAtomicScope::WORKGROUP:
1004 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1005 // not needed as LDS operations for all waves are executed in a total
1006 // global ordering as observed by all waves. Required if also
1007 // synchronizing with global/GDS memory as LDS operations could be
1008 // reordered with respect to later global/GDS memory operations of the
1009 // same wave.
1010 LGKMCnt |= IsCrossAddrSpaceOrdering;
1011 break;
1012 case SIAtomicScope::WAVEFRONT:
1013 case SIAtomicScope::SINGLETHREAD:
1014 // The LDS keeps all memory operations in order for
1015 // the same wavefront.
1016 break;
1017 default:
1018 llvm_unreachable("Unsupported synchronization scope");
1019 }
1020 }
1021
1022 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1023 switch (Scope) {
1024 case SIAtomicScope::SYSTEM:
1025 case SIAtomicScope::AGENT:
1026 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1027 // is not needed as GDS operations for all waves are executed in a total
1028 // global ordering as observed by all waves. Required if also
1029 // synchronizing with global/LDS memory as GDS operations could be
1030 // reordered with respect to later global/LDS memory operations of the
1031 // same wave.
1032 LGKMCnt |= IsCrossAddrSpaceOrdering;
1033 break;
1034 case SIAtomicScope::WORKGROUP:
1035 case SIAtomicScope::WAVEFRONT:
1036 case SIAtomicScope::SINGLETHREAD:
1037 // The GDS keeps all memory operations in order for
1038 // the same work-group.
1039 break;
1040 default:
1041 llvm_unreachable("Unsupported synchronization scope");
1042 }
1043 }
1044
1045 if (VMCnt || LGKMCnt) {
1046 unsigned WaitCntImmediate =
1047 AMDGPU::encodeWaitcnt(IV,
1048 VMCnt ? 0 : getVmcntBitMask(IV),
1049 getExpcntBitMask(IV),
1050 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1051 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1052 Changed = true;
1053 }
1054
1055 if (Pos == Position::AFTER)
1056 --MI;
1057
1058 return Changed;
1059 }
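// Illustrative example of the encoding above (not part of the original source): for
// an agent-scope ordering of global memory only, VMCnt is true and LGKMCnt is false,
// so the immediate is encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)),
// i.e. an "S_WAITCNT vmcnt(0)" that leaves the expcnt and lgkmcnt fields at their
// "no wait" values.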
1060
1061 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1062 SIAtomicScope Scope,
1063 SIAtomicAddrSpace AddrSpace,
1064 Position Pos) const {
1065 if (!InsertCacheInv)
1066 return false;
1067
1068 bool Changed = false;
1069
1070 MachineBasicBlock &MBB = *MI->getParent();
1071 DebugLoc DL = MI->getDebugLoc();
1072
1073 if (Pos == Position::AFTER)
1074 ++MI;
1075
1076 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1077 switch (Scope) {
1078 case SIAtomicScope::SYSTEM:
1079 case SIAtomicScope::AGENT:
1080 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1081 Changed = true;
1082 break;
1083 case SIAtomicScope::WORKGROUP:
1084 case SIAtomicScope::WAVEFRONT:
1085 case SIAtomicScope::SINGLETHREAD:
1086 // No cache to invalidate.
1087 break;
1088 default:
1089 llvm_unreachable("Unsupported synchronization scope");
1090 }
1091 }
1092
1093 /// The scratch address space does not need the global memory cache
1094 /// to be flushed as all memory operations by the same thread are
1095 /// sequentially consistent, and no other thread can access scratch
1096 /// memory.
1097
1098 /// Other address spaces do not have a cache.
1099
1100 if (Pos == Position::AFTER)
1101 --MI;
1102
1103 return Changed;
1104 }
1105
1106 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1107 SIAtomicScope Scope,
1108 SIAtomicAddrSpace AddrSpace,
1109 bool IsCrossAddrSpaceOrdering,
1110 Position Pos) const {
1111 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1112 IsCrossAddrSpaceOrdering, Pos);
1113 }
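// Illustrative note (not part of the original source): on GFX6 a release is purely a
// wait; insertRelease simply requests an insertWait covering both SIMemOp::LOAD and
// SIMemOp::STORE so that all earlier accesses have completed before the releasing
// operation. No cache writeback instruction is needed since the L1 is write-through.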
1114
1115 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1116 SIAtomicScope Scope,
1117 SIAtomicAddrSpace AddrSpace,
1118 Position Pos) const {
1119 if (!InsertCacheInv)
1120 return false;
1121
1122 bool Changed = false;
1123
1124 MachineBasicBlock &MBB = *MI->getParent();
1125 DebugLoc DL = MI->getDebugLoc();
1126
1127 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1128
1129 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1130 ? AMDGPU::BUFFER_WBINVL1
1131 : AMDGPU::BUFFER_WBINVL1_VOL;
1132
1133 if (Pos == Position::AFTER)
1134 ++MI;
1135
1136 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1137 switch (Scope) {
1138 case SIAtomicScope::SYSTEM:
1139 case SIAtomicScope::AGENT:
1140 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1141 Changed = true;
1142 break;
1143 case SIAtomicScope::WORKGROUP:
1144 case SIAtomicScope::WAVEFRONT:
1145 case SIAtomicScope::SINGLETHREAD:
1146 // No cache to invalidate.
1147 break;
1148 default:
1149 llvm_unreachable("Unsupported synchronization scope");
1150 }
1151 }
1152
1153 /// The scratch address space does not need the global memory cache
1154 /// to be flushed as all memory operations by the same thread are
1155 /// sequentially consistent, and no other thread can access scratch
1156 /// memory.
1157
1158 /// Other address spaces do not have a cache.
1159
1160 if (Pos == Position::AFTER)
1161 --MI;
1162
1163 return Changed;
1164 }
1165
1166 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1167 const MachineBasicBlock::iterator &MI,
1168 SIAtomicScope Scope,
1169 SIAtomicAddrSpace AddrSpace) const {
1170 assert(MI->mayLoad() && !MI->mayStore());
1171 bool Changed = false;
1172
1173 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174 switch (Scope) {
1175 case SIAtomicScope::SYSTEM:
1176 case SIAtomicScope::AGENT:
1177 // Set the L1 cache policy to MISS_LRU.
1178 // Note: there is no L2 cache bypass policy at the ISA level.
1179 Changed |= enableGLCBit(MI);
1180 break;
1181 case SIAtomicScope::WORKGROUP:
1182 // In threadgroup split mode the waves of a work-group can be executing on
1183 // different CUs. Therefore need to bypass the L1 which is per CU.
1184 // Otherwise in non-threadgroup split mode all waves of a work-group are
1185 // on the same CU, and so the L1 does not need to be bypassed.
1186 if (ST.isTgSplitEnabled())
1187 Changed |= enableGLCBit(MI);
1188 break;
1189 case SIAtomicScope::WAVEFRONT:
1190 case SIAtomicScope::SINGLETHREAD:
1191 // No cache to bypass.
1192 break;
1193 default:
1194 llvm_unreachable("Unsupported synchronization scope");
1195 }
1196 }
1197
1198 /// The scratch address space does not need the global memory caches
1199 /// to be bypassed as all memory operations by the same thread are
1200 /// sequentially consistent, and no other thread can access scratch
1201 /// memory.
1202
1203 /// Other address spaces do not have a cache.
1204
1205 return Changed;
1206 }
1207
1208 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1209 const MachineBasicBlock::iterator &MI,
1210 SIAtomicScope Scope,
1211 SIAtomicAddrSpace AddrSpace) const {
1212 assert(!MI->mayLoad() && MI->mayStore());
1213 bool Changed = false;
1214
1215 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1216 switch (Scope) {
1217 case SIAtomicScope::SYSTEM:
1218 case SIAtomicScope::AGENT:
1219 /// Do not set glc for store atomic operations as they implicitly write
1220 /// through the L1 cache.
1221 break;
1222 case SIAtomicScope::WORKGROUP:
1223 case SIAtomicScope::WAVEFRONT:
1224 case SIAtomicScope::SINGLETHREAD:
1225 // No cache to bypass. Store atomics implicitly write through the L1
1226 // cache.
1227 break;
1228 default:
1229 llvm_unreachable("Unsupported synchronization scope");
1230 }
1231 }
1232
1233 /// The scratch address space does not need the global memory caches
1234 /// to be bypassed as all memory operations by the same thread are
1235 /// sequentially consistent, and no other thread can access scratch
1236 /// memory.
1237
1238 /// Other address spaces do not have a cache.
1239
1240 return Changed;
1241 }
1242
1243 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1244 const MachineBasicBlock::iterator &MI,
1245 SIAtomicScope Scope,
1246 SIAtomicAddrSpace AddrSpace) const {
1247 assert(MI->mayLoad() && MI->mayStore());
1248 bool Changed = false;
1249
1250 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1251 switch (Scope) {
1252 case SIAtomicScope::SYSTEM:
1253 case SIAtomicScope::AGENT:
1254 /// Do not set glc for RMW atomic operations as they implicitly bypass
1255 /// the L1 cache, and the glc bit is instead used to indicate if they are
1256 /// return or no-return.
1257 break;
1258 case SIAtomicScope::WORKGROUP:
1259 case SIAtomicScope::WAVEFRONT:
1260 case SIAtomicScope::SINGLETHREAD:
1261 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1262 break;
1263 default:
1264 llvm_unreachable("Unsupported synchronization scope");
1265 }
1266 }
1267
1268 return Changed;
1269 }
1270
1271 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1272 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1273 bool IsVolatile, bool IsNonTemporal) const {
1274 // Only handle load and store, not atomic read-modify-write instructions. The
1275 // latter use glc to indicate if the atomic returns a result and so must not
1276 // be used for cache control.
1277 assert(MI->mayLoad() ^ MI->mayStore());
1278
1279 // Only update load and store, not LLVM IR atomic read-modify-write
1280 // instructions. The latter are always marked as volatile, so they cannot be
1281 // handled sensibly here without pessimizing all atomics. They also do not support
1282 // the nontemporal attribute.
1283 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1284
1285 bool Changed = false;
1286
1287 if (IsVolatile) {
1288 // Set L1 cache policy to be MISS_EVICT for load instructions
1289 // and MISS_LRU for store instructions.
1290 // Note: there is no L2 cache bypass policy at the ISA level.
1291 if (Op == SIMemOp::LOAD)
1292 Changed |= enableGLCBit(MI);
1293
1294 // Ensure operation has completed at system scope to cause all volatile
1295 // operations to be visible outside the program in a global order. Do not
1296 // request cross address space as only the global address space can be
1297 // observable outside the program, so no need to cause a waitcnt for LDS
1298 // address space operations.
1299 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1300 Position::AFTER);
1301
1302 return Changed;
1303 }
1304
1305 if (IsNonTemporal) {
1306 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1307 // for both loads and stores, and the L2 cache policy to STREAM.
1308 Changed |= enableGLCBit(MI);
1309 Changed |= enableSLCBit(MI);
1310 return Changed;
1311 }
1312
1313 return Changed;
1314 }
1315
1316 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1317 SIAtomicScope Scope,
1318 SIAtomicAddrSpace AddrSpace,
1319 SIMemOp Op,
1320 bool IsCrossAddrSpaceOrdering,
1321 Position Pos) const {
1322 if (ST.isTgSplitEnabled()) {
1323 // In threadgroup split mode the waves of a work-group can be executing on
1324 // different CUs. Therefore need to wait for global or GDS memory operations
1325 // to complete to ensure they are visible to waves in the other CUs.
1326 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1327 // the same CU, so no need to wait for global memory as all waves in the
1328 // work-group access the same L1, nor wait for GDS as accesses are ordered
1329 // on a CU.
1330 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1331 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1332 (Scope == SIAtomicScope::WORKGROUP)) {
1333 // Same as GFX7 using agent scope.
1334 Scope = SIAtomicScope::AGENT;
1335 }
1336 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1337 // LDS memory operations.
1338 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1339 }
1340 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1341 IsCrossAddrSpaceOrdering, Pos);
1342 }
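// Illustrative example of the adjustments above (not part of the original source):
// with threadgroup split mode enabled, a workgroup-scope ordering that covers global
// memory is promoted to agent scope (waves of the work-group may be on different
// CUs), and the LDS bit is always dropped from AddrSpace because LDS cannot be
// allocated in that mode; the remaining work is delegated to
// SIGfx7CacheControl::insertWait.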
1343
1344 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1345 SIAtomicScope Scope,
1346 SIAtomicAddrSpace AddrSpace,
1347 Position Pos) const {
1348 if (!InsertCacheInv)
1349 return false;
1350
1351 bool Changed = false;
1352
1353 MachineBasicBlock &MBB = *MI->getParent();
1354 DebugLoc DL = MI->getDebugLoc();
1355
1356 if (Pos == Position::AFTER)
1357 ++MI;
1358
1359 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1360 switch (Scope) {
1361 case SIAtomicScope::SYSTEM:
1362 // Ensures that following loads will not see stale remote VMEM data or
1363 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1364 // CC will never be stale due to the local memory probes.
1365 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1366 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1367 // hardware does not reorder memory operations by the same wave with
1368 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1369 // remove any cache lines of earlier writes by the same wave and ensures
1370 // later reads by the same wave will refetch the cache lines.
1371 Changed = true;
1372 break;
1373 case SIAtomicScope::AGENT:
1374 // Same as GFX7.
1375 break;
1376 case SIAtomicScope::WORKGROUP:
1377 // In threadgroup split mode the waves of a work-group can be executing on
1378 // different CUs. Therefore need to invalidate the L1 which is per CU.
1379 // Otherwise in non-threadgroup split mode all waves of a work-group are
1380 // on the same CU, and so the L1 does not need to be invalidated.
1381 if (ST.isTgSplitEnabled()) {
1382 // Same as GFX7 using agent scope.
1383 Scope = SIAtomicScope::AGENT;
1384 }
1385 break;
1386 case SIAtomicScope::WAVEFRONT:
1387 case SIAtomicScope::SINGLETHREAD:
1388 // Same as GFX7.
1389 break;
1390 default:
1391 llvm_unreachable("Unsupported synchronization scope");
1392 }
1393 }
1394
1395 /// The scratch address space does not need the global memory cache
1396 /// to be flushed as all memory operations by the same thread are
1397 /// sequentially consistent, and no other thread can access scratch
1398 /// memory.
1399
1400 /// Other address spaces do not have a cache.
1401
1402 if (Pos == Position::AFTER)
1403 --MI;
1404
1405 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1406
1407 return Changed;
1408 }
1409
1410 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1411 SIAtomicScope Scope,
1412 SIAtomicAddrSpace AddrSpace,
1413 bool IsCrossAddrSpaceOrdering,
1414 Position Pos) const {
1415 bool Changed = false;
1416
1417 MachineBasicBlock &MBB = *MI->getParent();
1418 DebugLoc DL = MI->getDebugLoc();
1419
1420 if (Pos == Position::AFTER)
1421 ++MI;
1422
1423 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1424 switch (Scope) {
1425 case SIAtomicScope::SYSTEM:
1426 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1427 // hardware does not reorder memory operations by the same wave with
1428 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1429 // to initiate writeback of any dirty cache lines of earlier writes by the
1430 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1431 // writeback has completed.
1432 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1433 // Set SC bits to indicate system scope.
1434 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1435 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1436 // vmcnt(0)" needed by the "BUFFER_WBL2".
1437 Changed = true;
1438 break;
1439 case SIAtomicScope::AGENT:
1440 case SIAtomicScope::WORKGROUP:
1441 case SIAtomicScope::WAVEFRONT:
1442 case SIAtomicScope::SINGLETHREAD:
1443 // Same as GFX7.
1444 break;
1445 default:
1446 llvm_unreachable("Unsupported synchronization scope");
1447 }
1448 }
1449
1450 if (Pos == Position::AFTER)
1451 --MI;
1452
1453 Changed |=
1454 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1455 IsCrossAddrSpaceOrdering, Pos);
1456
1457 return Changed;
1458 }
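// Illustrative summary of the system-scope release sequence produced above (not part
// of the original source): a BUFFER_WBL2 whose cpol immediate carries SC0 | SC1 to
// indicate system scope, followed by the "S_WAITCNT vmcnt(0)" emitted by the
// inherited insertRelease/insertWait, and then the releasing operation itself; for
// agent scope and below only the inherited wait is emitted.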
1459
1460 bool SIGfx940CacheControl::enableLoadCacheBypass(
1461 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1462 SIAtomicAddrSpace AddrSpace) const {
1463 assert(MI->mayLoad() && !MI->mayStore());
1464 bool Changed = false;
1465
1466 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1467 switch (Scope) {
1468 case SIAtomicScope::SYSTEM:
1469 // Set SC bits to indicate system scope.
1470 Changed |= enableSC0Bit(MI);
1471 Changed |= enableSC1Bit(MI);
1472 break;
1473 case SIAtomicScope::AGENT:
1474 // Set SC bits to indicate agent scope.
1475 Changed |= enableSC1Bit(MI);
1476 break;
1477 case SIAtomicScope::WORKGROUP:
1478 // In threadgroup split mode the waves of a work-group can be executing on
1479 // different CUs, so the L1, which is per CU, must be bypassed. Otherwise,
1480 // in non-threadgroup split mode, all waves of a work-group are on the same
1481 // CU and the L1 does not need to be bypassed. Setting the SC bits to
1482 // indicate work-group scope handles both cases automatically.
1483 Changed |= enableSC0Bit(MI);
1484 break;
1485 case SIAtomicScope::WAVEFRONT:
1486 case SIAtomicScope::SINGLETHREAD:
1487 // Leave SC bits unset to indicate wavefront scope.
1488 break;
1489 default:
1490 llvm_unreachable("Unsupported synchronization scope");
1491 }
1492 }
1493
1494 /// The scratch address space does not need the global memory caches
1495 /// to be bypassed as all memory operations by the same thread are
1496 /// sequentially consistent, and no other thread can access scratch
1497 /// memory.
1498
1499 /// Other address spaces do not have a cache.
1500
1501 return Changed;
1502 }
1503
1504 bool SIGfx940CacheControl::enableStoreCacheBypass(
1505 const MachineBasicBlock::iterator &MI,
1506 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1507 assert(!MI->mayLoad() && MI->mayStore());
1508 bool Changed = false;
1509
1510 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1511 switch (Scope) {
1512 case SIAtomicScope::SYSTEM:
1513 // Set SC bits to indicate system scope.
1514 Changed |= enableSC0Bit(MI);
1515 Changed |= enableSC1Bit(MI);
1516 break;
1517 case SIAtomicScope::AGENT:
1518 // Set SC bits to indicate agent scope.
1519 Changed |= enableSC1Bit(MI);
1520 break;
1521 case SIAtomicScope::WORKGROUP:
1522 // Set SC bits to indicate workgroup scope.
1523 Changed |= enableSC0Bit(MI);
1524 break;
1525 case SIAtomicScope::WAVEFRONT:
1526 case SIAtomicScope::SINGLETHREAD:
1527 // Leave SC bits unset to indicate wavefront scope.
1528 break;
1529 default:
1530 llvm_unreachable("Unsupported synchronization scope");
1531 }
1532 }
1533
1534 /// The scratch address space does not need the global memory caches
1535 /// to be bypassed as all memory operations by the same thread are
1536 /// sequentially consistent, and no other thread can access scratch
1537 /// memory.
1538
1539 /// Other address spaces do not have a cache.
1540
1541 return Changed;
1542 }
1543
1544 bool SIGfx940CacheControl::enableRMWCacheBypass(
1545 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1546 SIAtomicAddrSpace AddrSpace) const {
1547 assert(MI->mayLoad() && MI->mayStore());
1548 bool Changed = false;
1549
1550 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1551 switch (Scope) {
1552 case SIAtomicScope::SYSTEM:
1553 // Set SC1 bit to indicate system scope.
1554 Changed |= enableSC1Bit(MI);
1555 break;
1556 case SIAtomicScope::AGENT:
1557 case SIAtomicScope::WORKGROUP:
1558 case SIAtomicScope::WAVEFRONT:
1559 case SIAtomicScope::SINGLETHREAD:
1560 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1561 // to indicate system or agent scope. The SC0 bit is used to indicate if
1562 // they are return or no-return. Leave SC1 bit unset to indicate agent
1563 // scope.
1564 break;
1565 default:
1566 llvm_unreachable("Unsupported synchronization scope");
1567 }
1568 }
1569
1570 return Changed;
1571 }
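// Summary of the SC-bit scope encoding applied by the bypass helpers above
// (illustrative; the individual cases are the authoritative mapping):
//   wavefront/singlethread : no SC bits
//   workgroup              : sc0
//   agent                  : sc1
//   system                 : sc0 sc1
// RMW atomics are the exception: they implicitly bypass the L1, SC0 encodes
// return/no-return, and only SC1 remains to distinguish agent from system
// scope.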
1572
1573 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1574 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1575 bool IsVolatile, bool IsNonTemporal) const {
1576 // Only handle load and store, not atomic read-modify-write instructions. The
1577 // latter use glc to indicate whether the atomic returns a result, so glc must
1578 // not be used for cache control.
1579 assert(MI->mayLoad() ^ MI->mayStore());
1580
1581 // Only update load and store, not LLVM IR atomic read-modify-write
1582 // instructions. The latter are always marked as volatile, so they cannot
1583 // sensibly be handled here without pessimizing all atomics. They also do not
1584 // support the nontemporal attribute.
1585 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1586
1587 bool Changed = false;
1588
1589 if (IsVolatile) {
1590 // Set SC bits to indicate system scope.
1591 Changed |= enableSC0Bit(MI);
1592 Changed |= enableSC1Bit(MI);
1593
1594 // Ensure operation has completed at system scope to cause all volatile
1595 // operations to be visible outside the program in a global order. Do not
1596 // request cross address space as only the global address space can be
1597 // observable outside the program, so no need to cause a waitcnt for LDS
1598 // address space operations.
1599 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1600 Position::AFTER);
1601
1602 return Changed;
1603 }
1604
1605 if (IsNonTemporal) {
1606 Changed |= enableNTBit(MI);
1607 return Changed;
1608 }
1609
1610 return Changed;
1611 }
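// For example (illustrative, assuming a simple volatile global load on
// GFX940), the handling above is expected to turn
//   global_load_dword v1, v[2:3], off
// into roughly
//   global_load_dword v1, v[2:3], off sc0 sc1
//   s_waitcnt vmcnt(0)
// while a nontemporal access only gains the nt bit and no wait.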
1612
1613 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1614 SIAtomicScope Scope,
1615 SIAtomicAddrSpace AddrSpace,
1616 Position Pos) const {
1617 if (!InsertCacheInv)
1618 return false;
1619
1620 bool Changed = false;
1621
1622 MachineBasicBlock &MBB = *MI->getParent();
1623 DebugLoc DL = MI->getDebugLoc();
1624
1625 if (Pos == Position::AFTER)
1626 ++MI;
1627
1628 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1629 switch (Scope) {
1630 case SIAtomicScope::SYSTEM:
1631 // Ensures that following loads will not see stale remote VMEM data or
1632 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1633 // CC will never be stale due to the local memory probes.
1634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1635 // Set SC bits to indicate system scope.
1636 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1637 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1638 // hardware does not reorder memory operations by the same wave with
1639 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1640 // remove any cache lines of earlier writes by the same wave and ensures
1641 // later reads by the same wave will refetch the cache lines.
1642 Changed = true;
1643 break;
1644 case SIAtomicScope::AGENT:
1645 // Ensures that following loads will not see stale remote data or local
1646 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1647 // due to the memory probes.
1648 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1649 // Set SC bits to indicate agent scope.
1650 .addImm(AMDGPU::CPol::SC1);
1651 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1652 // does not reorder memory operations with respect to a preceding buffer
1653 // invalidate. The invalidate is guaranteed to remove any cache lines of
1654 // earlier writes and ensures later reads will refetch the cache lines.
1655 Changed = true;
1656 break;
1657 case SIAtomicScope::WORKGROUP:
1658 // In threadgroup split mode the waves of a work-group can be executing on
1659 // different CUs. Therefore need to invalidate the L1 which is per CU.
1660 // Otherwise in non-threadgroup split mode all waves of a work-group are
1661 // on the same CU, and so the L1 does not need to be invalidated.
1662 if (ST.isTgSplitEnabled()) {
1663 // Ensures L1 is invalidated if in threadgroup split mode. In
1664 // non-threadgroup split mode it is a NOP, but there is no point generating
1665 // it when we know we are not in that mode.
1666 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1667 // Set SC bits to indicate work-group scope.
1668 .addImm(AMDGPU::CPol::SC0);
1669 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1670 // does not reorder memory operations with respect to a preceding buffer
1671 // invalidate. The invalidate is guaranteed to remove any cache lines of
1672 // earlier writes and ensures later reads will refetch the cache lines.
1673 Changed = true;
1674 }
1675 break;
1676 case SIAtomicScope::WAVEFRONT:
1677 case SIAtomicScope::SINGLETHREAD:
1678 // Could generate "BUFFER_INV" but it would do nothing as there are no
1679 // caches to invalidate.
1680 break;
1681 default:
1682 llvm_unreachable("Unsupported synchronization scope");
1683 }
1684 }
1685
1686 /// The scratch address space does not need the global memory cache
1687 /// to be flushed as all memory operations by the same thread are
1688 /// sequentially consistent, and no other thread can access scratch
1689 /// memory.
1690
1691 /// Other address spaces do not have a cache.
1692
1693 if (Pos == Position::AFTER)
1694 --MI;
1695
1696 return Changed;
1697 }
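// Illustrative acquire expansions produced above (assuming cache
// invalidation is not disabled):
//   system scope    : buffer_inv sc0 sc1
//   agent scope     : buffer_inv sc1
//   workgroup scope : buffer_inv sc0   (only in threadgroup split mode)
// Wavefront and singlethread scopes insert nothing.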
1698
1699 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1700 SIAtomicScope Scope,
1701 SIAtomicAddrSpace AddrSpace,
1702 bool IsCrossAddrSpaceOrdering,
1703 Position Pos) const {
1704 bool Changed = false;
1705
1706 MachineBasicBlock &MBB = *MI->getParent();
1707 DebugLoc DL = MI->getDebugLoc();
1708
1709 if (Pos == Position::AFTER)
1710 ++MI;
1711
1712 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1713 switch (Scope) {
1714 case SIAtomicScope::SYSTEM:
1715 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1716 // hardware does not reorder memory operations by the same wave with
1717 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1718 // to initiate writeback of any dirty cache lines of earlier writes by the
1719 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1720 // writeback has completed.
1721 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1722 // Set SC bits to indicate system scope.
1723 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1724 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1725 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1726 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1727 Changed = true;
1728 break;
1729 case SIAtomicScope::AGENT:
1730 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1731 // Set SC bits to indicate agent scope.
1732 .addImm(AMDGPU::CPol::SC1);
1733
1734 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1735 // SIAtomicScope::AGENT, the following insertWait will generate the
1736 // required "S_WAITCNT vmcnt(0)".
1737 Changed = true;
1738 break;
1739 case SIAtomicScope::WORKGROUP:
1740 case SIAtomicScope::WAVEFRONT:
1741 case SIAtomicScope::SINGLETHREAD:
1742 // Do not generate "BUFFER_WBL2" as there are no caches it would write
1743 // back, and it would require an otherwise unnecessary
1744 // "S_WAITCNT vmcnt(0)".
1745 break;
1746 default:
1747 llvm_unreachable("Unsupported synchronization scope");
1748 }
1749 }
1750
1751 if (Pos == Position::AFTER)
1752 --MI;
1753
1754 // Insert the "S_WAITCNT vmcnt(0)" required after any "BUFFER_WBL2" emitted
1755 // above, as well as any other waits this release requires.
1756 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1757 IsCrossAddrSpaceOrdering, Pos);
1758
1759 return Changed;
1760 }
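// Illustrative release expansion for agent scope (the trailing insertWait
// supplies the wait; an lgkmcnt(0) may also be required when LDS ordering is
// requested):
//   buffer_wbl2 sc1
//   s_waitcnt vmcnt(0)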
1761
1762 bool SIGfx10CacheControl::enableLoadCacheBypass(
1763 const MachineBasicBlock::iterator &MI,
1764 SIAtomicScope Scope,
1765 SIAtomicAddrSpace AddrSpace) const {
1766 assert(MI->mayLoad() && !MI->mayStore());
1767 bool Changed = false;
1768
1769 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1770 switch (Scope) {
1771 case SIAtomicScope::SYSTEM:
1772 case SIAtomicScope::AGENT:
1773 // Set the L0 and L1 cache policies to MISS_EVICT.
1774 // Note: there is no L2 cache coherent bypass control at the ISA level.
1775 Changed |= enableGLCBit(MI);
1776 Changed |= enableDLCBit(MI);
1777 break;
1778 case SIAtomicScope::WORKGROUP:
1779 // In WGP mode the waves of a work-group can be executing on either CU of
1780 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1781 // CU mode all waves of a work-group are on the same CU, and so the L0
1782 // does not need to be bypassed.
1783 if (!ST.isCuModeEnabled())
1784 Changed |= enableGLCBit(MI);
1785 break;
1786 case SIAtomicScope::WAVEFRONT:
1787 case SIAtomicScope::SINGLETHREAD:
1788 // No cache to bypass.
1789 break;
1790 default:
1791 llvm_unreachable("Unsupported synchronization scope");
1792 }
1793 }
1794
1795 /// The scratch address space does not need the global memory caches
1796 /// to be bypassed as all memory operations by the same thread are
1797 /// sequentially consistent, and no other thread can access scratch
1798 /// memory.
1799
1800 /// Other address spaces do not have a cache.
1801
1802 return Changed;
1803 }
1804
1805 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1806 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1807 bool IsVolatile, bool IsNonTemporal) const {
1808
1809 // Only handle load and store, not atomic read-modify-write instructions. The
1810 // latter use glc to indicate whether the atomic returns a result, so glc must
1811 // not be used for cache control.
1812 assert(MI->mayLoad() ^ MI->mayStore());
1813
1814 // Only update load and store, not LLVM IR atomic read-modify-write
1815 // instructions. The latter are always marked as volatile, so they cannot
1816 // sensibly be handled here without pessimizing all atomics. They also do not
1817 // support the nontemporal attribute.
1818 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1819
1820 bool Changed = false;
1821
1822 if (IsVolatile) {
1823 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1824 // and MISS_LRU for store instructions.
1825 // Note: there is no L2 cache coherent bypass control at the ISA level.
1826 if (Op == SIMemOp::LOAD) {
1827 Changed |= enableGLCBit(MI);
1828 Changed |= enableDLCBit(MI);
1829 }
1830
1831 // Ensure operation has completed at system scope to cause all volatile
1832 // operations to be visible outside the program in a global order. Do not
1833 // request cross address space as only the global address space can be
1834 // observable outside the program, so no need to cause a waitcnt for LDS
1835 // address space operations.
1836 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1837 Position::AFTER);
1838 return Changed;
1839 }
1840
1841 if (IsNonTemporal) {
1842 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1843 // and L2 cache policy to STREAM.
1844 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1845 // to MISS_EVICT and the L2 cache policy to STREAM.
1846 if (Op == SIMemOp::STORE)
1847 Changed |= enableGLCBit(MI);
1848 Changed |= enableSLCBit(MI);
1849
1850 return Changed;
1851 }
1852
1853 return Changed;
1854 }
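// Illustrative GFX10 encodings produced above for simple global accesses:
//   volatile load      : glc dlc set, followed by s_waitcnt vmcnt(0)
//   volatile store     : no cache bits, followed by s_waitcnt_vscnt null, 0
//   nontemporal load   : slc set
//   nontemporal store  : glc and slc set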
1855
1856 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1857 SIAtomicScope Scope,
1858 SIAtomicAddrSpace AddrSpace,
1859 SIMemOp Op,
1860 bool IsCrossAddrSpaceOrdering,
1861 Position Pos) const {
1862 bool Changed = false;
1863
1864 MachineBasicBlock &MBB = *MI->getParent();
1865 DebugLoc DL = MI->getDebugLoc();
1866
1867 if (Pos == Position::AFTER)
1868 ++MI;
1869
1870 bool VMCnt = false;
1871 bool VSCnt = false;
1872 bool LGKMCnt = false;
1873
1874 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1875 SIAtomicAddrSpace::NONE) {
1876 switch (Scope) {
1877 case SIAtomicScope::SYSTEM:
1878 case SIAtomicScope::AGENT:
1879 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1880 VMCnt |= true;
1881 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1882 VSCnt |= true;
1883 break;
1884 case SIAtomicScope::WORKGROUP:
1885 // In WGP mode the waves of a work-group can be executing on either CU of
1886 // the WGP. Therefore waits are needed for earlier operations to complete
1887 // so that they are visible to waves on the other CU, as the L0 is per CU.
1888 // Otherwise, in CU mode, all waves of a work-group are on the same CU and
1889 // share the same L0.
1890 if (!ST.isCuModeEnabled()) {
1891 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1892 VMCnt |= true;
1893 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1894 VSCnt |= true;
1895 }
1896 break;
1897 case SIAtomicScope::WAVEFRONT:
1898 case SIAtomicScope::SINGLETHREAD:
1899 // The L0 cache keeps all memory operations in order for
1900 // work-items in the same wavefront.
1901 break;
1902 default:
1903 llvm_unreachable("Unsupported synchronization scope");
1904 }
1905 }
1906
1907 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1908 switch (Scope) {
1909 case SIAtomicScope::SYSTEM:
1910 case SIAtomicScope::AGENT:
1911 case SIAtomicScope::WORKGROUP:
1912 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1913 // not needed as LDS operations for all waves are executed in a total
1914 // global ordering as observed by all waves. Required if also
1915 // synchronizing with global/GDS memory as LDS operations could be
1916 // reordered with respect to later global/GDS memory operations of the
1917 // same wave.
1918 LGKMCnt |= IsCrossAddrSpaceOrdering;
1919 break;
1920 case SIAtomicScope::WAVEFRONT:
1921 case SIAtomicScope::SINGLETHREAD:
1922 // The LDS keeps all memory operations in order for
1923 // the same wavefront.
1924 break;
1925 default:
1926 llvm_unreachable("Unsupported synchronization scope");
1927 }
1928 }
1929
1930 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1931 switch (Scope) {
1932 case SIAtomicScope::SYSTEM:
1933 case SIAtomicScope::AGENT:
1934 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1935 // is not needed as GDS operations for all waves are executed in a total
1936 // global ordering as observed by all waves. Required if also
1937 // synchronizing with global/LDS memory as GDS operations could be
1938 // reordered with respect to later global/LDS memory operations of the
1939 // same wave.
1940 LGKMCnt |= IsCrossAddrSpaceOrdering;
1941 break;
1942 case SIAtomicScope::WORKGROUP:
1943 case SIAtomicScope::WAVEFRONT:
1944 case SIAtomicScope::SINGLETHREAD:
1945 // The GDS keeps all memory operations in order for
1946 // the same work-group.
1947 break;
1948 default:
1949 llvm_unreachable("Unsupported synchronization scope");
1950 }
1951 }
1952
1953 if (VMCnt || LGKMCnt) {
1954 unsigned WaitCntImmediate =
1955 AMDGPU::encodeWaitcnt(IV,
1956 VMCnt ? 0 : getVmcntBitMask(IV),
1957 getExpcntBitMask(IV),
1958 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1959 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1960 Changed = true;
1961 }
1962
1963 if (VSCnt) {
1964 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1965 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1966 .addImm(0);
1967 Changed = true;
1968 }
1969
1970 if (Pos == Position::AFTER)
1971 --MI;
1972
1973 return Changed;
1974 }
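// For example (illustrative), an agent-scope wait covering loads and stores
// to global and LDS memory with cross address space ordering is expected to
// expand to roughly:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0
// In CU mode a workgroup-scope wait over global memory inserts nothing.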
1975
1976 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1977 SIAtomicScope Scope,
1978 SIAtomicAddrSpace AddrSpace,
1979 Position Pos) const {
1980 if (!InsertCacheInv)
1981 return false;
1982
1983 bool Changed = false;
1984
1985 MachineBasicBlock &MBB = *MI->getParent();
1986 DebugLoc DL = MI->getDebugLoc();
1987
1988 if (Pos == Position::AFTER)
1989 ++MI;
1990
1991 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1992 switch (Scope) {
1993 case SIAtomicScope::SYSTEM:
1994 case SIAtomicScope::AGENT:
1995 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1996 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1997 Changed = true;
1998 break;
1999 case SIAtomicScope::WORKGROUP:
2000 // In WGP mode the waves of a work-group can be executing on either CU of
2001 // the WGP. Therefore the L0, which is per CU, needs to be invalidated.
2002 // Otherwise, in CU mode, all waves of a work-group are on the same CU, and
2003 // so the L0 does not need to be invalidated.
2004 if (!ST.isCuModeEnabled()) {
2005 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2006 Changed = true;
2007 }
2008 break;
2009 case SIAtomicScope::WAVEFRONT:
2010 case SIAtomicScope::SINGLETHREAD:
2011 // No cache to invalidate.
2012 break;
2013 default:
2014 llvm_unreachable("Unsupported synchronization scope");
2015 }
2016 }
2017
2018 /// The scratch address space does not need the global memory cache
2019 /// to be flushed as all memory operations by the same thread are
2020 /// sequentially consistent, and no other thread can access scratch
2021 /// memory.
2022
2023 /// Other address spaces do not have a cache.
2024
2025 if (Pos == Position::AFTER)
2026 --MI;
2027
2028 return Changed;
2029 }
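// Illustrative GFX10 acquire expansions produced above:
//   system/agent scope : buffer_gl0_inv
//                        buffer_gl1_inv
//   workgroup scope    : buffer_gl0_inv   (only in WGP, i.e. non-CU, mode)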
2030
2031 bool SIGfx11CacheControl::enableLoadCacheBypass(
2032 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2033 SIAtomicAddrSpace AddrSpace) const {
2034 assert(MI->mayLoad() && !MI->mayStore());
2035 bool Changed = false;
2036
2037 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2038 switch (Scope) {
2039 case SIAtomicScope::SYSTEM:
2040 case SIAtomicScope::AGENT:
2041 // Set the L0 and L1 cache policies to MISS_EVICT.
2042 // Note: there is no L2 cache coherent bypass control at the ISA level.
2043 Changed |= enableGLCBit(MI);
2044 break;
2045 case SIAtomicScope::WORKGROUP:
2046 // In WGP mode the waves of a work-group can be executing on either CU of
2047 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2048 // CU mode all waves of a work-group are on the same CU, and so the L0
2049 // does not need to be bypassed.
2050 if (!ST.isCuModeEnabled())
2051 Changed |= enableGLCBit(MI);
2052 break;
2053 case SIAtomicScope::WAVEFRONT:
2054 case SIAtomicScope::SINGLETHREAD:
2055 // No cache to bypass.
2056 break;
2057 default:
2058 llvm_unreachable("Unsupported synchronization scope");
2059 }
2060 }
2061
2062 /// The scratch address space does not need the global memory caches
2063 /// to be bypassed as all memory operations by the same thread are
2064 /// sequentially consistent, and no other thread can access scratch
2065 /// memory.
2066
2067 /// Other address spaces do not have a cache.
2068
2069 return Changed;
2070 }
2071
2072 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2073 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2074 bool IsVolatile, bool IsNonTemporal) const {
2075
2076 // Only handle load and store, not atomic read-modify-write instructions. The
2077 // latter use glc to indicate whether the atomic returns a result, so glc must
2078 // not be used for cache control.
2079 assert(MI->mayLoad() ^ MI->mayStore());
2080
2081 // Only update load and store, not LLVM IR atomic read-modify-write
2082 // instructions. The latter are always marked as volatile, so they cannot
2083 // sensibly be handled here without pessimizing all atomics. They also do not
2084 // support the nontemporal attribute.
2085 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2086
2087 bool Changed = false;
2088
2089 if (IsVolatile) {
2090 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2091 // and MISS_LRU for store instructions.
2092 // Note: there is no L2 cache coherent bypass control at the ISA level.
2093 if (Op == SIMemOp::LOAD)
2094 Changed |= enableGLCBit(MI);
2095
2096 // Set MALL NOALLOC for load and store instructions.
2097 Changed |= enableDLCBit(MI);
2098
2099 // Ensure operation has completed at system scope to cause all volatile
2100 // operations to be visible outside the program in a global order. Do not
2101 // request cross address space as only the global address space can be
2102 // observable outside the program, so no need to cause a waitcnt for LDS
2103 // address space operations.
2104 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2105 Position::AFTER);
2106 return Changed;
2107 }
2108
2109 if (IsNonTemporal) {
2110 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2111 // and L2 cache policy to STREAM.
2112 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2113 // to MISS_EVICT and the L2 cache policy to STREAM.
2114 if (Op == SIMemOp::STORE)
2115 Changed |= enableGLCBit(MI);
2116 Changed |= enableSLCBit(MI);
2117
2118 // Set MALL NOALLOC for load and store instructions.
2119 Changed |= enableDLCBit(MI);
2120 return Changed;
2121 }
2122
2123 return Changed;
2124 }
2125
2126 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2127 if (AtomicPseudoMIs.empty())
2128 return false;
2129
2130 for (auto &MI : AtomicPseudoMIs)
2131 MI->eraseFromParent();
2132
2133 AtomicPseudoMIs.clear();
2134 return true;
2135 }
2136
2137 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2138 MachineBasicBlock::iterator &MI) {
2139 assert(MI->mayLoad() && !MI->mayStore());
2140
2141 bool Changed = false;
2142
2143 if (MOI.isAtomic()) {
2144 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2145 MOI.getOrdering() == AtomicOrdering::Acquire ||
2146 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2147 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2148 MOI.getOrderingAddrSpace());
2149 }
2150
2151 if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2152 Changed |= CC->insertWait(MI, MOI.getScope(),
2153 MOI.getOrderingAddrSpace(),
2154 SIMemOp::LOAD | SIMemOp::STORE,
2155 MOI.getIsCrossAddressSpaceOrdering(),
2156 Position::BEFORE);
2157
2158 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2159 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2160 Changed |= CC->insertWait(MI, MOI.getScope(),
2161 MOI.getInstrAddrSpace(),
2162 SIMemOp::LOAD,
2163 MOI.getIsCrossAddressSpaceOrdering(),
2164 Position::AFTER);
2165 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2166 MOI.getOrderingAddrSpace(),
2167 Position::AFTER);
2168 }
2169
2170 return Changed;
2171 }
2172
2173 // Atomic instructions already bypass caches to the scope specified by the
2174 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2175 // need additional treatment.
2176 Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2177 SIMemOp::LOAD, MOI.isVolatile(),
2178 MOI.isNonTemporal());
2179 return Changed;
2180 }
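// Schematically (illustrative; the exact instructions come from the target's
// SICacheControl), a sequentially consistent atomic load is rewritten as:
//   <wait for all earlier loads and stores>    ; insertWait    BEFORE
//   <load with scope cache-bypass bits set>    ; enableLoadCacheBypass
//   <wait for the load itself>                 ; insertWait    AFTER
//   <invalidate caches for the scope>          ; insertAcquire AFTER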
2181
2182 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2183 MachineBasicBlock::iterator &MI) {
2184 assert(!MI->mayLoad() && MI->mayStore());
2185
2186 bool Changed = false;
2187
2188 if (MOI.isAtomic()) {
2189 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2190 MOI.getOrdering() == AtomicOrdering::Release ||
2191 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2192 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2193 MOI.getOrderingAddrSpace());
2194 }
2195
2196 if (MOI.getOrdering() == AtomicOrdering::Release ||
2197 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2198 Changed |= CC->insertRelease(MI, MOI.getScope(),
2199 MOI.getOrderingAddrSpace(),
2200 MOI.getIsCrossAddressSpaceOrdering(),
2201 Position::BEFORE);
2202
2203 return Changed;
2204 }
2205
2206 // Atomic instructions already bypass caches to the scope specified by the
2207 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2208 // need additional treatment.
2209 Changed |= CC->enableVolatileAndOrNonTemporal(
2210 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2211 MOI.isNonTemporal());
2212 return Changed;
2213 }
2214
2215 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2216 MachineBasicBlock::iterator &MI) {
2217 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2218
2219 AtomicPseudoMIs.push_back(MI);
2220 bool Changed = false;
2221
2222 if (MOI.isAtomic()) {
2223 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2224 MOI.getOrdering() == AtomicOrdering::Release ||
2225 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2226 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2227 /// TODO: This relies on a barrier always generating a waitcnt
2228 /// for LDS to ensure it is not reordered with the completion of
2229 /// the preceding LDS operations. If the barrier had a memory
2230 /// ordering and memory scope, then the library would not need to
2231 /// generate a fence. Support for barrier could be added in this
2232 /// file. SIInsertWaitcnt.cpp could then stop unconditionally
2233 /// adding an S_WAITCNT before an S_BARRIER.
2234 Changed |= CC->insertRelease(MI, MOI.getScope(),
2235 MOI.getOrderingAddrSpace(),
2236 MOI.getIsCrossAddressSpaceOrdering(),
2237 Position::BEFORE);
2238
2239 // TODO: If both a release and an invalidate are needed they could be
2240 // combined into a single "BUFFER_WBINV*" instruction. This could be done by
2241 // reorganizing this code, or as part of optimizing the SIInsertWaitcnt pass
2242 // to track cache invalidate and write back instructions.
2243
2244 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2245 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2246 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2247 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2248 MOI.getOrderingAddrSpace(),
2249 Position::BEFORE);
2250
2251 return Changed;
2252 }
2253
2254 return Changed;
2255 }
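// As a concrete but illustrative example, on GFX10 in WGP mode an IR
//   fence syncscope("agent") acq_rel
// is expected to be replaced by roughly:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0
//   buffer_gl0_inv
//   buffer_gl1_inv
// with the ATOMIC_FENCE pseudo itself removed by removeAtomicPseudoMIs().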
2256
2257 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2258 MachineBasicBlock::iterator &MI) {
2259 assert(MI->mayLoad() && MI->mayStore());
2260
2261 bool Changed = false;
2262
2263 if (MOI.isAtomic()) {
2264 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265 MOI.getOrdering() == AtomicOrdering::Acquire ||
2266 MOI.getOrdering() == AtomicOrdering::Release ||
2267 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2268 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2269 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2270 MOI.getInstrAddrSpace());
2271 }
2272
2273 if (MOI.getOrdering() == AtomicOrdering::Release ||
2274 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2275 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2276 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2277 Changed |= CC->insertRelease(MI, MOI.getScope(),
2278 MOI.getOrderingAddrSpace(),
2279 MOI.getIsCrossAddressSpaceOrdering(),
2280 Position::BEFORE);
2281
2282 if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2283 MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2284 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2285 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2286 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2287 Changed |= CC->insertWait(MI, MOI.getScope(),
2288 MOI.getInstrAddrSpace(),
2289 isAtomicRet(*MI) ? SIMemOp::LOAD :
2290 SIMemOp::STORE,
2291 MOI.getIsCrossAddressSpaceOrdering(),
2292 Position::AFTER);
2293 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2294 MOI.getOrderingAddrSpace(),
2295 Position::AFTER);
2296 }
2297
2298 return Changed;
2299 }
2300
2301 return Changed;
2302 }
2303
2304 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2305 bool Changed = false;
2306
2307 SIMemOpAccess MOA(MF);
2308 CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2309
2310 for (auto &MBB : MF) {
2311 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2312
2313 // Unbundle instructions after the post-RA scheduler.
2314 if (MI->isBundle() && MI->mayLoadOrStore()) {
2315 MachineBasicBlock::instr_iterator II(MI->getIterator());
2316 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2317 I != E && I->isBundledWithPred(); ++I) {
2318 I->unbundleFromPred();
2319 for (MachineOperand &MO : I->operands())
2320 if (MO.isReg())
2321 MO.setIsInternalRead(false);
2322 }
2323
2324 MI->eraseFromParent();
2325 MI = II->getIterator();
2326 }
2327
2328 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2329 continue;
2330
2331 if (const auto &MOI = MOA.getLoadInfo(MI))
2332 Changed |= expandLoad(MOI.value(), MI);
2333 else if (const auto &MOI = MOA.getStoreInfo(MI))
2334 Changed |= expandStore(MOI.value(), MI);
2335 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2336 Changed |= expandAtomicFence(MOI.value(), MI);
2337 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2338 Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI);
2339 }
2340 }
2341
2342 Changed |= removeAtomicPseudoMIs();
2343 return Changed;
2344 }
2345
2346 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2347
2348 char SIMemoryLegalizer::ID = 0;
2349 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2350
2351 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2352 return new SIMemoryLegalizer();
2353 }
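// Usage note (illustrative): the pass can be exercised in isolation on MIR
// with, e.g.,
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=si-memory-legalizer input.mir -o -
// where "si-memory-legalizer" is the DEBUG_TYPE registered with
// INITIALIZE_PASS above.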
2354