//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
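//
// Shared utilities for AMDGPU LDS (addrspace(3)) variable lowering and for
// MemorySSA-based queries about whether a load may be clobbered.
//
//===----------------------------------------------------------------------===//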

#include "AMDGPUMemoryUtils.h"
#include "AMDGPU.h"
#include "AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"

#define DEBUG_TYPE "amdgpu-memory-utils"

using namespace llvm;

namespace llvm {

namespace AMDGPU {

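/// Return the alignment to use for \p GV: its known pointer alignment, or the
/// ABI alignment of its value type if no alignment is known.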
Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

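/// Walk the use graph of \p U through constant expressions and collect into
/// \p InstUsers every instruction user that belongs to \p F.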
static void collectFunctionUses(User *U, const Function *F,
                                SetVector<Instruction *> &InstUsers) {
  SmallVector<User *> Stack{U};

  while (!Stack.empty()) {
    U = Stack.pop_back_val();

    if (auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction() == F)
        InstUsers.insert(I);
      continue;
    }

    if (!isa<ConstantExpr>(U))
      continue;

    append_range(Stack, U->users());
  }
}

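/// Expand the constant expression \p C into instructions at every use that is
/// reachable from an instruction in \p F.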
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
  SetVector<Instruction *> InstUsers;

  collectFunctionUses(C, F, InstUsers);
  for (Instruction *I : InstUsers) {
    convertConstantExprsToInstructions(I, C);
  }
}

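/// Decide whether the LDS variable \p GV should be moved into an LDS struct.
/// For kernel lowering (\p F is a kernel), this is the case when \p GV is used
/// from \p F; for module lowering (\p F is null), when \p GV is used from any
/// non-kernel function.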
static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                                   const Function *F) {
  // We are not interested in kernel LDS lowering for module LDS itself.
  if (F && GV.getName() == "llvm.amdgcn.module.lds")
    return false;

  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (isa<GlobalValue>(V)) {
      // This use of the LDS variable is the initializer of a global variable.
      // That is ill-formed: the address of an LDS variable is kernel
      // dependent and unknown until runtime, so it cannot be written into a
      // global variable.
      continue;
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, so we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        // For module LDS lowering, lowering is required if the user
        // instruction is from a non-kernel function.
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    // User V should be a constant; recursively visit its users.
    assert(isa<Constant>(V) && "Expected a constant.");
    append_range(Stack, V->users());
  }

  return Ret;
}

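/// Collect the LDS (addrspace(3)) globals in \p M that should be lowered into
/// a struct for kernel \p F, or for the module when \p F is null. Variables
/// without an initializer (CUDA/HIP extern __shared__), variables with a
/// non-undef initializer, and constants are skipped.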
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                                   const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
      continue;
    }
    if (!GV.hasInitializer()) {
      // An addrspace(3) variable without an initializer implies CUDA/HIP
      // extern __shared__. The semantics for such a variable appear to be
      // that all extern __shared__ variables alias one another, in which
      // case this transform is not required.
      continue;
    }
    if (!isa<UndefValue>(GV.getInitializer())) {
      // Initializers are unimplemented for LDS address space.
      // Leave such variables in place for consistent error reporting.
      continue;
    }
    if (GV.isConstant()) {
      // A constant undef variable can't be written to, and any load is
      // undef, so it should be eliminated by the optimizer. It could be
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (!shouldLowerLDSToStruct(GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
  }
  return LocalVars;
}

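/// Given a MemoryDef \p Def that MemorySSA reports as clobbering \p Ptr, check
/// whether it really writes memory that may alias \p Ptr. Fences, AMDGPU
/// barrier intrinsics, and atomics that provably do not alias \p Ptr are not
/// treated as real clobbers.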
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
  Instruction *DefInst = Def->getMemoryInst();

  if (isa<FenceInst>(DefInst))
    return false;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_wave_barrier:
    case Intrinsic::amdgcn_sched_barrier:
      return false;
    default:
      break;
    }
  }

  // Ignore atomics not aliasing with the original load; any atomic is a
  // universal MemoryDef from MSSA's point of view, just like a fence.
  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
  };

  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
    return false;

  return true;
}

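/// Check whether \p Load may be clobbered by a real memory write between
/// function entry and the load, by walking MemorySSA upwards from the load's
/// nearest dominating clobbering access.
///
/// A minimal usage sketch (assumes MemorySSA and AAResults have already been
/// computed for the function containing \p Load):
/// \code
///   if (!isClobberedInFunction(Load, &MSSA, &AA))
///     ; // No real write clobbers the loaded location before this load.
/// \endcode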
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
                           AAResults *AA) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access; it will be either
  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
  // a MemoryPhi if several MemoryDefs can define this memory state. In the
  // phi case, add all incoming defs to the worklist and keep walking up,
  // checking all the definitions of this memory location until the root. If
  // all the defs are exhausted and we reach the entry state, there is no
  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
  // considers clobbers even though they do not actually write any memory.
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');

      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
        LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << " -> no clobber\n");
  return false;
}

} // end namespace AMDGPU

} // end namespace llvm