//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does miscellaneous late AMDGPU IR optimizations just before
/// instruction selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are only naturally aligned
// rather than dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. whether its
  // two least significant bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      // Advance the iterator before visiting, as the visit may erase the
      // current instruction.
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

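// Check whether the given load is a candidate for widening: a simple, uniform
// (scalar) sub-DWORD load from the constant address space that is at least
// naturally aligned.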
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

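// Widen a qualifying sub-DWORD load into a DWORD-aligned 32-bit load so that
// it can still be selected as a scalar load even though the original pointer
// lacks DWORD alignment.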
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transformation.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjustment, the original alignment can simply be promoted
    // to DWORD alignment.
    LI.setAlignment(Align(4));
    return true;
  }

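  // Otherwise, widen the access to the enclosing DWORD: load the containing
  // 32-bit word from the DWORD-aligned address, shift the loaded value right
  // by the byte offset within that word, and truncate back to the original
  // width. For example, a naturally aligned i16 load at byte offset 6 from a
  // DWORD-aligned base becomes an i32 load at offset 4, followed by a logical
  // shift right by 16 and a truncation to i16.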
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
167af732203SDimitry Andric   auto *NewPtr = IRB.CreateBitCast(
168*5f7ddb14SDimitry Andric       IRB.CreateConstGEP1_64(
169*5f7ddb14SDimitry Andric           IRB.getInt8Ty(),
170*5f7ddb14SDimitry Andric           IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
171af732203SDimitry Andric           Offset - Adjust),
172af732203SDimitry Andric       Int32PtrTy);
173*5f7ddb14SDimitry Andric   LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
174af732203SDimitry Andric   NewLd->copyMetadata(LI);
175af732203SDimitry Andric   NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Extract the original value: shift right by the bit offset within the
  // DWORD, truncate to the loaded width, and bitcast to the original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  // The original load is now dead; delete it along with any trivially dead
  // pointer arithmetic feeding it.
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}