//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs miscellaneous late AMDGPU optimizations on IR *just*
/// before instruction selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to cases where scalar sub-dword loads are only naturally aligned,
// not dword aligned.
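//
// For illustration, a uniform, naturally aligned sub-dword load such as
//   %v = load i16, i16 addrspace(4)* %p, align 2
// where %p is known to be 2 bytes past a dword-aligned base is rewritten into
// a dword-aligned i32 load of the enclosing dword, followed by a shift and a
// truncate (see visitLoadInst below).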
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check whether the specified value is at least DWORD (4-byte) aligned,
  // i.e. whether its low two bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD-aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD-aligned, it is not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

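  // From here on, Adjust is non-zero: the loaded value sits Adjust bytes into
  // its enclosing DWORD. For illustration, an i16 load at Offset == 6 from a
  // DWORD-aligned base becomes an i32 load at Offset == 4; the wanted bits are
  // then extracted with a logical shift right by 16 and a truncate back to the
  // original type.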
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
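  // The original !range metadata, if any, describes the narrow value and does
  // not hold for the widened i32 load, so drop it.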
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}