146c3d5cbSMichael Liao //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
246c3d5cbSMichael Liao //
346c3d5cbSMichael Liao // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
446c3d5cbSMichael Liao // See https://llvm.org/LICENSE.txt for license information.
546c3d5cbSMichael Liao // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
646c3d5cbSMichael Liao //
746c3d5cbSMichael Liao //===----------------------------------------------------------------------===//
846c3d5cbSMichael Liao //
946c3d5cbSMichael Liao /// \file
1046c3d5cbSMichael Liao /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
1146c3d5cbSMichael Liao /// selection.
1246c3d5cbSMichael Liao //
1346c3d5cbSMichael Liao //===----------------------------------------------------------------------===//
1446c3d5cbSMichael Liao
1546c3d5cbSMichael Liao #include "AMDGPU.h"
1646c3d5cbSMichael Liao #include "llvm/Analysis/AssumptionCache.h"
1746c3d5cbSMichael Liao #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
1846c3d5cbSMichael Liao #include "llvm/Analysis/ValueTracking.h"
1946c3d5cbSMichael Liao #include "llvm/IR/IRBuilder.h"
2046c3d5cbSMichael Liao #include "llvm/IR/InstVisitor.h"
2146c3d5cbSMichael Liao #include "llvm/InitializePasses.h"
2246c3d5cbSMichael Liao #include "llvm/Support/CommandLine.h"
2346c3d5cbSMichael Liao #include "llvm/Support/KnownBits.h"
2446c3d5cbSMichael Liao #include "llvm/Transforms/Utils/Local.h"
2546c3d5cbSMichael Liao
2646c3d5cbSMichael Liao #define DEBUG_TYPE "amdgpu-late-codegenprepare"
2746c3d5cbSMichael Liao
2846c3d5cbSMichael Liao using namespace llvm;
2946c3d5cbSMichael Liao
// Scalar load widening needs running after load-store-vectorizer as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
//
// Defaults to enabled; ReallyHidden since it exists for debugging/bisection
// rather than as a user-facing tuning knob.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
3946c3d5cbSMichael Liao
4046c3d5cbSMichael Liao namespace {
4146c3d5cbSMichael Liao
class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  // Current module and its data layout, cached in doInitialization().
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // Per-function analyses, refreshed at the start of runOnFunction().
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    // This pass only rewrites loads in place; it preserves all analyses.
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  // Default visitor: instructions without a dedicated handler are left
  // untouched and reported as unchanged.
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned, i.e. its two
  // low bits are known to be zero.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};
8046c3d5cbSMichael Liao
8146c3d5cbSMichael Liao } // end anonymous namespace
8246c3d5cbSMichael Liao
doInitialization(Module & M)8346c3d5cbSMichael Liao bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
8446c3d5cbSMichael Liao Mod = &M;
8546c3d5cbSMichael Liao DL = &Mod->getDataLayout();
8646c3d5cbSMichael Liao return false;
8746c3d5cbSMichael Liao }
8846c3d5cbSMichael Liao
runOnFunction(Function & F)8946c3d5cbSMichael Liao bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
9046c3d5cbSMichael Liao if (skipFunction(F))
9146c3d5cbSMichael Liao return false;
9246c3d5cbSMichael Liao
9346c3d5cbSMichael Liao AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
9446c3d5cbSMichael Liao DA = &getAnalysis<LegacyDivergenceAnalysis>();
9546c3d5cbSMichael Liao
9646c3d5cbSMichael Liao bool Changed = false;
9746c3d5cbSMichael Liao for (auto &BB : F)
98*4bef0304SKazu Hirata for (Instruction &I : llvm::make_early_inc_range(BB))
99*4bef0304SKazu Hirata Changed |= visit(I);
10046c3d5cbSMichael Liao
10146c3d5cbSMichael Liao return Changed;
10246c3d5cbSMichael Liao }
10346c3d5cbSMichael Liao
canWidenScalarExtLoad(LoadInst & LI) const10446c3d5cbSMichael Liao bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
10546c3d5cbSMichael Liao unsigned AS = LI.getPointerAddressSpace();
10646c3d5cbSMichael Liao // Skip non-constant address space.
10746c3d5cbSMichael Liao if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10846c3d5cbSMichael Liao AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
10946c3d5cbSMichael Liao return false;
11046c3d5cbSMichael Liao // Skip non-simple loads.
11146c3d5cbSMichael Liao if (!LI.isSimple())
11246c3d5cbSMichael Liao return false;
11346c3d5cbSMichael Liao auto *Ty = LI.getType();
11446c3d5cbSMichael Liao // Skip aggregate types.
11546c3d5cbSMichael Liao if (Ty->isAggregateType())
11646c3d5cbSMichael Liao return false;
11746c3d5cbSMichael Liao unsigned TySize = DL->getTypeStoreSize(Ty);
11846c3d5cbSMichael Liao // Only handle sub-DWORD loads.
11946c3d5cbSMichael Liao if (TySize >= 4)
12046c3d5cbSMichael Liao return false;
12146c3d5cbSMichael Liao // That load must be at least naturally aligned.
12246c3d5cbSMichael Liao if (LI.getAlign() < DL->getABITypeAlign(Ty))
12346c3d5cbSMichael Liao return false;
12446c3d5cbSMichael Liao // It should be uniform, i.e. a scalar load.
12546c3d5cbSMichael Liao return DA->isUniform(&LI);
12646c3d5cbSMichael Liao }
12746c3d5cbSMichael Liao
// Widen a naturally-aligned, sub-DWORD scalar load from the constant address
// space into a DWORD-aligned i32 load at a rebased pointer, then shift and
// truncate the loaded value back to the original type. Returns true if the
// load was changed.
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if that load is already aligned on DWORD at least as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  // Decompose the address into Base + constant Offset.
  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If that base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  // Adjust is the byte offset of the load within its containing DWORD.
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment could be promoted with a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  // Rebase onto the DWORD-aligned address (Offset - Adjust bytes past Base)
  // and load a full i32 from there.
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // Range metadata described the narrow value and does not hold for the
  // widened i32 load, so drop it.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Recover the original value: shift the extra low bytes out, truncate to
  // the original bit width, then bitcast back to the load's type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}
18346c3d5cbSMichael Liao
// Register the pass and its analysis dependencies with the legacy pass
// manager.
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;
19246c3d5cbSMichael Liao
// Factory entry point used by the AMDGPU target to add this pass to the
// codegen pipeline.
FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
196