//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// but not dword aligned.
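//
// For example (a sketch of the rewrite done in visitLoadInst below): a
// uniform, naturally aligned i16 load at byte offset 2 from a dword-aligned
// base becomes a dword-aligned i32 load at offset 0, followed by a lshr by
// 16 and a trunc back to i16.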
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  // Subtargets with native scalar sub-dword loads don't need this widening.
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // That load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD-aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD-aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // A zero adjust means the load is already dword-aligned relative to the
    // base, so just promote the original alignment.
    LI.setAlignment(Align(4));
    return true;
  }
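
  // Otherwise, widen to a load of the dword containing the value: load the
  // aligned dword at (Offset - Adjust), then recover the original value with
  // a shift and truncate. AMDGPU is little-endian, so the sub-dword value
  // sits Adjust bytes above the dword base.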
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The widened load covers extra bytes, so the original range metadata no
  // longer applies.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}