//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach. It should eventually be removed.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
"llvm/Target/TargetOptions.h" 89 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 90 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 91 #include "llvm/Transforms/Utils/Local.h" 92 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 93 #include "llvm/Transforms/Utils/SizeOpts.h" 94 #include <algorithm> 95 #include <cassert> 96 #include <cstdint> 97 #include <iterator> 98 #include <limits> 99 #include <memory> 100 #include <utility> 101 #include <vector> 102 103 using namespace llvm; 104 using namespace llvm::PatternMatch; 105 106 #define DEBUG_TYPE "codegenprepare" 107 108 STATISTIC(NumBlocksElim, "Number of blocks eliminated"); 109 STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); 110 STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); 111 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " 112 "sunken Cmps"); 113 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " 114 "of sunken Casts"); 115 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " 116 "computations were sunk"); 117 STATISTIC(NumMemoryInstsPhiCreated, 118 "Number of phis created when address " 119 "computations were sunk to memory instructions"); 120 STATISTIC(NumMemoryInstsSelectCreated, 121 "Number of select created when address " 122 "computations were sunk to memory instructions"); 123 STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); 124 STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); 125 STATISTIC(NumAndsAdded, 126 "Number of and mask instructions added to form ext loads"); 127 STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); 128 STATISTIC(NumRetsDup, "Number of return instructions duplicated"); 129 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); 130 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); 131 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); 132 133 static cl::opt<bool> DisableBranchOpts( 134 "disable-cgp-branch-opts", cl::Hidden, cl::init(false), 135 cl::desc("Disable branch optimizations in CodeGenPrepare")); 136 137 static cl::opt<bool> 138 DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false), 139 cl::desc("Disable GC optimizations in CodeGenPrepare")); 140 141 static cl::opt<bool> DisableSelectToBranch( 142 "disable-cgp-select2branch", cl::Hidden, cl::init(false), 143 cl::desc("Disable select to branch conversion.")); 144 145 static cl::opt<bool> AddrSinkUsingGEPs( 146 "addr-sink-using-gep", cl::Hidden, cl::init(true), 147 cl::desc("Address sinking in CGP using GEPs.")); 148 149 static cl::opt<bool> EnableAndCmpSinking( 150 "enable-andcmp-sinking", cl::Hidden, cl::init(true), 151 cl::desc("Enable sinkinig and/cmp into branches.")); 152 153 static cl::opt<bool> DisableStoreExtract( 154 "disable-cgp-store-extract", cl::Hidden, cl::init(false), 155 cl::desc("Disable store(extract) optimizations in CodeGenPrepare")); 156 157 static cl::opt<bool> StressStoreExtract( 158 "stress-cgp-store-extract", cl::Hidden, cl::init(false), 159 cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); 160 161 static cl::opt<bool> DisableExtLdPromotion( 162 "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), 163 cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in " 164 "CodeGenPrepare")); 165 166 static cl::opt<bool> StressExtLdPromotion( 167 "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), 168 
    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
             "optimization in CodeGenPrepare"));

static cl::opt<bool> DisablePreheaderProtect(
    "disable-preheader-prot", cl::Hidden, cl::init(false),
    cl::desc("Disable protection against removing loop preheaders"));

static cl::opt<bool> ProfileGuidedSectionPrefix(
    "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
    cl::desc("Use profile info to add section prefix for hot/cold functions"));

static cl::opt<bool> ProfileUnknownInSpecialSection(
    "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
    cl::ZeroOrMore,
    cl::desc("In profiling modes like sampleFDO, if a function doesn't have "
             "a profile, we cannot tell for sure that the function is cold, "
             "because it may be a newly added function that was never "
             "sampled. With this flag enabled, the compiler can put such "
             "profile-unknown functions into a special section, so the "
             "runtime system can choose to handle them differently from the "
             ".text section, for example to save RAM."));

static cl::opt<unsigned> FreqRatioToSkipMerge(
    "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
    cl::desc("Skip merging empty blocks if (frequency of empty block) / "
             "(frequency of destination block) is greater than this ratio"));

static cl::opt<bool> ForceSplitStore(
    "force-split-store", cl::Hidden, cl::init(false),
    cl::desc("Force store splitting no matter what the target query says."));

static cl::opt<bool> EnableTypePromotionMerge(
    "cgp-type-promotion-merge", cl::Hidden,
    cl::desc("Enable merging of redundant sexts when one is dominating"
             " the other."), cl::init(true));

static cl::opt<bool> DisableComplexAddrModes(
    "disable-complex-addr-modes", cl::Hidden, cl::init(false),
    cl::desc("Disables combining addressing modes with different parts "
             "in optimizeMemoryInst."));

static cl::opt<bool>
    AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
                    cl::desc("Allow creation of Phis in Address sinking."));

static cl::opt<bool>
    AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
                       cl::desc("Allow creation of selects in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseReg(
    "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseReg field in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseGV(
    "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseGV field in Address sinking."));

static cl::opt<bool> AddrSinkCombineBaseOffs(
    "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of BaseOffs field in Address sinking."));

static cl::opt<bool> AddrSinkCombineScaledReg(
    "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
    cl::desc("Allow combining of ScaledReg field in Address sinking."));

static cl::opt<bool>
    EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
                         cl::init(true),
                         cl::desc("Enable splitting large offset of GEP."));

static cl::opt<bool> EnableICMP_EQToICMP_ST(
    "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
    cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));

static cl::opt<bool>
    VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
                     cl::desc("Enable BFI update verification for "
"CodeGenPrepare.")); 246 247 static cl::opt<bool> OptimizePhiTypes( 248 "cgp-optimize-phi-types", cl::Hidden, cl::init(false), 249 cl::desc("Enable converting phi types in CodeGenPrepare")); 250 251 namespace { 252 253 enum ExtType { 254 ZeroExtension, // Zero extension has been seen. 255 SignExtension, // Sign extension has been seen. 256 BothExtension // This extension type is used if we saw sext after 257 // ZeroExtension had been set, or if we saw zext after 258 // SignExtension had been set. It makes the type 259 // information of a promoted instruction invalid. 260 }; 261 262 using SetOfInstrs = SmallPtrSet<Instruction *, 16>; 263 using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>; 264 using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>; 265 using SExts = SmallVector<Instruction *, 16>; 266 using ValueToSExts = DenseMap<Value *, SExts>; 267 268 class TypePromotionTransaction; 269 270 class CodeGenPrepare : public FunctionPass { 271 const TargetMachine *TM = nullptr; 272 const TargetSubtargetInfo *SubtargetInfo; 273 const TargetLowering *TLI = nullptr; 274 const TargetRegisterInfo *TRI; 275 const TargetTransformInfo *TTI = nullptr; 276 const TargetLibraryInfo *TLInfo; 277 const LoopInfo *LI; 278 std::unique_ptr<BlockFrequencyInfo> BFI; 279 std::unique_ptr<BranchProbabilityInfo> BPI; 280 ProfileSummaryInfo *PSI; 281 282 /// As we scan instructions optimizing them, this is the next instruction 283 /// to optimize. Transforms that can invalidate this should update it. 284 BasicBlock::iterator CurInstIterator; 285 286 /// Keeps track of non-local addresses that have been sunk into a block. 287 /// This allows us to avoid inserting duplicate code for blocks with 288 /// multiple load/stores of the same address. The usage of WeakTrackingVH 289 /// enables SunkAddrs to be treated as a cache whose entries can be 290 /// invalidated if a sunken address computation has been erased. 291 ValueMap<Value*, WeakTrackingVH> SunkAddrs; 292 293 /// Keeps track of all instructions inserted for the current function. 294 SetOfInstrs InsertedInsts; 295 296 /// Keeps track of the type of the related instruction before their 297 /// promotion for the current function. 298 InstrToOrigTy PromotedInsts; 299 300 /// Keep track of instructions removed during promotion. 301 SetOfInstrs RemovedInsts; 302 303 /// Keep track of sext chains based on their initial value. 304 DenseMap<Value *, Instruction *> SeenChainsForSExt; 305 306 /// Keep track of GEPs accessing the same data structures such as structs or 307 /// arrays that are candidates to be split later because of their large 308 /// size. 309 MapVector< 310 AssertingVH<Value>, 311 SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>> 312 LargeOffsetGEPMap; 313 314 /// Keep track of new GEP base after splitting the GEPs having large offset. 315 SmallSet<AssertingVH<Value>, 2> NewGEPBases; 316 317 /// Map serial numbers to Large offset GEPs. 318 DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID; 319 320 /// Keep track of SExt promoted. 321 ValueToSExts ValToSExtendedUses; 322 323 /// True if the function has the OptSize attribute. 324 bool OptSize; 325 326 /// DataLayout for the Function being processed. 327 const DataLayout *DL = nullptr; 328 329 /// Building the dominator tree can be expensive, so we only build it 330 /// lazily and update it when required. 
  std::unique_ptr<DominatorTree> DT;

public:
  static char ID; // Pass identification, replacement for typeid

  CodeGenPrepare() : FunctionPass(ID) {
    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "CodeGen Prepare"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // FIXME: When we can selectively preserve passes, preserve the domtree.
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

private:
  template <typename F>
  void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
    // Substituting can cause recursive simplifications, which can invalidate
    // our iterator. Use a WeakTrackingVH to hold onto it in case this
    // happens.
    Value *CurValue = &*CurInstIterator;
    WeakTrackingVH IterHandle(CurValue);

    f();

    // If the iterator instruction was recursively deleted, start over at the
    // start of the block.
    if (IterHandle != CurValue) {
      CurInstIterator = BB->begin();
      SunkAddrs.clear();
    }
  }

  // Get the DominatorTree, building if necessary.
  DominatorTree &getDT(Function &F) {
    if (!DT)
      DT = std::make_unique<DominatorTree>(F);
    return *DT;
  }

  void removeAllAssertingVHReferences(Value *V);
  bool eliminateFallThrough(Function &F);
  bool eliminateMostlyEmptyBlocks(Function &F);
  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
  void eliminateMostlyEmptyBlock(BasicBlock *BB);
  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                     bool isPreheader);
  bool makeBitReverse(Instruction &I);
  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
  bool optimizeInst(Instruction *I, bool &ModifiedDT);
  bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                          Type *AccessTy, unsigned AddrSpace);
  bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
  bool optimizeInlineAsmInst(CallInst *CS);
  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
  bool optimizeExt(Instruction *&I);
  bool optimizeExtUses(Instruction *I);
  bool optimizeLoadExt(LoadInst *Load);
  bool optimizeShiftInst(BinaryOperator *BO);
  bool optimizeFunnelShift(IntrinsicInst *Fsh);
  bool optimizeSelectInst(SelectInst *SI);
  bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
  bool optimizeSwitchInst(SwitchInst *SI);
  bool optimizeExtractElementInst(Instruction *Inst);
  bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
  bool fixupDbgValue(Instruction *I);
  bool placeDbgValues(Function &F);
  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
  bool tryToPromoteExts(TypePromotionTransaction &TPT,
                        const SmallVectorImpl<Instruction *> &Exts,
                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
                        unsigned CreatedInstsCost = 0);
  bool mergeSExts(Function &F);
  bool splitLargeGEPOffsets();
  bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
                       SmallPtrSetImpl<Instruction *> &DeletedInstrs);
  bool optimizePhiTypes(Function &F);
  bool performAddressTypePromotion(
      Instruction *&Inst,
      bool AllowPromotionWithoutCommonHeader,
      bool HasPromoted, TypePromotionTransaction &TPT,
      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
  bool splitBranchCondition(Function &F, bool &ModifiedDT);
  bool simplifyOffsetableRelocate(GCStatepointInst &I);

  bool tryToSinkFreeOperands(Instruction *I);
  bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
                                   Value *Arg1, CmpInst *Cmp,
                                   Intrinsic::ID IID);
  bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
  bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
  bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
  void verifyBFIUpdates(Function &F);
};

} // end anonymous namespace

char CodeGenPrepare::ID = 0;

INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
                      "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
                    "Optimize for code generation", false, false)

FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }

bool CodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  DL = &F.getParent()->getDataLayout();

  bool EverMadeChange = false;
  // Clear per function information.
  InsertedInsts.clear();
  PromotedInsts.clear();

  TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  SubtargetInfo = TM->getSubtargetImpl(F);
  TLI = SubtargetInfo->getTargetLowering();
  TRI = SubtargetInfo->getRegisterInfo();
  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BPI.reset(new BranchProbabilityInfo(F, *LI));
  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  OptSize = F.hasOptSize();
  if (ProfileGuidedSectionPrefix) {
    if (PSI->isFunctionHotInCallGraph(&F, *BFI))
      F.setSectionPrefix(".hot");
    else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
      F.setSectionPrefix(".unlikely");
    else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
             PSI->isFunctionHotnessUnknown(F))
      F.setSectionPrefix(".unknown");
  }

  /// This optimization identifies DIV instructions that can be
  /// profitably bypassed and carried out with a shorter, faster divide.
  if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
    const DenseMap<unsigned int, unsigned int> &BypassWidths =
        TLI->getBypassSlowDivWidths();
    BasicBlock *BB = &*F.begin();
    while (BB != nullptr) {
      // bypassSlowDivision may create new BBs, but we don't want to reapply
      // the optimization to those blocks.
      BasicBlock *Next = BB->getNextNode();
      // F.hasOptSize is already checked in the outer if statement.
      if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
        EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
      BB = Next;
    }
  }

  // Eliminate blocks that contain only PHI nodes and an
  // unconditional branch.
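  // Such blocks are typically left behind by passes that run before isel,
  // e.g. LSR and loop-simplify (see eliminateMostlyEmptyBlocks below).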
  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

  bool ModifiedDT = false;
  if (!DisableBranchOpts)
    EverMadeChange |= splitBranchCondition(F, ModifiedDT);

  // Split some critical edges where one of the sources is an indirect branch,
  // to help generate sane code for PHIs involving such edges.
  EverMadeChange |= SplitIndirectBrCriticalEdges(F);

  bool MadeChange = true;
  while (MadeChange) {
    MadeChange = false;
    DT.reset();
    for (Function::iterator I = F.begin(); I != F.end(); ) {
      BasicBlock *BB = &*I++;
      bool ModifiedDTOnIteration = false;
      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);

      // Restart BB iteration if the dominator tree of the Function was changed
      if (ModifiedDTOnIteration)
        break;
    }
    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
      MadeChange |= mergeSExts(F);
    if (!LargeOffsetGEPMap.empty())
      MadeChange |= splitLargeGEPOffsets();
    MadeChange |= optimizePhiTypes(F);

    if (MadeChange)
      eliminateFallThrough(F);

    // Really free removed instructions during promotion.
    for (Instruction *I : RemovedInsts)
      I->deleteValue();

    EverMadeChange |= MadeChange;
    SeenChainsForSExt.clear();
    ValToSExtendedUses.clear();
    RemovedInsts.clear();
    LargeOffsetGEPMap.clear();
    LargeOffsetGEPID.clear();
  }

  SunkAddrs.clear();

  if (!DisableBranchOpts) {
    MadeChange = false;
    // Use a set vector to get deterministic iteration order. The order the
    // blocks are removed may affect whether or not PHI nodes in successors
    // are removed.
    SmallSetVector<BasicBlock*, 8> WorkList;
    for (BasicBlock &BB : F) {
      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
      MadeChange |= ConstantFoldTerminator(&BB, true);
      if (!MadeChange) continue;

      for (SmallVectorImpl<BasicBlock*>::iterator
             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
        if (pred_begin(*II) == pred_end(*II))
          WorkList.insert(*II);
    }

    // Delete the dead blocks and any of their dead successors.
    MadeChange |= !WorkList.empty();
    while (!WorkList.empty()) {
      BasicBlock *BB = WorkList.pop_back_val();
      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));

      DeleteDeadBlock(BB);

      for (SmallVectorImpl<BasicBlock*>::iterator
             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
        if (pred_begin(*II) == pred_end(*II))
          WorkList.insert(*II);
    }

    // Merge pairs of basic blocks with unconditional branches, connected by
    // a single edge.
    if (EverMadeChange || MadeChange)
      MadeChange |= eliminateFallThrough(F);

    EverMadeChange |= MadeChange;
  }

  if (!DisableGCOpts) {
    SmallVector<GCStatepointInst *, 2> Statepoints;
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *SP = dyn_cast<GCStatepointInst>(&I))
          Statepoints.push_back(SP);
    for (auto &I : Statepoints)
      EverMadeChange |= simplifyOffsetableRelocate(*I);
  }

  // Do this last to clean up use-before-def scenarios introduced by other
  // preparatory transforms.
  EverMadeChange |= placeDbgValues(F);

#ifndef NDEBUG
  if (VerifyBFIUpdates)
    verifyBFIUpdates(F);
#endif

  return EverMadeChange;
}

/// An instruction is about to be deleted, so remove all references to it in
/// our GEP-tracking data structures.
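/// Otherwise the AssertingVHs held in LargeOffsetGEPMap, NewGEPBases and
/// LargeOffsetGEPID would assert when the instruction is actually erased.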
void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
  LargeOffsetGEPMap.erase(V);
  NewGEPBases.erase(V);

  auto GEP = dyn_cast<GetElementPtrInst>(V);
  if (!GEP)
    return;

  LargeOffsetGEPID.erase(GEP);

  auto VecI = LargeOffsetGEPMap.find(GEP->getPointerOperand());
  if (VecI == LargeOffsetGEPMap.end())
    return;

  auto &GEPVector = VecI->second;
  const auto &I = std::find_if(GEPVector.begin(), GEPVector.end(),
                               [=](auto &Elt) { return Elt.first == GEP; });
  if (I == GEPVector.end())
    return;

  GEPVector.erase(I);
  if (GEPVector.empty())
    LargeOffsetGEPMap.erase(VecI);
}

// Verify BFI has been updated correctly by recomputing BFI and comparing them.
void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
  DominatorTree NewDT(F);
  LoopInfo NewLI(NewDT);
  BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
  BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
  NewBFI.verifyMatch(*BFI);
}

/// Merge basic blocks which are connected by a single edge, where one of the
/// basic blocks has a single successor pointing to the other basic block,
/// which has a single predecessor.
bool CodeGenPrepare::eliminateFallThrough(Function &F) {
  bool Changed = false;
  // Scan all of the blocks in the function, except for the entry block.
  // Use a temporary array to avoid iterator being invalidated when
  // deleting blocks.
  SmallVector<WeakTrackingVH, 16> Blocks;
  for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
    Blocks.push_back(&Block);

  for (auto &Block : Blocks) {
    auto *BB = cast_or_null<BasicBlock>(Block);
    if (!BB)
      continue;
    // If the destination block has a single pred, then this is a trivial
    // edge, just collapse it.
    BasicBlock *SinglePred = BB->getSinglePredecessor();

    // Don't merge if BB's address is taken.
    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;

    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
    if (Term && !Term->isConditional()) {
      Changed = true;
      LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");

      // Merge BB into SinglePred and delete it.
      MergeBlockIntoPredecessor(BB);
    }
  }
  return Changed;
}

/// Find a destination block from BB if BB is a mergeable empty block.
BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
  // If this block doesn't end with an uncond branch, ignore it.
  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
  if (!BI || !BI->isUnconditional())
    return nullptr;

  // If the instruction before the branch (skipping debug info) isn't a phi
  // node, then other stuff is happening here.
  BasicBlock::iterator BBI = BI->getIterator();
  if (BBI != BB->begin()) {
    --BBI;
    while (isa<DbgInfoIntrinsic>(BBI)) {
      if (BBI == BB->begin())
        break;
      --BBI;
    }
    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
      return nullptr;
  }

  // Do not break infinite loops.
  BasicBlock *DestBB = BI->getSuccessor(0);
  if (DestBB == BB)
    return nullptr;

  if (!canMergeBlocks(BB, DestBB))
    DestBB = nullptr;

  return DestBB;
}

/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
/// edges in ways that are non-optimal for isel.
/// Start by eliminating these blocks so we can split them the way we want
/// them.
bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
  SmallPtrSet<BasicBlock *, 16> Preheaders;
  SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
  while (!LoopList.empty()) {
    Loop *L = LoopList.pop_back_val();
    LoopList.insert(LoopList.end(), L->begin(), L->end());
    if (BasicBlock *Preheader = L->getLoopPreheader())
      Preheaders.insert(Preheader);
  }

  bool MadeChange = false;
  // Copy blocks into a temporary array to avoid iterator invalidation issues
  // as we remove them.
  // Note that this intentionally skips the entry block.
  SmallVector<WeakTrackingVH, 16> Blocks;
  for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
    Blocks.push_back(&Block);

  for (auto &Block : Blocks) {
    BasicBlock *BB = cast_or_null<BasicBlock>(Block);
    if (!BB)
      continue;
    BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
    if (!DestBB ||
        !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
      continue;

    eliminateMostlyEmptyBlock(BB);
    MadeChange = true;
  }
  return MadeChange;
}

bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
                                                   BasicBlock *DestBB,
                                                   bool isPreheader) {
  // Do not delete loop preheaders if doing so would create a critical edge.
  // Loop preheaders can be good locations to spill registers. If the
  // preheader is deleted and we create a critical edge, registers may be
  // spilled in the loop body instead.
  if (!DisablePreheaderProtect && isPreheader &&
      !(BB->getSinglePredecessor() &&
        BB->getSinglePredecessor()->getSingleSuccessor()))
    return false;

  // Skip merging if the block's successor is also a successor to any callbr
  // that leads to this block.
  // FIXME: Is this really needed? Is this a correctness issue?
  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
    if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
      for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
        if (DestBB == CBI->getSuccessor(i))
          return false;
  }

  // Try to skip merging if the unique predecessor of BB is terminated by a
  // switch or indirect branch instruction, and BB is used as an incoming block
  // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
  // add COPY instructions in the predecessor of BB instead of BB (if it is not
  // merged). Note that the critical edge created by merging such blocks won't
  // be split in MachineSink because the jump table is not analyzable. By
  // keeping such an empty block (BB), ISel will place COPY instructions in BB,
  // not in the predecessor of BB.
  BasicBlock *Pred = BB->getUniquePredecessor();
  if (!Pred ||
      !(isa<SwitchInst>(Pred->getTerminator()) ||
        isa<IndirectBrInst>(Pred->getTerminator())))
    return true;

  if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
    return true;

  // We use a simple cost heuristic which determines that skipping merging is
  // profitable if the cost of skipping merging is less than the cost of
  // merging: Cost(skipping merging) < Cost(merging BB), where
  // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
  // Cost(merging BB) is Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch), we could simplify it to:
  //   Freq(Pred) / Freq(BB) > 2.
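  // For example (illustrative numbers only): with Freq(Pred) = 900 and
  // Freq(BB) = 300 the ratio is 3 > FreqRatioToSkipMerge (2 by default), so we
  // keep BB; with Freq(BB) = 600 the ratio would be 1.5 and we would merge.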
  // Note that if there are multiple empty blocks sharing the same incoming
  // value for the PHIs in the DestBB, we consider them together. In such
  // case, Cost(merging BB) will be the sum of their frequencies.

  if (!isa<PHINode>(DestBB->begin()))
    return true;

  SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;

  // Find all other incoming blocks from which incoming values of all PHIs in
  // DestBB are the same as the ones from BB.
  for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
       ++PI) {
    BasicBlock *DestBBPred = *PI;
    if (DestBBPred == BB)
      continue;

    if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
          return DestPN.getIncomingValueForBlock(BB) ==
                 DestPN.getIncomingValueForBlock(DestBBPred);
        }))
      SameIncomingValueBBs.insert(DestBBPred);
  }

  // See if all of BB's incoming values are the same as the value from Pred.
  // In this case, there is no reason to skip merging because COPYs are
  // expected to be placed in Pred already.
  if (SameIncomingValueBBs.count(Pred))
    return true;

  BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
  BlockFrequency BBFreq = BFI->getBlockFreq(BB);

  for (auto *SameValueBB : SameIncomingValueBBs)
    if (SameValueBB->getUniquePredecessor() == Pred &&
        DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
      BBFreq += BFI->getBlockFreq(SameValueBB);

  return PredFreq.getFrequency() <=
         BBFreq.getFrequency() * FreqRatioToSkipMerge;
}

/// Return true if we can merge BB into DestBB if there is a single
/// unconditional branch between them, and BB contains no other non-phi
/// instructions.
bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
                                    const BasicBlock *DestBB) const {
  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
  // the successor. If there are more complex conditions (e.g. preheaders),
  // don't mess around with them.
  for (const PHINode &PN : BB->phis()) {
    for (const User *U : PN.users()) {
      const Instruction *UI = cast<Instruction>(U);
      if (UI->getParent() != DestBB || !isa<PHINode>(UI))
        return false;
      // If User is inside DestBB block and it is a PHINode then check
      // incoming value. If incoming value is not from BB then this is
      // a complex condition (e.g. preheaders) we want to avoid here.
      if (UI->getParent() == DestBB) {
        if (const PHINode *UPN = dyn_cast<PHINode>(UI))
          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
            if (Insn && Insn->getParent() == BB &&
                Insn->getParent() != UPN->getIncomingBlock(I))
              return false;
          }
      }
    }
  }

  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
  // and DestBB may have conflicting incoming values for the block. If so, we
  // can't merge the block.
  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
  if (!DestBBPN) return true;  // no conflict.

  // Collect the preds of BB.
  SmallPtrSet<const BasicBlock*, 16> BBPreds;
  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
    // It is faster to get preds from a PHI than with pred_iterator.
    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
      BBPreds.insert(BBPN->getIncomingBlock(i));
  } else {
    BBPreds.insert(pred_begin(BB), pred_end(BB));
  }

  // Walk the preds of DestBB.
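  // For each predecessor that BB and DestBB have in common, the incoming
  // values of DestBB's PHIs (mapping any BB-local phi through that
  // predecessor) must agree, otherwise the merged block would need
  // conflicting incoming values.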
  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
    if (BBPreds.count(Pred)) {   // Common predecessor?
      for (const PHINode &PN : DestBB->phis()) {
        const Value *V1 = PN.getIncomingValueForBlock(Pred);
        const Value *V2 = PN.getIncomingValueForBlock(BB);

        // If V2 is a phi node in BB, look up what the mapped value will be.
        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
          if (V2PN->getParent() == BB)
            V2 = V2PN->getIncomingValueForBlock(Pred);

        // If there is a conflict, bail out.
        if (V1 != V2) return false;
      }
    }
  }

  return true;
}

/// Eliminate a basic block that has only phi's and an unconditional branch in
/// it.
void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
  BasicBlock *DestBB = BI->getSuccessor(0);

  LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
                    << *BB << *DestBB);

  // If the destination block has a single pred, then this is a trivial edge,
  // just collapse it.
  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
    if (SinglePred != DestBB) {
      assert(SinglePred == BB &&
             "Single predecessor not the same as predecessor");
      // Merge DestBB into SinglePred/BB and delete it.
      MergeBlockIntoPredecessor(DestBB);
      // Note: BB(=SinglePred) will not be deleted on this path.
      // DestBB(=its single successor) is the one that was deleted.
      LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
      return;
    }
  }

  // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
  // to handle the new incoming edges it is about to have.
  for (PHINode &PN : DestBB->phis()) {
    // Remove the incoming value for BB, and remember it.
    Value *InVal = PN.removeIncomingValue(BB, false);

    // Two options: either the InVal is a phi node defined in BB or it is some
    // value that dominates BB.
    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
    if (InValPhi && InValPhi->getParent() == BB) {
      // Add all of the input values of the input PHI as inputs of this phi.
      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
        PN.addIncoming(InValPhi->getIncomingValue(i),
                       InValPhi->getIncomingBlock(i));
    } else {
      // Otherwise, add one instance of the dominating value for each edge that
      // we will be adding.
      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
          PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
      } else {
        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
          PN.addIncoming(InVal, *PI);
      }
    }
  }

  // The PHIs are now updated, change everything that refers to BB to use
  // DestBB and remove BB.
  BB->replaceAllUsesWith(DestBB);
  BB->eraseFromParent();
  ++NumBlocksElim;

  LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
}

// Computes a map of base pointer relocation instructions to corresponding
// derived pointer relocation instructions given a vector of all relocate calls
static void computeBaseDerivedRelocateMap(
    const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
    DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
        &RelocateInstMap) {
  // Collect information in two maps: one primarily for locating the base
  // object while filling the second map; the second map is the final structure
  // holding a mapping between Base and corresponding Derived relocate calls
  DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
  for (auto *ThisRelocate : AllRelocateCalls) {
    auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
                            ThisRelocate->getDerivedPtrIndex());
    RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
  }
  for (auto &Item : RelocateIdxMap) {
    std::pair<unsigned, unsigned> Key = Item.first;
    if (Key.first == Key.second)
      // Base relocation: nothing to insert
      continue;

    GCRelocateInst *I = Item.second;
    auto BaseKey = std::make_pair(Key.first, Key.first);

    // We're iterating over RelocateIdxMap so we cannot modify it.
    auto MaybeBase = RelocateIdxMap.find(BaseKey);
    if (MaybeBase == RelocateIdxMap.end())
      // TODO: We might want to insert a new base object relocate and gep off
      // that, if there are enough derived object relocates.
      continue;

    RelocateInstMap[MaybeBase->second].push_back(I);
  }
}

// Accepts a GEP and extracts the operands into a vector provided they're all
// small integer constants
static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
                                          SmallVectorImpl<Value *> &OffsetV) {
  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
    // Only accept small constant integer operands
    auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!Op || Op->getZExtValue() > 20)
      return false;
  }

  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
    OffsetV.push_back(GEP->getOperand(i));
  return true;
}

// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and applies it.
static bool
simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
  bool MadeChange = false;
  // We must ensure that the relocation of the derived pointer is defined after
  // the relocation of the base pointer. If we find a relocation of the same
  // base that is defined earlier than the base relocation, we move the base
  // relocation right before it. We only consider relocations in the same basic
  // block as the base relocation. Relocations from other basic blocks are
  // skipped by this optimization and we do not care about them.
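  // Scan the block from its first insertion point up to RelocatedBase; if we
  // meet another relocate of the same base (from the same statepoint) first,
  // hoist RelocatedBase above it.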
  for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
       &*R != RelocatedBase; ++R)
    if (auto *RI = dyn_cast<GCRelocateInst>(R))
      if (RI->getStatepoint() == RelocatedBase->getStatepoint())
        if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
          RelocatedBase->moveBefore(RI);
          break;
        }

  for (GCRelocateInst *ToReplace : Targets) {
    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
           "Not relocating a derived object of the original base object");
    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
      // A duplicate relocate call. TODO: coalesce duplicates.
      continue;
    }

    if (RelocatedBase->getParent() != ToReplace->getParent()) {
      // Base and derived relocates are in different basic blocks.
      // In this case transform is only valid when base dominates derived
      // relocate. However it would be too expensive to check dominance
      // for each such relocate, so we skip the whole transformation.
      continue;
    }

    Value *Base = ToReplace->getBasePtr();
    auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
    if (!Derived || Derived->getPointerOperand() != Base)
      continue;

    SmallVector<Value *, 2> OffsetV;
    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
      continue;

    // Create a Builder and replace the target callsite with a gep
    assert(RelocatedBase->getNextNode() &&
           "Should always have one since it's not a terminator");

    // Insert after RelocatedBase
    IRBuilder<> Builder(RelocatedBase->getNextNode());
    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

    // If gc_relocate does not match the actual type, cast it to the right
    // type. In theory, there must be a bitcast after gc_relocate if the type
    // does not match, and we should reuse it to get the derived pointer. But
    // there could be cases like this:
    //
    // bb1:
    //  ...
    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // bb2:
    //  ...
    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // merge:
    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
    //
    // In this case, we can not find the bitcast any more. So we insert a new
    // bitcast whether there is already one or not. In this way, we can handle
    // all cases, and the extra bitcast should be optimized away in later
    // passes.
    Value *ActualRelocatedBase = RelocatedBase;
    if (RelocatedBase->getType() != Base->getType()) {
      ActualRelocatedBase =
          Builder.CreateBitCast(RelocatedBase, Base->getType());
    }
    Value *Replacement = Builder.CreateGEP(
        Derived->getSourceElementType(), ActualRelocatedBase,
        makeArrayRef(OffsetV));
    Replacement->takeName(ToReplace);
    // If the newly generated derived pointer's type does not match the
    // original derived pointer's type, cast the new derived pointer to match
    // it. Same reasoning as above.
    Value *ActualReplacement = Replacement;
    if (Replacement->getType() != ToReplace->getType()) {
      ActualReplacement =
          Builder.CreateBitCast(Replacement, ToReplace->getType());
    }
    ToReplace->replaceAllUsesWith(ActualReplacement);
    ToReplace->eraseFromParent();

    MadeChange = true;
  }
  return MadeChange;
}

// Turns this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = relocate(%tok, i32 4, i32 4)
// %ptr' = relocate(%tok, i32 4, i32 5)
// %val = load %ptr'
//
// into this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = gc.relocate(%tok, i32 4, i32 4)
// %ptr' = gep %base' + 15
// %val = load %ptr'
bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
  bool MadeChange = false;
  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
  for (auto *U : I.users())
    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
      // Collect all the relocate calls associated with a statepoint
      AllRelocateCalls.push_back(Relocate);

  // We need at least one base pointer relocation + one derived pointer
  // relocation to mangle
  if (AllRelocateCalls.size() < 2)
    return false;

  // RelocateInstMap is a mapping from the base relocate instruction to the
  // corresponding derived relocate instructions
  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
  if (RelocateInstMap.empty())
    return false;

  for (auto &Item : RelocateInstMap)
    // Item.first is the RelocatedBase to offset against
    // Item.second is the vector of Targets to replace
    MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
  return MadeChange;
}

/// Sink the specified cast instruction into its user blocks.
static bool SinkCast(CastInst *CI) {
  BasicBlock *DefBB = CI->getParent();

  /// InsertedCasts - Only insert a cast in each block once.
  DenseMap<BasicBlock*, CastInst*> InsertedCasts;

  bool MadeChange = false;
  for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);

    // Figure out which BB this cast is used in. For PHI's this is the
    // appropriate predecessor block.
    BasicBlock *UserBB = User->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(User)) {
      UserBB = PN->getIncomingBlock(TheUse);
    }

    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    // The first insertion point of a block containing an EH pad is after the
    // pad. If the pad is the user, we cannot sink the cast past the pad.
    if (User->isEHPad())
      continue;

    // If the block selected to receive the cast is an EH pad that does not
    // allow non-PHI instructions before the terminator, we can't sink the
    // cast.
    if (UserBB->getTerminator()->isEHPad())
      continue;

    // If this user is in the same block as the cast, don't change the cast.
    if (UserBB == DefBB) continue;

    // If we have already inserted a cast into this block, use it.
    CastInst *&InsertedCast = InsertedCasts[UserBB];

    if (!InsertedCast) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
                                      CI->getType(), "", &*InsertPt);
      InsertedCast->setDebugLoc(CI->getDebugLoc());
    }

    // Replace a use of the cast with a use of the new cast.
    TheUse = InsertedCast;
    MadeChange = true;
    ++NumCastUses;
  }

  // If we removed all uses, nuke the cast.
  if (CI->use_empty()) {
    salvageDebugInfo(*CI);
    CI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}

/// If the specified cast instruction is a noop copy (e.g. it's casting from
/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
/// reduce the number of virtual registers that must be created and coalesced.
///
/// Return true if any changes are made.
static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
                                       const DataLayout &DL) {
  // Sink only "cheap" (or nop) address-space casts. This is a weaker condition
  // than sinking only nop casts, but is helpful on some platforms.
  if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
    if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
                                 ASC->getDestAddressSpace()))
      return false;
  }

  // If this is a noop copy,
  EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
  EVT DstVT = TLI.getValueType(DL, CI->getType());

  // This is an fp<->int conversion?
  if (SrcVT.isInteger() != DstVT.isInteger())
    return false;

  // If this is an extension, it will be a zero or sign extension, which
  // isn't a noop.
  if (SrcVT.bitsLT(DstVT)) return false;

  // If these values will be promoted, find out what they will be promoted
  // to. This helps us consider truncates on PPC as noop copies when they
  // are.
  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
      TargetLowering::TypePromoteInteger)
    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
      TargetLowering::TypePromoteInteger)
    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);

  // If, after promotion, these are the same types, this is a noop copy.
  if (SrcVT != DstVT)
    return false;

  return SinkCast(CI);
}

bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                 Value *Arg0, Value *Arg1,
                                                 CmpInst *Cmp,
                                                 Intrinsic::ID IID) {
  if (BO->getParent() != Cmp->getParent()) {
    // We used to use a dominator tree here to allow multi-block optimization.
    // But that was problematic because:
    // 1. It could cause a perf regression by hoisting the math op into the
    //    critical path.
    // 2. It could cause a perf regression by creating a value that was live
    //    across multiple blocks and increasing register pressure.
    // 3. Use of a dominator tree could cause large compile-time regression.
    //    This is because we recompute the DT on every change in the main CGP
    //    run-loop. The recomputing is probably unnecessary in many cases, so
    //    if that was fixed, using a DT here would be ok.
    return false;
  }

  // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
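  // For example (illustrative IR only):
  //   %math = add i32 %x, -42        ; canonical form of (sub %x, 42)
  //   %ov   = icmp ult i32 %x, 42
  // becomes @llvm.usub.with.overflow.i32(i32 %x, i32 42) once the add's
  // constant is negated below.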
  if (BO->getOpcode() == Instruction::Add &&
      IID == Intrinsic::usub_with_overflow) {
    assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
    Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
  }

  // Insert at the first instruction of the pair.
  Instruction *InsertPt = nullptr;
  for (Instruction &Iter : *Cmp->getParent()) {
    // If BO is an XOR, it is not guaranteed that it comes after both inputs to
    // the overflow intrinsic are defined.
    if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
      InsertPt = &Iter;
      break;
    }
  }
  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");

  IRBuilder<> Builder(InsertPt);
  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
  if (BO->getOpcode() != Instruction::Xor) {
    Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
    BO->replaceAllUsesWith(Math);
  } else
    assert(BO->hasOneUse() &&
           "Patterns with XOr should use the BO only in the compare");
  Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
  Cmp->replaceAllUsesWith(OV);
  Cmp->eraseFromParent();
  BO->eraseFromParent();
  return true;
}

/// Match special-case patterns that check for unsigned add overflow.
static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
                                                   BinaryOperator *&Add) {
  // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
  // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);

  // We are not expecting non-canonical/degenerate code. Just bail out.
  if (isa<Constant>(A))
    return false;

  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
    B = ConstantInt::get(B->getType(), 1);
  else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
    B = ConstantInt::get(B->getType(), -1);
  else
    return false;

  // Check the users of the variable operand of the compare looking for an add
  // with the adjusted constant.
  for (User *U : A->users()) {
    if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
      Add = cast<BinaryOperator>(U);
      return true;
    }
  }
  return false;
}

/// Try to combine the compare into a call to the llvm.uadd.with.overflow
/// intrinsic. Return true if any changes were made.
bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
                                               bool &ModifiedDT) {
  Value *A, *B;
  BinaryOperator *Add;
  if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
    if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
      return false;
    // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases.
    A = Add->getOperand(0);
    B = Add->getOperand(1);
  }

  if (!TLI->shouldFormOverflowOp(ISD::UADDO,
                                 TLI->getValueType(*DL, Add->getType()),
                                 Add->hasNUsesOrMore(2)))
    return false;

  // We don't want to move around uses of condition values this late, so we
  // check if it is legal to create the call to the intrinsic in the basic
  // block containing the icmp.
  if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
    return false;

  if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
                                   Intrinsic::uadd_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = true;
  return true;
}

bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
                                               bool &ModifiedDT) {
  // We are not expecting non-canonical/degenerate code. Just bail out.
  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
  if (isa<Constant>(A) && isa<Constant>(B))
    return false;

  // Convert (A u> B) to (A u< B) to simplify pattern matching.
  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred == ICmpInst::ICMP_UGT) {
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  }
  // Convert special-case: (A == 0) is the same as (A u< 1).
  if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
    B = ConstantInt::get(B->getType(), 1);
    Pred = ICmpInst::ICMP_ULT;
  }
  // Convert special-case: (A != 0) is the same as (0 u< A).
  if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  }
  if (Pred != ICmpInst::ICMP_ULT)
    return false;

  // Walk the users of a variable operand of a compare looking for a subtract
  // or add with that same operand. Also match the 2nd operand of the compare
  // to the add/sub, but that may be a negated constant operand of an add.
  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
  BinaryOperator *Sub = nullptr;
  for (User *U : CmpVariableOperand->users()) {
    // A - B, A u< B --> usubo(A, B)
    if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
      Sub = cast<BinaryOperator>(U);
      break;
    }

    // A + (-C), A u< C (canonicalized form of (sub A, C))
    const APInt *CmpC, *AddC;
    if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
        match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
      Sub = cast<BinaryOperator>(U);
      break;
    }
  }
  if (!Sub)
    return false;

  if (!TLI->shouldFormOverflowOp(ISD::USUBO,
                                 TLI->getValueType(*DL, Sub->getType()),
                                 Sub->hasNUsesOrMore(2)))
    return false;

  if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
                                   Cmp, Intrinsic::usub_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = true;
  return true;
}

/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
/// lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
  if (TLI.hasMultipleConditionRegisters())
    return false;

  // Avoid sinking soft-FP comparisons, since this can move them into a loop.
  if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
    return false;

  // Only insert a cmp in each block once.
  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;

  bool MadeChange = false;
  for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);

    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    // Don't bother for PHI nodes.
    if (isa<PHINode>(User))
      continue;

    // Figure out which BB this cmp is used in.
    BasicBlock *UserBB = User->getParent();
    BasicBlock *DefBB = Cmp->getParent();

    // If this user is in the same block as the cmp, don't change the cmp.
    if (UserBB == DefBB) continue;

    // If we have already inserted a cmp into this block, use it.
    CmpInst *&InsertedCmp = InsertedCmps[UserBB];

    if (!InsertedCmp) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedCmp =
          CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
                          Cmp->getOperand(0), Cmp->getOperand(1), "",
                          &*InsertPt);
      // Propagate the debug info.
      InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
    }

    // Replace a use of the cmp with a use of the new cmp.
    TheUse = InsertedCmp;
    MadeChange = true;
    ++NumCmpUses;
  }

  // If we removed all uses, nuke the cmp.
  if (Cmp->use_empty()) {
    Cmp->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}

/// For a pattern like:
///
///   DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
///   ...
/// DomBB:
///   ...
///   br DomCond, TrueBB, CmpBB
/// CmpBB: (with DomBB being the single predecessor)
///   ...
///   Cmp = icmp eq CmpOp0, CmpOp1
///   ...
///
/// On targets where the lowering of icmp sgt/slt differs from the lowering of
/// icmp eq (e.g. PowerPC), this pattern would otherwise require two
/// comparisons. This function tries to convert 'Cmp = icmp eq CmpOp0, CmpOp1'
/// into 'Cmp = icmp slt/sgt CmpOp0, CmpOp1' so that DomCond and Cmp can share
/// the same comparison, eliminating one comparison.
///
/// Return true if any changes are made.
static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
                                       const TargetLowering &TLI) {
  if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
    return false;

  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred != ICmpInst::ICMP_EQ)
    return false;

  // If icmp eq has users other than BranchInst and SelectInst, converting it
  // to icmp slt/sgt would introduce more redundant LLVM IR.
  for (User *U : Cmp->users()) {
    if (isa<BranchInst>(U))
      continue;
    if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)
      continue;
    return false;
  }

  // This is a cheap/incomplete check for dominance - just match a single
  // predecessor with a conditional branch.
  BasicBlock *CmpBB = Cmp->getParent();
  BasicBlock *DomBB = CmpBB->getSinglePredecessor();
  if (!DomBB)
    return false;

  // We want to ensure that the only way control gets to the comparison of
  // interest is that a less/greater than comparison on the same operands is
  // false.
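  // Illustrative sketch only (hypothetical IR, not taken from this file):
  //   DomBB:
  //     %dc = icmp sgt i32 %a, %b
  //     br i1 %dc, label %TrueBB, label %CmpBB
  //   CmpBB:
  //     %c = icmp eq i32 %a, %b
  // Control reaches CmpBB only when %a s<= %b, so '%a == %b' is equivalent
  // there to '!(%a s< %b)': %c can be rewritten as 'icmp slt i32 %a, %b'
  // provided its branch successors (or select operands) are swapped, which
  // is exactly what the code below does.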
1554 Value *DomCond; 1555 BasicBlock *TrueBB, *FalseBB; 1556 if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB))) 1557 return false; 1558 if (CmpBB != FalseBB) 1559 return false; 1560 1561 Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1); 1562 ICmpInst::Predicate DomPred; 1563 if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1)))) 1564 return false; 1565 if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT) 1566 return false; 1567 1568 // Convert the equality comparison to the opposite of the dominating 1569 // comparison and swap the direction for all branch/select users. 1570 // We have conceptually converted: 1571 // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>; 1572 // to 1573 // Res = (a < b) ? <LT_RES> : (a > b) ? <GT_RES> : <EQ_RES>; 1574 // And similarly for branches. 1575 for (User *U : Cmp->users()) { 1576 if (auto *BI = dyn_cast<BranchInst>(U)) { 1577 assert(BI->isConditional() && "Must be conditional"); 1578 BI->swapSuccessors(); 1579 continue; 1580 } 1581 if (auto *SI = dyn_cast<SelectInst>(U)) { 1582 // Swap operands 1583 SI->swapValues(); 1584 SI->swapProfMetadata(); 1585 continue; 1586 } 1587 llvm_unreachable("Must be a branch or a select"); 1588 } 1589 Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred)); 1590 return true; 1591 } 1592 1593 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) { 1594 if (sinkCmpExpression(Cmp, *TLI)) 1595 return true; 1596 1597 if (combineToUAddWithOverflow(Cmp, ModifiedDT)) 1598 return true; 1599 1600 if (combineToUSubWithOverflow(Cmp, ModifiedDT)) 1601 return true; 1602 1603 if (foldICmpWithDominatingICmp(Cmp, *TLI)) 1604 return true; 1605 1606 return false; 1607 } 1608 1609 /// Duplicate and sink the given 'and' instruction into user blocks where it is 1610 /// used in a compare to allow isel to generate better code for targets where 1611 /// this operation can be combined. 1612 /// 1613 /// Return true if any changes are made. 1614 static bool sinkAndCmp0Expression(Instruction *AndI, 1615 const TargetLowering &TLI, 1616 SetOfInstrs &InsertedInsts) { 1617 // Double-check that we're not trying to optimize an instruction that was 1618 // already optimized by some other part of this pass. 1619 assert(!InsertedInsts.count(AndI) && 1620 "Attempting to optimize already optimized and instruction"); 1621 (void) InsertedInsts; 1622 1623 // Nothing to do for single use in same basic block. 1624 if (AndI->hasOneUse() && 1625 AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) 1626 return false; 1627 1628 // Try to avoid cases where sinking/duplicating is likely to increase register 1629 // pressure. 1630 if (!isa<ConstantInt>(AndI->getOperand(0)) && 1631 !isa<ConstantInt>(AndI->getOperand(1)) && 1632 AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) 1633 return false; 1634 1635 for (auto *U : AndI->users()) { 1636 Instruction *User = cast<Instruction>(U); 1637 1638 // Only sink 'and' feeding icmp with 0. 1639 if (!isa<ICmpInst>(User)) 1640 return false; 1641 1642 auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); 1643 if (!CmpC || !CmpC->isZero()) 1644 return false; 1645 } 1646 1647 if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) 1648 return false; 1649 1650 LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); 1651 LLVM_DEBUG(AndI->getParent()->dump()); 1652 1653 // Push the 'and' into the same block as the icmp 0. 
There should only be 1654 // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any 1655 // others, so we don't need to keep track of which BBs we insert into. 1656 for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); 1657 UI != E; ) { 1658 Use &TheUse = UI.getUse(); 1659 Instruction *User = cast<Instruction>(*UI); 1660 1661 // Preincrement use iterator so we don't invalidate it. 1662 ++UI; 1663 1664 LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); 1665 1666 // Keep the 'and' in the same place if the use is already in the same block. 1667 Instruction *InsertPt = 1668 User->getParent() == AndI->getParent() ? AndI : User; 1669 Instruction *InsertedAnd = 1670 BinaryOperator::Create(Instruction::And, AndI->getOperand(0), 1671 AndI->getOperand(1), "", InsertPt); 1672 // Propagate the debug info. 1673 InsertedAnd->setDebugLoc(AndI->getDebugLoc()); 1674 1675 // Replace a use of the 'and' with a use of the new 'and'. 1676 TheUse = InsertedAnd; 1677 ++NumAndUses; 1678 LLVM_DEBUG(User->getParent()->dump()); 1679 } 1680 1681 // We removed all uses, nuke the and. 1682 AndI->eraseFromParent(); 1683 return true; 1684 } 1685 1686 /// Check if the candidates could be combined with a shift instruction, which 1687 /// includes: 1688 /// 1. Truncate instruction 1689 /// 2. And instruction and the imm is a mask of the low bits: 1690 /// imm & (imm+1) == 0 1691 static bool isExtractBitsCandidateUse(Instruction *User) { 1692 if (!isa<TruncInst>(User)) { 1693 if (User->getOpcode() != Instruction::And || 1694 !isa<ConstantInt>(User->getOperand(1))) 1695 return false; 1696 1697 const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue(); 1698 1699 if ((Cimm & (Cimm + 1)).getBoolValue()) 1700 return false; 1701 } 1702 return true; 1703 } 1704 1705 /// Sink both shift and truncate instruction to the use of truncate's BB. 1706 static bool 1707 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, 1708 DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts, 1709 const TargetLowering &TLI, const DataLayout &DL) { 1710 BasicBlock *UserBB = User->getParent(); 1711 DenseMap<BasicBlock *, CastInst *> InsertedTruncs; 1712 auto *TruncI = cast<TruncInst>(User); 1713 bool MadeChange = false; 1714 1715 for (Value::user_iterator TruncUI = TruncI->user_begin(), 1716 TruncE = TruncI->user_end(); 1717 TruncUI != TruncE;) { 1718 1719 Use &TruncTheUse = TruncUI.getUse(); 1720 Instruction *TruncUser = cast<Instruction>(*TruncUI); 1721 // Preincrement use iterator so we don't invalidate it. 1722 1723 ++TruncUI; 1724 1725 int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode()); 1726 if (!ISDOpcode) 1727 continue; 1728 1729 // If the use is actually a legal node, there will not be an 1730 // implicit truncate. 1731 // FIXME: always querying the result type is just an 1732 // approximation; some nodes' legality is determined by the 1733 // operand or other means. There's no good way to find out though. 1734 if (TLI.isOperationLegalOrCustom( 1735 ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true))) 1736 continue; 1737 1738 // Don't bother for PHI nodes. 
1739 if (isa<PHINode>(TruncUser)) 1740 continue; 1741 1742 BasicBlock *TruncUserBB = TruncUser->getParent(); 1743 1744 if (UserBB == TruncUserBB) 1745 continue; 1746 1747 BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB]; 1748 CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB]; 1749 1750 if (!InsertedShift && !InsertedTrunc) { 1751 BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); 1752 assert(InsertPt != TruncUserBB->end()); 1753 // Sink the shift 1754 if (ShiftI->getOpcode() == Instruction::AShr) 1755 InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, 1756 "", &*InsertPt); 1757 else 1758 InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, 1759 "", &*InsertPt); 1760 InsertedShift->setDebugLoc(ShiftI->getDebugLoc()); 1761 1762 // Sink the trunc 1763 BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); 1764 TruncInsertPt++; 1765 assert(TruncInsertPt != TruncUserBB->end()); 1766 1767 InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, 1768 TruncI->getType(), "", &*TruncInsertPt); 1769 InsertedTrunc->setDebugLoc(TruncI->getDebugLoc()); 1770 1771 MadeChange = true; 1772 1773 TruncTheUse = InsertedTrunc; 1774 } 1775 } 1776 return MadeChange; 1777 } 1778 1779 /// Sink the shift *right* instruction into user blocks if the uses could 1780 /// potentially be combined with this shift instruction and generate BitExtract 1781 /// instruction. It will only be applied if the architecture supports BitExtract 1782 /// instruction. Here is an example: 1783 /// BB1: 1784 /// %x.extract.shift = lshr i64 %arg1, 32 1785 /// BB2: 1786 /// %x.extract.trunc = trunc i64 %x.extract.shift to i16 1787 /// ==> 1788 /// 1789 /// BB2: 1790 /// %x.extract.shift.1 = lshr i64 %arg1, 32 1791 /// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16 1792 /// 1793 /// CodeGen will recognize the pattern in BB2 and generate BitExtract 1794 /// instruction. 1795 /// Return true if any changes are made. 1796 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, 1797 const TargetLowering &TLI, 1798 const DataLayout &DL) { 1799 BasicBlock *DefBB = ShiftI->getParent(); 1800 1801 /// Only insert instructions in each block once. 1802 DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts; 1803 1804 bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType())); 1805 1806 bool MadeChange = false; 1807 for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end(); 1808 UI != E;) { 1809 Use &TheUse = UI.getUse(); 1810 Instruction *User = cast<Instruction>(*UI); 1811 // Preincrement use iterator so we don't invalidate it. 1812 ++UI; 1813 1814 // Don't bother for PHI nodes. 1815 if (isa<PHINode>(User)) 1816 continue; 1817 1818 if (!isExtractBitsCandidateUse(User)) 1819 continue; 1820 1821 BasicBlock *UserBB = User->getParent(); 1822 1823 if (UserBB == DefBB) { 1824 // If the shift and truncate instruction are in the same BB. The use of 1825 // the truncate(TruncUse) may still introduce another truncate if not 1826 // legal. In this case, we would like to sink both shift and truncate 1827 // instruction to the BB of TruncUse. 1828 // for example: 1829 // BB1: 1830 // i64 shift.result = lshr i64 opnd, imm 1831 // trunc.result = trunc shift.result to i16 1832 // 1833 // BB2: 1834 // ----> We will have an implicit truncate here if the architecture does 1835 // not have i16 compare. 
1836 // cmp i16 trunc.result, opnd2 1837 // 1838 if (isa<TruncInst>(User) && shiftIsLegal 1839 // If the type of the truncate is legal, no truncate will be 1840 // introduced in other basic blocks. 1841 && 1842 (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))) 1843 MadeChange = 1844 SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL); 1845 1846 continue; 1847 } 1848 // If we have already inserted a shift into this block, use it. 1849 BinaryOperator *&InsertedShift = InsertedShifts[UserBB]; 1850 1851 if (!InsertedShift) { 1852 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); 1853 assert(InsertPt != UserBB->end()); 1854 1855 if (ShiftI->getOpcode() == Instruction::AShr) 1856 InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, 1857 "", &*InsertPt); 1858 else 1859 InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, 1860 "", &*InsertPt); 1861 InsertedShift->setDebugLoc(ShiftI->getDebugLoc()); 1862 1863 MadeChange = true; 1864 } 1865 1866 // Replace a use of the shift with a use of the new shift. 1867 TheUse = InsertedShift; 1868 } 1869 1870 // If we removed all uses, or there are none, nuke the shift. 1871 if (ShiftI->use_empty()) { 1872 salvageDebugInfo(*ShiftI); 1873 ShiftI->eraseFromParent(); 1874 MadeChange = true; 1875 } 1876 1877 return MadeChange; 1878 } 1879 1880 /// If counting leading or trailing zeros is an expensive operation and a zero 1881 /// input is defined, add a check for zero to avoid calling the intrinsic. 1882 /// 1883 /// We want to transform: 1884 /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) 1885 /// 1886 /// into: 1887 /// entry: 1888 /// %cmpz = icmp eq i64 %A, 0 1889 /// br i1 %cmpz, label %cond.end, label %cond.false 1890 /// cond.false: 1891 /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) 1892 /// br label %cond.end 1893 /// cond.end: 1894 /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] 1895 /// 1896 /// If the transform is performed, return true and set ModifiedDT to true. 1897 static bool despeculateCountZeros(IntrinsicInst *CountZeros, 1898 const TargetLowering *TLI, 1899 const DataLayout *DL, 1900 bool &ModifiedDT) { 1901 // If a zero input is undefined, it doesn't make sense to despeculate that. 1902 if (match(CountZeros->getOperand(1), m_One())) 1903 return false; 1904 1905 // If it's cheap to speculate, there's nothing to do. 1906 auto IntrinsicID = CountZeros->getIntrinsicID(); 1907 if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || 1908 (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) 1909 return false; 1910 1911 // Only handle legal scalar cases. Anything else requires too much work. 1912 Type *Ty = CountZeros->getType(); 1913 unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); 1914 if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits()) 1915 return false; 1916 1917 // The intrinsic will be sunk behind a compare against zero and branch. 1918 BasicBlock *StartBlock = CountZeros->getParent(); 1919 BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); 1920 1921 // Create another block after the count zero intrinsic. A PHI will be added 1922 // in this block to select the result of the intrinsic or the bit-width 1923 // constant if the input to the intrinsic is zero. 
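  // Sketch of the resulting shape, using the block and value names created
  // below: StartBlock ends with 'br i1 %cmpz, label %cond.end, label
  // %cond.false'; CallBlock ("cond.false") keeps the cttz/ctlz call with its
  // zero-is-undef flag forced to true; EndBlock ("cond.end") starts with the
  // PHI that selects either the call result or the operand's bit width.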
1924 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); 1925 BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); 1926 1927 // Set up a builder to create a compare, conditional branch, and PHI. 1928 IRBuilder<> Builder(CountZeros->getContext()); 1929 Builder.SetInsertPoint(StartBlock->getTerminator()); 1930 Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); 1931 1932 // Replace the unconditional branch that was created by the first split with 1933 // a compare against zero and a conditional branch. 1934 Value *Zero = Constant::getNullValue(Ty); 1935 Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); 1936 Builder.CreateCondBr(Cmp, EndBlock, CallBlock); 1937 StartBlock->getTerminator()->eraseFromParent(); 1938 1939 // Create a PHI in the end block to select either the output of the intrinsic 1940 // or the bit width of the operand. 1941 Builder.SetInsertPoint(&EndBlock->front()); 1942 PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); 1943 CountZeros->replaceAllUsesWith(PN); 1944 Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); 1945 PN->addIncoming(BitWidth, StartBlock); 1946 PN->addIncoming(CountZeros, CallBlock); 1947 1948 // We are explicitly handling the zero case, so we can set the intrinsic's 1949 // undefined zero argument to 'true'. This will also prevent reprocessing the 1950 // intrinsic; we only despeculate when a zero input is defined. 1951 CountZeros->setArgOperand(1, Builder.getTrue()); 1952 ModifiedDT = true; 1953 return true; 1954 } 1955 1956 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { 1957 BasicBlock *BB = CI->getParent(); 1958 1959 // Lower inline assembly if we can. 1960 // If we found an inline asm expession, and if the target knows how to 1961 // lower it to normal LLVM code, do so now. 1962 if (CI->isInlineAsm()) { 1963 if (TLI->ExpandInlineAsm(CI)) { 1964 // Avoid invalidating the iterator. 1965 CurInstIterator = BB->begin(); 1966 // Avoid processing instructions out of order, which could cause 1967 // reuse before a value is defined. 1968 SunkAddrs.clear(); 1969 return true; 1970 } 1971 // Sink address computing for memory operands into the block. 1972 if (optimizeInlineAsmInst(CI)) 1973 return true; 1974 } 1975 1976 // Align the pointer arguments to this call if the target thinks it's a good 1977 // idea 1978 unsigned MinSize, PrefAlign; 1979 if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { 1980 for (auto &Arg : CI->arg_operands()) { 1981 // We want to align both objects whose address is used directly and 1982 // objects whose address is used in casts and GEPs, though it only makes 1983 // sense for GEPs if the offset is a multiple of the desired alignment and 1984 // if size - offset meets the size threshold. 1985 if (!Arg->getType()->isPointerTy()) 1986 continue; 1987 APInt Offset(DL->getIndexSizeInBits( 1988 cast<PointerType>(Arg->getType())->getAddressSpace()), 1989 0); 1990 Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); 1991 uint64_t Offset2 = Offset.getLimitedValue(); 1992 if ((Offset2 & (PrefAlign-1)) != 0) 1993 continue; 1994 AllocaInst *AI; 1995 if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign && 1996 DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) 1997 AI->setAlignment(Align(PrefAlign)); 1998 // Global variables can only be aligned if they are defined in this 1999 // object (i.e. 
they are uniquely initialized in this object), and 2000 // over-aligning global variables that have an explicit section is 2001 // forbidden. 2002 GlobalVariable *GV; 2003 if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() && 2004 GV->getPointerAlignment(*DL) < PrefAlign && 2005 DL->getTypeAllocSize(GV->getValueType()) >= 2006 MinSize + Offset2) 2007 GV->setAlignment(MaybeAlign(PrefAlign)); 2008 } 2009 // If this is a memcpy (or similar) then we may be able to improve the 2010 // alignment 2011 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) { 2012 Align DestAlign = getKnownAlignment(MI->getDest(), *DL); 2013 MaybeAlign MIDestAlign = MI->getDestAlign(); 2014 if (!MIDestAlign || DestAlign > *MIDestAlign) 2015 MI->setDestAlignment(DestAlign); 2016 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { 2017 MaybeAlign MTISrcAlign = MTI->getSourceAlign(); 2018 Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL); 2019 if (!MTISrcAlign || SrcAlign > *MTISrcAlign) 2020 MTI->setSourceAlignment(SrcAlign); 2021 } 2022 } 2023 } 2024 2025 // If we have a cold call site, try to sink addressing computation into the 2026 // cold block. This interacts with our handling for loads and stores to 2027 // ensure that we can fold all uses of a potential addressing computation 2028 // into their uses. TODO: generalize this to work over profiling data 2029 if (CI->hasFnAttr(Attribute::Cold) && 2030 !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) 2031 for (auto &Arg : CI->arg_operands()) { 2032 if (!Arg->getType()->isPointerTy()) 2033 continue; 2034 unsigned AS = Arg->getType()->getPointerAddressSpace(); 2035 return optimizeMemoryInst(CI, Arg, Arg->getType(), AS); 2036 } 2037 2038 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); 2039 if (II) { 2040 switch (II->getIntrinsicID()) { 2041 default: break; 2042 case Intrinsic::assume: { 2043 II->eraseFromParent(); 2044 return true; 2045 } 2046 2047 case Intrinsic::experimental_widenable_condition: { 2048 // Give up on future widening oppurtunties so that we can fold away dead 2049 // paths and merge blocks before going into block-local instruction 2050 // selection. 2051 if (II->use_empty()) { 2052 II->eraseFromParent(); 2053 return true; 2054 } 2055 Constant *RetVal = ConstantInt::getTrue(II->getContext()); 2056 resetIteratorIfInvalidatedWhileCalling(BB, [&]() { 2057 replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); 2058 }); 2059 return true; 2060 } 2061 case Intrinsic::objectsize: 2062 llvm_unreachable("llvm.objectsize.* should have been lowered already"); 2063 case Intrinsic::is_constant: 2064 llvm_unreachable("llvm.is.constant.* should have been lowered already"); 2065 case Intrinsic::aarch64_stlxr: 2066 case Intrinsic::aarch64_stxr: { 2067 ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); 2068 if (!ExtVal || !ExtVal->hasOneUse() || 2069 ExtVal->getParent() == CI->getParent()) 2070 return false; 2071 // Sink a zext feeding stlxr/stxr before it, so it can be folded into it. 2072 ExtVal->moveBefore(CI); 2073 // Mark this instruction as "inserted by CGP", so that other 2074 // optimizations don't touch it. 2075 InsertedInsts.insert(ExtVal); 2076 return true; 2077 } 2078 2079 case Intrinsic::launder_invariant_group: 2080 case Intrinsic::strip_invariant_group: { 2081 Value *ArgVal = II->getArgOperand(0); 2082 auto it = LargeOffsetGEPMap.find(II); 2083 if (it != LargeOffsetGEPMap.end()) { 2084 // Merge entries in LargeOffsetGEPMap to reflect the RAUW. 
2085 // Make sure not to have to deal with iterator invalidation 2086 // after possibly adding ArgVal to LargeOffsetGEPMap. 2087 auto GEPs = std::move(it->second); 2088 LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end()); 2089 LargeOffsetGEPMap.erase(II); 2090 } 2091 2092 II->replaceAllUsesWith(ArgVal); 2093 II->eraseFromParent(); 2094 return true; 2095 } 2096 case Intrinsic::cttz: 2097 case Intrinsic::ctlz: 2098 // If counting zeros is expensive, try to avoid it. 2099 return despeculateCountZeros(II, TLI, DL, ModifiedDT); 2100 case Intrinsic::fshl: 2101 case Intrinsic::fshr: 2102 return optimizeFunnelShift(II); 2103 case Intrinsic::dbg_value: 2104 return fixupDbgValue(II); 2105 case Intrinsic::vscale: { 2106 // If datalayout has no special restrictions on vector data layout, 2107 // replace `llvm.vscale` by an equivalent constant expression 2108 // to benefit from cheap constant propagation. 2109 Type *ScalableVectorTy = 2110 VectorType::get(Type::getInt8Ty(II->getContext()), 1, true); 2111 if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) { 2112 auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo()); 2113 auto *One = ConstantInt::getSigned(II->getType(), 1); 2114 auto *CGep = 2115 ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One); 2116 II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType())); 2117 II->eraseFromParent(); 2118 return true; 2119 } 2120 break; 2121 } 2122 case Intrinsic::masked_gather: 2123 return optimizeGatherScatterInst(II, II->getArgOperand(0)); 2124 case Intrinsic::masked_scatter: 2125 return optimizeGatherScatterInst(II, II->getArgOperand(1)); 2126 } 2127 2128 SmallVector<Value *, 2> PtrOps; 2129 Type *AccessTy; 2130 if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) 2131 while (!PtrOps.empty()) { 2132 Value *PtrVal = PtrOps.pop_back_val(); 2133 unsigned AS = PtrVal->getType()->getPointerAddressSpace(); 2134 if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) 2135 return true; 2136 } 2137 } 2138 2139 // From here on out we're working with named functions. 2140 if (!CI->getCalledFunction()) return false; 2141 2142 // Lower all default uses of _chk calls. This is very similar 2143 // to what InstCombineCalls does, but here we are only lowering calls 2144 // to fortified library functions (e.g. __memcpy_chk) that have the default 2145 // "don't know" as the objectsize. Anything else should be left alone. 2146 FortifiedLibCallSimplifier Simplifier(TLInfo, true); 2147 IRBuilder<> Builder(CI); 2148 if (Value *V = Simplifier.optimizeCall(CI, Builder)) { 2149 CI->replaceAllUsesWith(V); 2150 CI->eraseFromParent(); 2151 return true; 2152 } 2153 2154 return false; 2155 } 2156 2157 /// Look for opportunities to duplicate return instructions to the predecessor 2158 /// to enable tail call optimizations. 
The case it is currently looking for is: 2159 /// @code 2160 /// bb0: 2161 /// %tmp0 = tail call i32 @f0() 2162 /// br label %return 2163 /// bb1: 2164 /// %tmp1 = tail call i32 @f1() 2165 /// br label %return 2166 /// bb2: 2167 /// %tmp2 = tail call i32 @f2() 2168 /// br label %return 2169 /// return: 2170 /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] 2171 /// ret i32 %retval 2172 /// @endcode 2173 /// 2174 /// => 2175 /// 2176 /// @code 2177 /// bb0: 2178 /// %tmp0 = tail call i32 @f0() 2179 /// ret i32 %tmp0 2180 /// bb1: 2181 /// %tmp1 = tail call i32 @f1() 2182 /// ret i32 %tmp1 2183 /// bb2: 2184 /// %tmp2 = tail call i32 @f2() 2185 /// ret i32 %tmp2 2186 /// @endcode 2187 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) { 2188 ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator()); 2189 if (!RetI) 2190 return false; 2191 2192 PHINode *PN = nullptr; 2193 ExtractValueInst *EVI = nullptr; 2194 BitCastInst *BCI = nullptr; 2195 Value *V = RetI->getReturnValue(); 2196 if (V) { 2197 BCI = dyn_cast<BitCastInst>(V); 2198 if (BCI) 2199 V = BCI->getOperand(0); 2200 2201 EVI = dyn_cast<ExtractValueInst>(V); 2202 if (EVI) { 2203 V = EVI->getOperand(0); 2204 if (!std::all_of(EVI->idx_begin(), EVI->idx_end(), 2205 [](unsigned idx) { return idx == 0; })) 2206 return false; 2207 } 2208 2209 PN = dyn_cast<PHINode>(V); 2210 if (!PN) 2211 return false; 2212 } 2213 2214 if (PN && PN->getParent() != BB) 2215 return false; 2216 2217 // Make sure there are no instructions between the PHI and return, or that the 2218 // return is the first instruction in the block. 2219 if (PN) { 2220 BasicBlock::iterator BI = BB->begin(); 2221 // Skip over debug and the bitcast. 2222 do { 2223 ++BI; 2224 } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI); 2225 if (&*BI != RetI) 2226 return false; 2227 } else { 2228 BasicBlock::iterator BI = BB->begin(); 2229 while (isa<DbgInfoIntrinsic>(BI)) ++BI; 2230 if (&*BI != RetI) 2231 return false; 2232 } 2233 2234 /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail 2235 /// call. 2236 const Function *F = BB->getParent(); 2237 SmallVector<BasicBlock*, 4> TailCallBBs; 2238 if (PN) { 2239 for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { 2240 // Look through bitcasts. 2241 Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts(); 2242 CallInst *CI = dyn_cast<CallInst>(IncomingVal); 2243 BasicBlock *PredBB = PN->getIncomingBlock(I); 2244 // Make sure the phi value is indeed produced by the tail call. 
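      // (For illustration, the accepted shape is a predecessor of the form
      //    %tmpN = call ... ; something the target may emit as a tail call
      //    br label %return
      //  where the call result feeds, possibly through a pointer cast, nothing
      //  but this PHI, so duplicating the return leaves no other user behind.)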
2245 if (CI && CI->hasOneUse() && CI->getParent() == PredBB && 2246 TLI->mayBeEmittedAsTailCall(CI) && 2247 attributesPermitTailCall(F, CI, RetI, *TLI)) 2248 TailCallBBs.push_back(PredBB); 2249 } 2250 } else { 2251 SmallPtrSet<BasicBlock*, 4> VisitedBBs; 2252 for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { 2253 if (!VisitedBBs.insert(*PI).second) 2254 continue; 2255 2256 BasicBlock::InstListType &InstList = (*PI)->getInstList(); 2257 BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); 2258 BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); 2259 do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)); 2260 if (RI == RE) 2261 continue; 2262 2263 CallInst *CI = dyn_cast<CallInst>(&*RI); 2264 if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) && 2265 attributesPermitTailCall(F, CI, RetI, *TLI)) 2266 TailCallBBs.push_back(*PI); 2267 } 2268 } 2269 2270 bool Changed = false; 2271 for (auto const &TailCallBB : TailCallBBs) { 2272 // Make sure the call instruction is followed by an unconditional branch to 2273 // the return block. 2274 BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator()); 2275 if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) 2276 continue; 2277 2278 // Duplicate the return into TailCallBB. 2279 (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); 2280 assert(!VerifyBFIUpdates || 2281 BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB)); 2282 BFI->setBlockFreq( 2283 BB, 2284 (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency()); 2285 ModifiedDT = Changed = true; 2286 ++NumRetsDup; 2287 } 2288 2289 // If we eliminated all predecessors of the block, delete the block now. 2290 if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) 2291 BB->eraseFromParent(); 2292 2293 return Changed; 2294 } 2295 2296 //===----------------------------------------------------------------------===// 2297 // Memory Optimization 2298 //===----------------------------------------------------------------------===// 2299 2300 namespace { 2301 2302 /// This is an extended version of TargetLowering::AddrMode 2303 /// which holds actual Value*'s for register values. 2304 struct ExtAddrMode : public TargetLowering::AddrMode { 2305 Value *BaseReg = nullptr; 2306 Value *ScaledReg = nullptr; 2307 Value *OriginalValue = nullptr; 2308 bool InBounds = true; 2309 2310 enum FieldName { 2311 NoField = 0x00, 2312 BaseRegField = 0x01, 2313 BaseGVField = 0x02, 2314 BaseOffsField = 0x04, 2315 ScaledRegField = 0x08, 2316 ScaleField = 0x10, 2317 MultipleFields = 0xff 2318 }; 2319 2320 2321 ExtAddrMode() = default; 2322 2323 void print(raw_ostream &OS) const; 2324 void dump() const; 2325 2326 FieldName compare(const ExtAddrMode &other) { 2327 // First check that the types are the same on each field, as differing types 2328 // is something we can't cope with later on. 2329 if (BaseReg && other.BaseReg && 2330 BaseReg->getType() != other.BaseReg->getType()) 2331 return MultipleFields; 2332 if (BaseGV && other.BaseGV && 2333 BaseGV->getType() != other.BaseGV->getType()) 2334 return MultipleFields; 2335 if (ScaledReg && other.ScaledReg && 2336 ScaledReg->getType() != other.ScaledReg->getType()) 2337 return MultipleFields; 2338 2339 // Conservatively reject 'inbounds' mismatches. 2340 if (InBounds != other.InBounds) 2341 return MultipleFields; 2342 2343 // Check each field to see if it differs. 
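    // For illustration (hypothetical addressing modes): [8 + Base:%p] vs.
    // [8 + Base:%q] differ only in BaseReg and compare as BaseRegField,
    // while [8 + Base:%p] vs. [16 + Base:%q] differ in two fields and
    // therefore compare as MultipleFields.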
2344 unsigned Result = NoField; 2345 if (BaseReg != other.BaseReg) 2346 Result |= BaseRegField; 2347 if (BaseGV != other.BaseGV) 2348 Result |= BaseGVField; 2349 if (BaseOffs != other.BaseOffs) 2350 Result |= BaseOffsField; 2351 if (ScaledReg != other.ScaledReg) 2352 Result |= ScaledRegField; 2353 // Don't count 0 as being a different scale, because that actually means 2354 // unscaled (which will already be counted by having no ScaledReg). 2355 if (Scale && other.Scale && Scale != other.Scale) 2356 Result |= ScaleField; 2357 2358 if (countPopulation(Result) > 1) 2359 return MultipleFields; 2360 else 2361 return static_cast<FieldName>(Result); 2362 } 2363 2364 // An AddrMode is trivial if it involves no calculation i.e. it is just a base 2365 // with no offset. 2366 bool isTrivial() { 2367 // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is 2368 // trivial if at most one of these terms is nonzero, except that BaseGV and 2369 // BaseReg both being zero actually means a null pointer value, which we 2370 // consider to be 'non-zero' here. 2371 return !BaseOffs && !Scale && !(BaseGV && BaseReg); 2372 } 2373 2374 Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) { 2375 switch (Field) { 2376 default: 2377 return nullptr; 2378 case BaseRegField: 2379 return BaseReg; 2380 case BaseGVField: 2381 return BaseGV; 2382 case ScaledRegField: 2383 return ScaledReg; 2384 case BaseOffsField: 2385 return ConstantInt::get(IntPtrTy, BaseOffs); 2386 } 2387 } 2388 2389 void SetCombinedField(FieldName Field, Value *V, 2390 const SmallVectorImpl<ExtAddrMode> &AddrModes) { 2391 switch (Field) { 2392 default: 2393 llvm_unreachable("Unhandled fields are expected to be rejected earlier"); 2394 break; 2395 case ExtAddrMode::BaseRegField: 2396 BaseReg = V; 2397 break; 2398 case ExtAddrMode::BaseGVField: 2399 // A combined BaseGV is an Instruction, not a GlobalValue, so it goes 2400 // in the BaseReg field. 2401 assert(BaseReg == nullptr); 2402 BaseReg = V; 2403 BaseGV = nullptr; 2404 break; 2405 case ExtAddrMode::ScaledRegField: 2406 ScaledReg = V; 2407 // If we have a mix of scaled and unscaled addrmodes then we want scale 2408 // to be the scale and not zero. 2409 if (!Scale) 2410 for (const ExtAddrMode &AM : AddrModes) 2411 if (AM.Scale) { 2412 Scale = AM.Scale; 2413 break; 2414 } 2415 break; 2416 case ExtAddrMode::BaseOffsField: 2417 // The offset is no longer a constant, so it goes in ScaledReg with a 2418 // scale of 1. 2419 assert(ScaledReg == nullptr); 2420 ScaledReg = V; 2421 Scale = 1; 2422 BaseOffs = 0; 2423 break; 2424 } 2425 } 2426 }; 2427 2428 } // end anonymous namespace 2429 2430 #ifndef NDEBUG 2431 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { 2432 AM.print(OS); 2433 return OS; 2434 } 2435 #endif 2436 2437 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2438 void ExtAddrMode::print(raw_ostream &OS) const { 2439 bool NeedPlus = false; 2440 OS << "["; 2441 if (InBounds) 2442 OS << "inbounds "; 2443 if (BaseGV) { 2444 OS << (NeedPlus ? " + " : "") 2445 << "GV:"; 2446 BaseGV->printAsOperand(OS, /*PrintType=*/false); 2447 NeedPlus = true; 2448 } 2449 2450 if (BaseOffs) { 2451 OS << (NeedPlus ? " + " : "") 2452 << BaseOffs; 2453 NeedPlus = true; 2454 } 2455 2456 if (BaseReg) { 2457 OS << (NeedPlus ? " + " : "") 2458 << "Base:"; 2459 BaseReg->printAsOperand(OS, /*PrintType=*/false); 2460 NeedPlus = true; 2461 } 2462 if (Scale) { 2463 OS << (NeedPlus ? 
" + " : "") 2464 << Scale << "*"; 2465 ScaledReg->printAsOperand(OS, /*PrintType=*/false); 2466 } 2467 2468 OS << ']'; 2469 } 2470 2471 LLVM_DUMP_METHOD void ExtAddrMode::dump() const { 2472 print(dbgs()); 2473 dbgs() << '\n'; 2474 } 2475 #endif 2476 2477 namespace { 2478 2479 /// This class provides transaction based operation on the IR. 2480 /// Every change made through this class is recorded in the internal state and 2481 /// can be undone (rollback) until commit is called. 2482 /// CGP does not check if instructions could be speculatively executed when 2483 /// moved. Preserving the original location would pessimize the debugging 2484 /// experience, as well as negatively impact the quality of sample PGO. 2485 class TypePromotionTransaction { 2486 /// This represents the common interface of the individual transaction. 2487 /// Each class implements the logic for doing one specific modification on 2488 /// the IR via the TypePromotionTransaction. 2489 class TypePromotionAction { 2490 protected: 2491 /// The Instruction modified. 2492 Instruction *Inst; 2493 2494 public: 2495 /// Constructor of the action. 2496 /// The constructor performs the related action on the IR. 2497 TypePromotionAction(Instruction *Inst) : Inst(Inst) {} 2498 2499 virtual ~TypePromotionAction() = default; 2500 2501 /// Undo the modification done by this action. 2502 /// When this method is called, the IR must be in the same state as it was 2503 /// before this action was applied. 2504 /// \pre Undoing the action works if and only if the IR is in the exact same 2505 /// state as it was directly after this action was applied. 2506 virtual void undo() = 0; 2507 2508 /// Advocate every change made by this action. 2509 /// When the results on the IR of the action are to be kept, it is important 2510 /// to call this function, otherwise hidden information may be kept forever. 2511 virtual void commit() { 2512 // Nothing to be done, this action is not doing anything. 2513 } 2514 }; 2515 2516 /// Utility to remember the position of an instruction. 2517 class InsertionHandler { 2518 /// Position of an instruction. 2519 /// Either an instruction: 2520 /// - Is the first in a basic block: BB is used. 2521 /// - Has a previous instruction: PrevInst is used. 2522 union { 2523 Instruction *PrevInst; 2524 BasicBlock *BB; 2525 } Point; 2526 2527 /// Remember whether or not the instruction had a previous instruction. 2528 bool HasPrevInstruction; 2529 2530 public: 2531 /// Record the position of \p Inst. 2532 InsertionHandler(Instruction *Inst) { 2533 BasicBlock::iterator It = Inst->getIterator(); 2534 HasPrevInstruction = (It != (Inst->getParent()->begin())); 2535 if (HasPrevInstruction) 2536 Point.PrevInst = &*--It; 2537 else 2538 Point.BB = Inst->getParent(); 2539 } 2540 2541 /// Insert \p Inst at the recorded position. 2542 void insert(Instruction *Inst) { 2543 if (HasPrevInstruction) { 2544 if (Inst->getParent()) 2545 Inst->removeFromParent(); 2546 Inst->insertAfter(Point.PrevInst); 2547 } else { 2548 Instruction *Position = &*Point.BB->getFirstInsertionPt(); 2549 if (Inst->getParent()) 2550 Inst->moveBefore(Position); 2551 else 2552 Inst->insertBefore(Position); 2553 } 2554 } 2555 }; 2556 2557 /// Move an instruction before another. 2558 class InstructionMoveBefore : public TypePromotionAction { 2559 /// Original position of the instruction. 2560 InsertionHandler Position; 2561 2562 public: 2563 /// Move \p Inst before \p Before. 
2564 InstructionMoveBefore(Instruction *Inst, Instruction *Before) 2565 : TypePromotionAction(Inst), Position(Inst) { 2566 LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before 2567 << "\n"); 2568 Inst->moveBefore(Before); 2569 } 2570 2571 /// Move the instruction back to its original position. 2572 void undo() override { 2573 LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n"); 2574 Position.insert(Inst); 2575 } 2576 }; 2577 2578 /// Set the operand of an instruction with a new value. 2579 class OperandSetter : public TypePromotionAction { 2580 /// Original operand of the instruction. 2581 Value *Origin; 2582 2583 /// Index of the modified instruction. 2584 unsigned Idx; 2585 2586 public: 2587 /// Set \p Idx operand of \p Inst with \p NewVal. 2588 OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal) 2589 : TypePromotionAction(Inst), Idx(Idx) { 2590 LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n" 2591 << "for:" << *Inst << "\n" 2592 << "with:" << *NewVal << "\n"); 2593 Origin = Inst->getOperand(Idx); 2594 Inst->setOperand(Idx, NewVal); 2595 } 2596 2597 /// Restore the original value of the instruction. 2598 void undo() override { 2599 LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n" 2600 << "for: " << *Inst << "\n" 2601 << "with: " << *Origin << "\n"); 2602 Inst->setOperand(Idx, Origin); 2603 } 2604 }; 2605 2606 /// Hide the operands of an instruction. 2607 /// Do as if this instruction was not using any of its operands. 2608 class OperandsHider : public TypePromotionAction { 2609 /// The list of original operands. 2610 SmallVector<Value *, 4> OriginalValues; 2611 2612 public: 2613 /// Remove \p Inst from the uses of the operands of \p Inst. 2614 OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) { 2615 LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n"); 2616 unsigned NumOpnds = Inst->getNumOperands(); 2617 OriginalValues.reserve(NumOpnds); 2618 for (unsigned It = 0; It < NumOpnds; ++It) { 2619 // Save the current operand. 2620 Value *Val = Inst->getOperand(It); 2621 OriginalValues.push_back(Val); 2622 // Set a dummy one. 2623 // We could use OperandSetter here, but that would imply an overhead 2624 // that we are not willing to pay. 2625 Inst->setOperand(It, UndefValue::get(Val->getType())); 2626 } 2627 } 2628 2629 /// Restore the original list of uses. 2630 void undo() override { 2631 LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n"); 2632 for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It) 2633 Inst->setOperand(It, OriginalValues[It]); 2634 } 2635 }; 2636 2637 /// Build a truncate instruction. 2638 class TruncBuilder : public TypePromotionAction { 2639 Value *Val; 2640 2641 public: 2642 /// Build a truncate instruction of \p Opnd producing a \p Ty 2643 /// result. 2644 /// trunc Opnd to Ty. 2645 TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { 2646 IRBuilder<> Builder(Opnd); 2647 Builder.SetCurrentDebugLocation(DebugLoc()); 2648 Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); 2649 LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); 2650 } 2651 2652 /// Get the built value. 2653 Value *getBuiltValue() { return Val; } 2654 2655 /// Remove the built instruction. 2656 void undo() override { 2657 LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n"); 2658 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2659 IVal->eraseFromParent(); 2660 } 2661 }; 2662 2663 /// Build a sign extension instruction. 
2664 class SExtBuilder : public TypePromotionAction { 2665 Value *Val; 2666 2667 public: 2668 /// Build a sign extension instruction of \p Opnd producing a \p Ty 2669 /// result. 2670 /// sext Opnd to Ty. 2671 SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) 2672 : TypePromotionAction(InsertPt) { 2673 IRBuilder<> Builder(InsertPt); 2674 Val = Builder.CreateSExt(Opnd, Ty, "promoted"); 2675 LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n"); 2676 } 2677 2678 /// Get the built value. 2679 Value *getBuiltValue() { return Val; } 2680 2681 /// Remove the built instruction. 2682 void undo() override { 2683 LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n"); 2684 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2685 IVal->eraseFromParent(); 2686 } 2687 }; 2688 2689 /// Build a zero extension instruction. 2690 class ZExtBuilder : public TypePromotionAction { 2691 Value *Val; 2692 2693 public: 2694 /// Build a zero extension instruction of \p Opnd producing a \p Ty 2695 /// result. 2696 /// zext Opnd to Ty. 2697 ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) 2698 : TypePromotionAction(InsertPt) { 2699 IRBuilder<> Builder(InsertPt); 2700 Builder.SetCurrentDebugLocation(DebugLoc()); 2701 Val = Builder.CreateZExt(Opnd, Ty, "promoted"); 2702 LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); 2703 } 2704 2705 /// Get the built value. 2706 Value *getBuiltValue() { return Val; } 2707 2708 /// Remove the built instruction. 2709 void undo() override { 2710 LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n"); 2711 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2712 IVal->eraseFromParent(); 2713 } 2714 }; 2715 2716 /// Mutate an instruction to another type. 2717 class TypeMutator : public TypePromotionAction { 2718 /// Record the original type. 2719 Type *OrigTy; 2720 2721 public: 2722 /// Mutate the type of \p Inst into \p NewTy. 2723 TypeMutator(Instruction *Inst, Type *NewTy) 2724 : TypePromotionAction(Inst), OrigTy(Inst->getType()) { 2725 LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy 2726 << "\n"); 2727 Inst->mutateType(NewTy); 2728 } 2729 2730 /// Mutate the instruction back to its original type. 2731 void undo() override { 2732 LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy 2733 << "\n"); 2734 Inst->mutateType(OrigTy); 2735 } 2736 }; 2737 2738 /// Replace the uses of an instruction by another instruction. 2739 class UsesReplacer : public TypePromotionAction { 2740 /// Helper structure to keep track of the replaced uses. 2741 struct InstructionAndIdx { 2742 /// The instruction using the instruction. 2743 Instruction *Inst; 2744 2745 /// The index where this instruction is used for Inst. 2746 unsigned Idx; 2747 2748 InstructionAndIdx(Instruction *Inst, unsigned Idx) 2749 : Inst(Inst), Idx(Idx) {} 2750 }; 2751 2752 /// Keep track of the original uses (pair Instruction, Index). 2753 SmallVector<InstructionAndIdx, 4> OriginalUses; 2754 /// Keep track of the debug users. 2755 SmallVector<DbgValueInst *, 1> DbgValues; 2756 2757 using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator; 2758 2759 public: 2760 /// Replace all the use of \p Inst by \p New. 2761 UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) { 2762 LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New 2763 << "\n"); 2764 // Record the original uses. 
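      // (Each use is recorded as a (user instruction, operand index) pair so
      // that undo() can point that exact operand back at Inst; for example a
      // use as the second operand of an add is remembered as {TheAdd, 1}.)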
2765 for (Use &U : Inst->uses()) { 2766 Instruction *UserI = cast<Instruction>(U.getUser()); 2767 OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo())); 2768 } 2769 // Record the debug uses separately. They are not in the instruction's 2770 // use list, but they are replaced by RAUW. 2771 findDbgValues(DbgValues, Inst); 2772 2773 // Now, we can replace the uses. 2774 Inst->replaceAllUsesWith(New); 2775 } 2776 2777 /// Reassign the original uses of Inst to Inst. 2778 void undo() override { 2779 LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n"); 2780 for (use_iterator UseIt = OriginalUses.begin(), 2781 EndIt = OriginalUses.end(); 2782 UseIt != EndIt; ++UseIt) { 2783 UseIt->Inst->setOperand(UseIt->Idx, Inst); 2784 } 2785 // RAUW has replaced all original uses with references to the new value, 2786 // including the debug uses. Since we are undoing the replacements, 2787 // the original debug uses must also be reinstated to maintain the 2788 // correctness and utility of debug value instructions. 2789 for (auto *DVI: DbgValues) { 2790 LLVMContext &Ctx = Inst->getType()->getContext(); 2791 auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst)); 2792 DVI->setOperand(0, MV); 2793 } 2794 } 2795 }; 2796 2797 /// Remove an instruction from the IR. 2798 class InstructionRemover : public TypePromotionAction { 2799 /// Original position of the instruction. 2800 InsertionHandler Inserter; 2801 2802 /// Helper structure to hide all the link to the instruction. In other 2803 /// words, this helps to do as if the instruction was removed. 2804 OperandsHider Hider; 2805 2806 /// Keep track of the uses replaced, if any. 2807 UsesReplacer *Replacer = nullptr; 2808 2809 /// Keep track of instructions removed. 2810 SetOfInstrs &RemovedInsts; 2811 2812 public: 2813 /// Remove all reference of \p Inst and optionally replace all its 2814 /// uses with New. 2815 /// \p RemovedInsts Keep track of the instructions removed by this Action. 2816 /// \pre If !Inst->use_empty(), then New != nullptr 2817 InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, 2818 Value *New = nullptr) 2819 : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), 2820 RemovedInsts(RemovedInsts) { 2821 if (New) 2822 Replacer = new UsesReplacer(Inst, New); 2823 LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); 2824 RemovedInsts.insert(Inst); 2825 /// The instructions removed here will be freed after completing 2826 /// optimizeBlock() for all blocks as we need to keep track of the 2827 /// removed instructions during promotion. 2828 Inst->removeFromParent(); 2829 } 2830 2831 ~InstructionRemover() override { delete Replacer; } 2832 2833 /// Resurrect the instruction and reassign it to the proper uses if 2834 /// new value was provided when build this action. 2835 void undo() override { 2836 LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n"); 2837 Inserter.insert(Inst); 2838 if (Replacer) 2839 Replacer->undo(); 2840 Hider.undo(); 2841 RemovedInsts.erase(Inst); 2842 } 2843 }; 2844 2845 public: 2846 /// Restoration point. 2847 /// The restoration point is a pointer to an action instead of an iterator 2848 /// because the iterator may be invalidated but not the pointer. 2849 using ConstRestorationPt = const TypePromotionAction *; 2850 2851 TypePromotionTransaction(SetOfInstrs &RemovedInsts) 2852 : RemovedInsts(RemovedInsts) {} 2853 2854 /// Advocate every changes made in that transaction. Return true if any change 2855 /// happen. 
2856 bool commit(); 2857 2858 /// Undo all the changes made after the given point. 2859 void rollback(ConstRestorationPt Point); 2860 2861 /// Get the current restoration point. 2862 ConstRestorationPt getRestorationPoint() const; 2863 2864 /// \name API for IR modification with state keeping to support rollback. 2865 /// @{ 2866 /// Same as Instruction::setOperand. 2867 void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal); 2868 2869 /// Same as Instruction::eraseFromParent. 2870 void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr); 2871 2872 /// Same as Value::replaceAllUsesWith. 2873 void replaceAllUsesWith(Instruction *Inst, Value *New); 2874 2875 /// Same as Value::mutateType. 2876 void mutateType(Instruction *Inst, Type *NewTy); 2877 2878 /// Same as IRBuilder::createTrunc. 2879 Value *createTrunc(Instruction *Opnd, Type *Ty); 2880 2881 /// Same as IRBuilder::createSExt. 2882 Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); 2883 2884 /// Same as IRBuilder::createZExt. 2885 Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty); 2886 2887 /// Same as Instruction::moveBefore. 2888 void moveBefore(Instruction *Inst, Instruction *Before); 2889 /// @} 2890 2891 private: 2892 /// The ordered list of actions made so far. 2893 SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; 2894 2895 using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator; 2896 2897 SetOfInstrs &RemovedInsts; 2898 }; 2899 2900 } // end anonymous namespace 2901 2902 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, 2903 Value *NewVal) { 2904 Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>( 2905 Inst, Idx, NewVal)); 2906 } 2907 2908 void TypePromotionTransaction::eraseInstruction(Instruction *Inst, 2909 Value *NewVal) { 2910 Actions.push_back( 2911 std::make_unique<TypePromotionTransaction::InstructionRemover>( 2912 Inst, RemovedInsts, NewVal)); 2913 } 2914 2915 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, 2916 Value *New) { 2917 Actions.push_back( 2918 std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New)); 2919 } 2920 2921 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { 2922 Actions.push_back( 2923 std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy)); 2924 } 2925 2926 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, 2927 Type *Ty) { 2928 std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty)); 2929 Value *Val = Ptr->getBuiltValue(); 2930 Actions.push_back(std::move(Ptr)); 2931 return Val; 2932 } 2933 2934 Value *TypePromotionTransaction::createSExt(Instruction *Inst, 2935 Value *Opnd, Type *Ty) { 2936 std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty)); 2937 Value *Val = Ptr->getBuiltValue(); 2938 Actions.push_back(std::move(Ptr)); 2939 return Val; 2940 } 2941 2942 Value *TypePromotionTransaction::createZExt(Instruction *Inst, 2943 Value *Opnd, Type *Ty) { 2944 std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty)); 2945 Value *Val = Ptr->getBuiltValue(); 2946 Actions.push_back(std::move(Ptr)); 2947 return Val; 2948 } 2949 2950 void TypePromotionTransaction::moveBefore(Instruction *Inst, 2951 Instruction *Before) { 2952 Actions.push_back( 2953 std::make_unique<TypePromotionTransaction::InstructionMoveBefore>( 2954 Inst, Before)); 2955 } 2956 2957 TypePromotionTransaction::ConstRestorationPt 2958 TypePromotionTransaction::getRestorationPoint() const { 2959 return 
!Actions.empty() ? Actions.back().get() : nullptr; 2960 } 2961 2962 bool TypePromotionTransaction::commit() { 2963 for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; 2964 ++It) 2965 (*It)->commit(); 2966 bool Modified = !Actions.empty(); 2967 Actions.clear(); 2968 return Modified; 2969 } 2970 2971 void TypePromotionTransaction::rollback( 2972 TypePromotionTransaction::ConstRestorationPt Point) { 2973 while (!Actions.empty() && Point != Actions.back().get()) { 2974 std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val(); 2975 Curr->undo(); 2976 } 2977 } 2978 2979 namespace { 2980 2981 /// A helper class for matching addressing modes. 2982 /// 2983 /// This encapsulates the logic for matching the target-legal addressing modes. 2984 class AddressingModeMatcher { 2985 SmallVectorImpl<Instruction*> &AddrModeInsts; 2986 const TargetLowering &TLI; 2987 const TargetRegisterInfo &TRI; 2988 const DataLayout &DL; 2989 2990 /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and 2991 /// the memory instruction that we're computing this address for. 2992 Type *AccessTy; 2993 unsigned AddrSpace; 2994 Instruction *MemoryInst; 2995 2996 /// This is the addressing mode that we're building up. This is 2997 /// part of the return value of this addressing mode matching stuff. 2998 ExtAddrMode &AddrMode; 2999 3000 /// The instructions inserted by other CodeGenPrepare optimizations. 3001 const SetOfInstrs &InsertedInsts; 3002 3003 /// A map from the instructions to their type before promotion. 3004 InstrToOrigTy &PromotedInsts; 3005 3006 /// The ongoing transaction where every action should be registered. 3007 TypePromotionTransaction &TPT; 3008 3009 // A GEP which has too large offset to be folded into the addressing mode. 3010 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP; 3011 3012 /// This is set to true when we should not do profitability checks. 3013 /// When true, IsProfitableToFoldIntoAddressingMode always returns true. 3014 bool IgnoreProfitability; 3015 3016 /// True if we are optimizing for size. 3017 bool OptSize; 3018 3019 ProfileSummaryInfo *PSI; 3020 BlockFrequencyInfo *BFI; 3021 3022 AddressingModeMatcher( 3023 SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI, 3024 const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, 3025 ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, 3026 InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, 3027 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP, 3028 bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) 3029 : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), 3030 DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), 3031 MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), 3032 PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), 3033 OptSize(OptSize), PSI(PSI), BFI(BFI) { 3034 IgnoreProfitability = false; 3035 } 3036 3037 public: 3038 /// Find the maximal addressing mode that a load/store of V can fold, 3039 /// give an access type of AccessTy. This returns a list of involved 3040 /// instructions in AddrModeInsts. 3041 /// \p InsertedInsts The instructions inserted by other CodeGenPrepare 3042 /// optimizations. 3043 /// \p PromotedInsts maps the instructions to their type before promotion. 3044 /// \p The ongoing transaction where every action should be registered. 
3045 static ExtAddrMode 3046 Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, 3047 SmallVectorImpl<Instruction *> &AddrModeInsts, 3048 const TargetLowering &TLI, const TargetRegisterInfo &TRI, 3049 const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, 3050 TypePromotionTransaction &TPT, 3051 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP, 3052 bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { 3053 ExtAddrMode Result; 3054 3055 bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, 3056 MemoryInst, Result, InsertedInsts, 3057 PromotedInsts, TPT, LargeOffsetGEP, 3058 OptSize, PSI, BFI) 3059 .matchAddr(V, 0); 3060 (void)Success; assert(Success && "Couldn't select *anything*?"); 3061 return Result; 3062 } 3063 3064 private: 3065 bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); 3066 bool matchAddr(Value *Addr, unsigned Depth); 3067 bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth, 3068 bool *MovedAway = nullptr); 3069 bool isProfitableToFoldIntoAddressingMode(Instruction *I, 3070 ExtAddrMode &AMBefore, 3071 ExtAddrMode &AMAfter); 3072 bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); 3073 bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, 3074 Value *PromotedOperand) const; 3075 }; 3076 3077 class PhiNodeSet; 3078 3079 /// An iterator for PhiNodeSet. 3080 class PhiNodeSetIterator { 3081 PhiNodeSet * const Set; 3082 size_t CurrentIndex = 0; 3083 3084 public: 3085 /// The constructor. Start should point to either a valid element, or be equal 3086 /// to the size of the underlying SmallVector of the PhiNodeSet. 3087 PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start); 3088 PHINode * operator*() const; 3089 PhiNodeSetIterator& operator++(); 3090 bool operator==(const PhiNodeSetIterator &RHS) const; 3091 bool operator!=(const PhiNodeSetIterator &RHS) const; 3092 }; 3093 3094 /// Keeps a set of PHINodes. 3095 /// 3096 /// This is a minimal set implementation for a specific use case: 3097 /// It is very fast when there are very few elements, but also provides good 3098 /// performance when there are many. It is similar to SmallPtrSet, but also 3099 /// provides iteration by insertion order, which is deterministic and stable 3100 /// across runs. It is also similar to SmallSetVector, but provides removing 3101 /// elements in O(1) time. This is achieved by not actually removing the element 3102 /// from the underlying vector, so comes at the cost of using more memory, but 3103 /// that is fine, since PhiNodeSets are used as short lived objects. 3104 class PhiNodeSet { 3105 friend class PhiNodeSetIterator; 3106 3107 using MapType = SmallDenseMap<PHINode *, size_t, 32>; 3108 using iterator = PhiNodeSetIterator; 3109 3110 /// Keeps the elements in the order of their insertion in the underlying 3111 /// vector. To achieve constant time removal, it never deletes any element. 3112 SmallVector<PHINode *, 32> NodeList; 3113 3114 /// Keeps the elements in the underlying set implementation. This (and not the 3115 /// NodeList defined above) is the source of truth on whether an element 3116 /// is actually in the collection. 3117 MapType NodeMap; 3118 3119 /// Points to the first valid (not deleted) element when the set is not empty 3120 /// and the value is not zero. Equals to the size of the underlying vector 3121 /// when the set is empty. When the value is 0, as in the beginning, the 3122 /// first element may or may not be valid. 
3123 size_t FirstValidElement = 0; 3124 3125 public: 3126 /// Inserts a new element to the collection. 3127 /// \returns true if the element is actually added, i.e. was not in the 3128 /// collection before the operation. 3129 bool insert(PHINode *Ptr) { 3130 if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) { 3131 NodeList.push_back(Ptr); 3132 return true; 3133 } 3134 return false; 3135 } 3136 3137 /// Removes the element from the collection. 3138 /// \returns whether the element is actually removed, i.e. was in the 3139 /// collection before the operation. 3140 bool erase(PHINode *Ptr) { 3141 auto it = NodeMap.find(Ptr); 3142 if (it != NodeMap.end()) { 3143 NodeMap.erase(Ptr); 3144 SkipRemovedElements(FirstValidElement); 3145 return true; 3146 } 3147 return false; 3148 } 3149 3150 /// Removes all elements and clears the collection. 3151 void clear() { 3152 NodeMap.clear(); 3153 NodeList.clear(); 3154 FirstValidElement = 0; 3155 } 3156 3157 /// \returns an iterator that will iterate the elements in the order of 3158 /// insertion. 3159 iterator begin() { 3160 if (FirstValidElement == 0) 3161 SkipRemovedElements(FirstValidElement); 3162 return PhiNodeSetIterator(this, FirstValidElement); 3163 } 3164 3165 /// \returns an iterator that points to the end of the collection. 3166 iterator end() { return PhiNodeSetIterator(this, NodeList.size()); } 3167 3168 /// Returns the number of elements in the collection. 3169 size_t size() const { 3170 return NodeMap.size(); 3171 } 3172 3173 /// \returns 1 if the given element is in the collection, and 0 if otherwise. 3174 size_t count(PHINode *Ptr) const { 3175 return NodeMap.count(Ptr); 3176 } 3177 3178 private: 3179 /// Updates the CurrentIndex so that it will point to a valid element. 3180 /// 3181 /// If the element of NodeList at CurrentIndex is valid, it does not 3182 /// change it. If there are no more valid elements, it updates CurrentIndex 3183 /// to point to the end of the NodeList. 3184 void SkipRemovedElements(size_t &CurrentIndex) { 3185 while (CurrentIndex < NodeList.size()) { 3186 auto it = NodeMap.find(NodeList[CurrentIndex]); 3187 // If the element has been deleted and added again later, NodeMap will 3188 // point to a different index, so CurrentIndex will still be invalid. 3189 if (it != NodeMap.end() && it->second == CurrentIndex) 3190 break; 3191 ++CurrentIndex; 3192 } 3193 } 3194 }; 3195 3196 PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start) 3197 : Set(Set), CurrentIndex(Start) {} 3198 3199 PHINode * PhiNodeSetIterator::operator*() const { 3200 assert(CurrentIndex < Set->NodeList.size() && 3201 "PhiNodeSet access out of range"); 3202 return Set->NodeList[CurrentIndex]; 3203 } 3204 3205 PhiNodeSetIterator& PhiNodeSetIterator::operator++() { 3206 assert(CurrentIndex < Set->NodeList.size() && 3207 "PhiNodeSet access out of range"); 3208 ++CurrentIndex; 3209 Set->SkipRemovedElements(CurrentIndex); 3210 return *this; 3211 } 3212 3213 bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const { 3214 return CurrentIndex == RHS.CurrentIndex; 3215 } 3216 3217 bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const { 3218 return !((*this) == RHS); 3219 } 3220 3221 /// Keep track of simplification of Phi nodes. 3222 /// Accept the set of all phi nodes and erase phi node from this set 3223 /// if it is simplified. 3224 class SimplificationTracker { 3225 DenseMap<Value *, Value *> Storage; 3226 const SimplifyQuery &SQ; 3227 // Tracks newly created Phi nodes. 
The elements are iterated by insertion 3228 // order. 3229 PhiNodeSet AllPhiNodes; 3230 // Tracks newly created Select nodes. 3231 SmallPtrSet<SelectInst *, 32> AllSelectNodes; 3232 3233 public: 3234 SimplificationTracker(const SimplifyQuery &sq) 3235 : SQ(sq) {} 3236 3237 Value *Get(Value *V) { 3238 do { 3239 auto SV = Storage.find(V); 3240 if (SV == Storage.end()) 3241 return V; 3242 V = SV->second; 3243 } while (true); 3244 } 3245 3246 Value *Simplify(Value *Val) { 3247 SmallVector<Value *, 32> WorkList; 3248 SmallPtrSet<Value *, 32> Visited; 3249 WorkList.push_back(Val); 3250 while (!WorkList.empty()) { 3251 auto *P = WorkList.pop_back_val(); 3252 if (!Visited.insert(P).second) 3253 continue; 3254 if (auto *PI = dyn_cast<Instruction>(P)) 3255 if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) { 3256 for (auto *U : PI->users()) 3257 WorkList.push_back(cast<Value>(U)); 3258 Put(PI, V); 3259 PI->replaceAllUsesWith(V); 3260 if (auto *PHI = dyn_cast<PHINode>(PI)) 3261 AllPhiNodes.erase(PHI); 3262 if (auto *Select = dyn_cast<SelectInst>(PI)) 3263 AllSelectNodes.erase(Select); 3264 PI->eraseFromParent(); 3265 } 3266 } 3267 return Get(Val); 3268 } 3269 3270 void Put(Value *From, Value *To) { 3271 Storage.insert({ From, To }); 3272 } 3273 3274 void ReplacePhi(PHINode *From, PHINode *To) { 3275 Value* OldReplacement = Get(From); 3276 while (OldReplacement != From) { 3277 From = To; 3278 To = dyn_cast<PHINode>(OldReplacement); 3279 OldReplacement = Get(From); 3280 } 3281 assert(To && Get(To) == To && "Replacement PHI node is already replaced."); 3282 Put(From, To); 3283 From->replaceAllUsesWith(To); 3284 AllPhiNodes.erase(From); 3285 From->eraseFromParent(); 3286 } 3287 3288 PhiNodeSet& newPhiNodes() { return AllPhiNodes; } 3289 3290 void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); } 3291 3292 void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); } 3293 3294 unsigned countNewPhiNodes() const { return AllPhiNodes.size(); } 3295 3296 unsigned countNewSelectNodes() const { return AllSelectNodes.size(); } 3297 3298 void destroyNewNodes(Type *CommonType) { 3299 // For safe erasing, replace the uses with dummy value first. 3300 auto *Dummy = UndefValue::get(CommonType); 3301 for (auto *I : AllPhiNodes) { 3302 I->replaceAllUsesWith(Dummy); 3303 I->eraseFromParent(); 3304 } 3305 AllPhiNodes.clear(); 3306 for (auto *I : AllSelectNodes) { 3307 I->replaceAllUsesWith(Dummy); 3308 I->eraseFromParent(); 3309 } 3310 AllSelectNodes.clear(); 3311 } 3312 }; 3313 3314 /// A helper class for combining addressing modes. 3315 class AddressingModeCombiner { 3316 typedef DenseMap<Value *, Value *> FoldAddrToValueMapping; 3317 typedef std::pair<PHINode *, PHINode *> PHIPair; 3318 3319 private: 3320 /// The addressing modes we've collected. 3321 SmallVector<ExtAddrMode, 16> AddrModes; 3322 3323 /// The field in which the AddrModes differ, when we have more than one. 3324 ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField; 3325 3326 /// Are the AddrModes that we have all just equal to their original values? 3327 bool AllAddrModesTrivial = true; 3328 3329 /// Common Type for all different fields in addressing modes. 3330 Type *CommonType; 3331 3332 /// SimplifyQuery for simplifyInstruction utility. 3333 const SimplifyQuery &SQ; 3334 3335 /// Original Address. 
3336 Value *Original; 3337 3338 public: 3339 AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue) 3340 : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {} 3341 3342 /// Get the combined AddrMode 3343 const ExtAddrMode &getAddrMode() const { 3344 return AddrModes[0]; 3345 } 3346 3347 /// Add a new AddrMode if it's compatible with the AddrModes we already 3348 /// have. 3349 /// \return True iff we succeeded in doing so. 3350 bool addNewAddrMode(ExtAddrMode &NewAddrMode) { 3351 // Take note of if we have any non-trivial AddrModes, as we need to detect 3352 // when all AddrModes are trivial as then we would introduce a phi or select 3353 // which just duplicates what's already there. 3354 AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial(); 3355 3356 // If this is the first addrmode then everything is fine. 3357 if (AddrModes.empty()) { 3358 AddrModes.emplace_back(NewAddrMode); 3359 return true; 3360 } 3361 3362 // Figure out how different this is from the other address modes, which we 3363 // can do just by comparing against the first one given that we only care 3364 // about the cumulative difference. 3365 ExtAddrMode::FieldName ThisDifferentField = 3366 AddrModes[0].compare(NewAddrMode); 3367 if (DifferentField == ExtAddrMode::NoField) 3368 DifferentField = ThisDifferentField; 3369 else if (DifferentField != ThisDifferentField) 3370 DifferentField = ExtAddrMode::MultipleFields; 3371 3372 // If NewAddrMode differs in more than one dimension we cannot handle it. 3373 bool CanHandle = DifferentField != ExtAddrMode::MultipleFields; 3374 3375 // If Scale Field is different then we reject. 3376 CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField; 3377 3378 // We also must reject the case when base offset is different and 3379 // scale reg is not null, we cannot handle this case due to merge of 3380 // different offsets will be used as ScaleReg. 3381 CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField || 3382 !NewAddrMode.ScaledReg); 3383 3384 // We also must reject the case when GV is different and BaseReg installed 3385 // due to we want to use base reg as a merge of GV values. 3386 CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField || 3387 !NewAddrMode.HasBaseReg); 3388 3389 // Even if NewAddMode is the same we still need to collect it due to 3390 // original value is different. And later we will need all original values 3391 // as anchors during finding the common Phi node. 3392 if (CanHandle) 3393 AddrModes.emplace_back(NewAddrMode); 3394 else 3395 AddrModes.clear(); 3396 3397 return CanHandle; 3398 } 3399 3400 /// Combine the addressing modes we've collected into a single 3401 /// addressing mode. 3402 /// \return True iff we successfully combined them or we only had one so 3403 /// didn't need to combine them anyway. 3404 bool combineAddrModes() { 3405 // If we have no AddrModes then they can't be combined. 3406 if (AddrModes.size() == 0) 3407 return false; 3408 3409 // A single AddrMode can trivially be combined. 3410 if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField) 3411 return true; 3412 3413 // If the AddrModes we collected are all just equal to the value they are 3414 // derived from then combining them wouldn't do anything useful. 3415 if (AllAddrModesTrivial) 3416 return false; 3417 3418 if (!addrModeCombiningAllowed()) 3419 return false; 3420 3421 // Build a map between <original value, basic block where we saw it> to 3422 // value of base register. 
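    // Illustrative example, mirroring the one in findCommon() below: with
    //   AddrModes[0] = {BaseReg = b1, BaseOffs = 40}  reached via BB1
    //   AddrModes[1] = {BaseReg = b2, BaseOffs = 40}  reached via BB2
    // DifferentField is BaseRegField and the combined mode ends up using
    //   phi [b1, BB1], [b2, BB2]
    // as its base register.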
3423 // Bail out if there is no common type. 3424 FoldAddrToValueMapping Map; 3425 if (!initializeMap(Map)) 3426 return false; 3427 3428 Value *CommonValue = findCommon(Map); 3429 if (CommonValue) 3430 AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes); 3431 return CommonValue != nullptr; 3432 } 3433 3434 private: 3435 /// Initialize Map with anchor values. For address seen 3436 /// we set the value of different field saw in this address. 3437 /// At the same time we find a common type for different field we will 3438 /// use to create new Phi/Select nodes. Keep it in CommonType field. 3439 /// Return false if there is no common type found. 3440 bool initializeMap(FoldAddrToValueMapping &Map) { 3441 // Keep track of keys where the value is null. We will need to replace it 3442 // with constant null when we know the common type. 3443 SmallVector<Value *, 2> NullValue; 3444 Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType()); 3445 for (auto &AM : AddrModes) { 3446 Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy); 3447 if (DV) { 3448 auto *Type = DV->getType(); 3449 if (CommonType && CommonType != Type) 3450 return false; 3451 CommonType = Type; 3452 Map[AM.OriginalValue] = DV; 3453 } else { 3454 NullValue.push_back(AM.OriginalValue); 3455 } 3456 } 3457 assert(CommonType && "At least one non-null value must be!"); 3458 for (auto *V : NullValue) 3459 Map[V] = Constant::getNullValue(CommonType); 3460 return true; 3461 } 3462 3463 /// We have mapping between value A and other value B where B was a field in 3464 /// addressing mode represented by A. Also we have an original value C 3465 /// representing an address we start with. Traversing from C through phi and 3466 /// selects we ended up with A's in a map. This utility function tries to find 3467 /// a value V which is a field in addressing mode C and traversing through phi 3468 /// nodes and selects we will end up in corresponded values B in a map. 3469 /// The utility will create a new Phi/Selects if needed. 3470 // The simple example looks as follows: 3471 // BB1: 3472 // p1 = b1 + 40 3473 // br cond BB2, BB3 3474 // BB2: 3475 // p2 = b2 + 40 3476 // br BB3 3477 // BB3: 3478 // p = phi [p1, BB1], [p2, BB2] 3479 // v = load p 3480 // Map is 3481 // p1 -> b1 3482 // p2 -> b2 3483 // Request is 3484 // p -> ? 3485 // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3. 3486 Value *findCommon(FoldAddrToValueMapping &Map) { 3487 // Tracks the simplification of newly created phi nodes. The reason we use 3488 // this mapping is because we will add new created Phi nodes in AddrToBase. 3489 // Simplification of Phi nodes is recursive, so some Phi node may 3490 // be simplified after we added it to AddrToBase. In reality this 3491 // simplification is possible only if original phi/selects were not 3492 // simplified yet. 3493 // Using this mapping we can find the current value in AddrToBase. 3494 SimplificationTracker ST(SQ); 3495 3496 // First step, DFS to create PHI nodes for all intermediate blocks. 3497 // Also fill traverse order for the second step. 3498 SmallVector<Value *, 32> TraverseOrder; 3499 InsertPlaceholders(Map, TraverseOrder, ST); 3500 3501 // Second Step, fill new nodes by merged values and simplify if possible. 3502 FillPlaceholders(Map, TraverseOrder, ST); 3503 3504 if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) { 3505 ST.destroyNewNodes(CommonType); 3506 return nullptr; 3507 } 3508 3509 // Now we'd like to match New Phi nodes to existed ones. 
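    // (Illustrative: if the block already contains an equivalent
    //  "phi [b1, BB1], [b2, BB2]", MatchPhiSet maps the freshly created
    //  "sunk_phi" onto it via ST.ReplacePhi and the duplicate is erased;
    //  otherwise the new phi is kept, subject to AddrSinkNewPhis.)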
3510     unsigned PhiNotMatchedCount = 0;
3511     if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
3512       ST.destroyNewNodes(CommonType);
3513       return nullptr;
3514     }
3515
3516     auto *Result = ST.Get(Map.find(Original)->second);
3517     if (Result) {
3518       NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
3519       NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
3520     }
3521     return Result;
3522   }
3523
3524   /// Try to match PHI node to Candidate.
3525   /// Matcher tracks the matched Phi nodes.
3526   bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
3527                     SmallSetVector<PHIPair, 8> &Matcher,
3528                     PhiNodeSet &PhiNodesToMatch) {
3529     SmallVector<PHIPair, 8> WorkList;
3530     Matcher.insert({ PHI, Candidate });
3531     SmallSet<PHINode *, 8> MatchedPHIs;
3532     MatchedPHIs.insert(PHI);
3533     WorkList.push_back({ PHI, Candidate });
3534     SmallSet<PHIPair, 8> Visited;
3535     while (!WorkList.empty()) {
3536       auto Item = WorkList.pop_back_val();
3537       if (!Visited.insert(Item).second)
3538         continue;
3539       // We iterate over all incoming values of the two Phis to compare them.
3540       // If the values differ, both of them are Phis, the first one is a Phi
3541       // we added (subject to match), and both live in the same basic block,
3542       // then the pair can still match provided those incoming values match.
3543       // So we tentatively record the match and queue it for verification.
3544       for (auto B : Item.first->blocks()) {
3545         Value *FirstValue = Item.first->getIncomingValueForBlock(B);
3546         Value *SecondValue = Item.second->getIncomingValueForBlock(B);
3547         if (FirstValue == SecondValue)
3548           continue;
3549
3550         PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
3551         PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
3552
3553         // If one of them is not a Phi, or the first one is not a Phi node
3554         // from the set we'd like to match, or the two Phi nodes live in
3555         // different basic blocks, then
3556         // we will not be able to match.
3557         if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
3558             FirstPhi->getParent() != SecondPhi->getParent())
3559           return false;
3560
3561         // If we already matched them then continue.
3562         if (Matcher.count({ FirstPhi, SecondPhi }))
3563           continue;
3564         // The values are different and do not match yet, so we need them to
3565         // match. (But we register no more than one match per PHI node, so
3566         // that we won't later try to replace it twice.)
3567         if (MatchedPHIs.insert(FirstPhi).second)
3568           Matcher.insert({ FirstPhi, SecondPhi });
3569         // But we must still check it.
3570         WorkList.push_back({ FirstPhi, SecondPhi });
3571       }
3572     }
3573     return true;
3574   }
3575
3576   /// For the given set of PHI nodes (in the SimplificationTracker) try
3577   /// to find their equivalents.
3578   /// Returns false if this matching fails and creation of new Phis is disabled.
3579   bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
3580                    unsigned &PhiNotMatchedCount) {
3581     // Matched and PhiNodesToMatch iterate their elements in a deterministic
3582     // order, so the replacements (ReplacePhi) are also done in a deterministic
3583     // order.
3584     SmallSetVector<PHIPair, 8> Matched;
3585     SmallPtrSet<PHINode *, 8> WillNotMatch;
3586     PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
3587     while (PhiNodesToMatch.size()) {
3588       PHINode *PHI = *PhiNodesToMatch.begin();
3589
3590       // Add PHI itself first: if no Phi in the block matches, it won't match.
3591       WillNotMatch.clear();
3592       WillNotMatch.insert(PHI);
3593
3594       // Traverse all Phis until we find an equivalent one or fail to do so.
3595 bool IsMatched = false; 3596 for (auto &P : PHI->getParent()->phis()) { 3597 if (&P == PHI) 3598 continue; 3599 if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch))) 3600 break; 3601 // If it does not match, collect all Phi nodes from matcher. 3602 // if we end up with no match, them all these Phi nodes will not match 3603 // later. 3604 for (auto M : Matched) 3605 WillNotMatch.insert(M.first); 3606 Matched.clear(); 3607 } 3608 if (IsMatched) { 3609 // Replace all matched values and erase them. 3610 for (auto MV : Matched) 3611 ST.ReplacePhi(MV.first, MV.second); 3612 Matched.clear(); 3613 continue; 3614 } 3615 // If we are not allowed to create new nodes then bail out. 3616 if (!AllowNewPhiNodes) 3617 return false; 3618 // Just remove all seen values in matcher. They will not match anything. 3619 PhiNotMatchedCount += WillNotMatch.size(); 3620 for (auto *P : WillNotMatch) 3621 PhiNodesToMatch.erase(P); 3622 } 3623 return true; 3624 } 3625 /// Fill the placeholders with values from predecessors and simplify them. 3626 void FillPlaceholders(FoldAddrToValueMapping &Map, 3627 SmallVectorImpl<Value *> &TraverseOrder, 3628 SimplificationTracker &ST) { 3629 while (!TraverseOrder.empty()) { 3630 Value *Current = TraverseOrder.pop_back_val(); 3631 assert(Map.find(Current) != Map.end() && "No node to fill!!!"); 3632 Value *V = Map[Current]; 3633 3634 if (SelectInst *Select = dyn_cast<SelectInst>(V)) { 3635 // CurrentValue also must be Select. 3636 auto *CurrentSelect = cast<SelectInst>(Current); 3637 auto *TrueValue = CurrentSelect->getTrueValue(); 3638 assert(Map.find(TrueValue) != Map.end() && "No True Value!"); 3639 Select->setTrueValue(ST.Get(Map[TrueValue])); 3640 auto *FalseValue = CurrentSelect->getFalseValue(); 3641 assert(Map.find(FalseValue) != Map.end() && "No False Value!"); 3642 Select->setFalseValue(ST.Get(Map[FalseValue])); 3643 } else { 3644 // Must be a Phi node then. 3645 auto *PHI = cast<PHINode>(V); 3646 // Fill the Phi node with values from predecessors. 3647 for (auto *B : predecessors(PHI->getParent())) { 3648 Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B); 3649 assert(Map.find(PV) != Map.end() && "No predecessor Value!"); 3650 PHI->addIncoming(ST.Get(Map[PV]), B); 3651 } 3652 } 3653 Map[Current] = ST.Simplify(V); 3654 } 3655 } 3656 3657 /// Starting from original value recursively iterates over def-use chain up to 3658 /// known ending values represented in a map. For each traversed phi/select 3659 /// inserts a placeholder Phi or Select. 3660 /// Reports all new created Phi/Select nodes by adding them to set. 3661 /// Also reports and order in what values have been traversed. 3662 void InsertPlaceholders(FoldAddrToValueMapping &Map, 3663 SmallVectorImpl<Value *> &TraverseOrder, 3664 SimplificationTracker &ST) { 3665 SmallVector<Value *, 32> Worklist; 3666 assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) && 3667 "Address must be a Phi or Select node"); 3668 auto *Dummy = UndefValue::get(CommonType); 3669 Worklist.push_back(Original); 3670 while (!Worklist.empty()) { 3671 Value *Current = Worklist.pop_back_val(); 3672 // if it is already visited or it is an ending value then skip it. 3673 if (Map.find(Current) != Map.end()) 3674 continue; 3675 TraverseOrder.push_back(Current); 3676 3677 // CurrentValue must be a Phi node or select. All others must be covered 3678 // by anchors. 3679 if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) { 3680 // Is it OK to get metadata from OrigSelect?! 
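        // (Illustrative: for "%addr = select i1 %c, %t, %f" the placeholder
        //  created below is "select i1 %c, Dummy, Dummy" of type CommonType;
        //  FillPlaceholders() later rewires its true/false operands to the
        //  values that %t and %f map to.)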
3681 // Create a Select placeholder with dummy value. 3682 SelectInst *Select = SelectInst::Create( 3683 CurrentSelect->getCondition(), Dummy, Dummy, 3684 CurrentSelect->getName(), CurrentSelect, CurrentSelect); 3685 Map[Current] = Select; 3686 ST.insertNewSelect(Select); 3687 // We are interested in True and False values. 3688 Worklist.push_back(CurrentSelect->getTrueValue()); 3689 Worklist.push_back(CurrentSelect->getFalseValue()); 3690 } else { 3691 // It must be a Phi node then. 3692 PHINode *CurrentPhi = cast<PHINode>(Current); 3693 unsigned PredCount = CurrentPhi->getNumIncomingValues(); 3694 PHINode *PHI = 3695 PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi); 3696 Map[Current] = PHI; 3697 ST.insertNewPhi(PHI); 3698 for (Value *P : CurrentPhi->incoming_values()) 3699 Worklist.push_back(P); 3700 } 3701 } 3702 } 3703 3704 bool addrModeCombiningAllowed() { 3705 if (DisableComplexAddrModes) 3706 return false; 3707 switch (DifferentField) { 3708 default: 3709 return false; 3710 case ExtAddrMode::BaseRegField: 3711 return AddrSinkCombineBaseReg; 3712 case ExtAddrMode::BaseGVField: 3713 return AddrSinkCombineBaseGV; 3714 case ExtAddrMode::BaseOffsField: 3715 return AddrSinkCombineBaseOffs; 3716 case ExtAddrMode::ScaledRegField: 3717 return AddrSinkCombineScaledReg; 3718 } 3719 } 3720 }; 3721 } // end anonymous namespace 3722 3723 /// Try adding ScaleReg*Scale to the current addressing mode. 3724 /// Return true and update AddrMode if this addr mode is legal for the target, 3725 /// false if not. 3726 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, 3727 unsigned Depth) { 3728 // If Scale is 1, then this is the same as adding ScaleReg to the addressing 3729 // mode. Just process that directly. 3730 if (Scale == 1) 3731 return matchAddr(ScaleReg, Depth); 3732 3733 // If the scale is 0, it takes nothing to add this. 3734 if (Scale == 0) 3735 return true; 3736 3737 // If we already have a scale of this value, we can add to it, otherwise, we 3738 // need an available scale field. 3739 if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) 3740 return false; 3741 3742 ExtAddrMode TestAddrMode = AddrMode; 3743 3744 // Add scale to turn X*4+X*3 -> X*7. This could also do things like 3745 // [A+B + A*7] -> [B+A*8]. 3746 TestAddrMode.Scale += Scale; 3747 TestAddrMode.ScaledReg = ScaleReg; 3748 3749 // If the new address isn't legal, bail out. 3750 if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) 3751 return false; 3752 3753 // It was legal, so commit it. 3754 AddrMode = TestAddrMode; 3755 3756 // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now 3757 // to see if ScaleReg is actually X+C. If so, we can turn this into adding 3758 // X*Scale + C*Scale to addr mode. 3759 ConstantInt *CI = nullptr; Value *AddLHS = nullptr; 3760 if (isa<Instruction>(ScaleReg) && // not a constant expr. 3761 match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) && 3762 CI->getValue().isSignedIntN(64)) { 3763 TestAddrMode.InBounds = false; 3764 TestAddrMode.ScaledReg = AddLHS; 3765 TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale; 3766 3767 // If this addressing mode is legal, commit it and remember that we folded 3768 // this instruction. 3769 if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) { 3770 AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); 3771 AddrMode = TestAddrMode; 3772 return true; 3773 } 3774 } 3775 3776 // Otherwise, not (x+c)*scale, just return what we have. 
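  // (Worked example of the fold above, for illustration: matching
  //  "%i = add i64 %x, 3" as ScaleReg with Scale == 4 first commits
  //  {ScaledReg = %i, Scale = 4}; the pattern match then tries
  //  {ScaledReg = %x, Scale = 4, BaseOffs += 3 * 4} and keeps that form only
  //  if the target reports it as a legal addressing mode.)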
3777 return true; 3778 } 3779 3780 /// This is a little filter, which returns true if an addressing computation 3781 /// involving I might be folded into a load/store accessing it. 3782 /// This doesn't need to be perfect, but needs to accept at least 3783 /// the set of instructions that MatchOperationAddr can. 3784 static bool MightBeFoldableInst(Instruction *I) { 3785 switch (I->getOpcode()) { 3786 case Instruction::BitCast: 3787 case Instruction::AddrSpaceCast: 3788 // Don't touch identity bitcasts. 3789 if (I->getType() == I->getOperand(0)->getType()) 3790 return false; 3791 return I->getType()->isIntOrPtrTy(); 3792 case Instruction::PtrToInt: 3793 // PtrToInt is always a noop, as we know that the int type is pointer sized. 3794 return true; 3795 case Instruction::IntToPtr: 3796 // We know the input is intptr_t, so this is foldable. 3797 return true; 3798 case Instruction::Add: 3799 return true; 3800 case Instruction::Mul: 3801 case Instruction::Shl: 3802 // Can only handle X*C and X << C. 3803 return isa<ConstantInt>(I->getOperand(1)); 3804 case Instruction::GetElementPtr: 3805 return true; 3806 default: 3807 return false; 3808 } 3809 } 3810 3811 /// Check whether or not \p Val is a legal instruction for \p TLI. 3812 /// \note \p Val is assumed to be the product of some type promotion. 3813 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed 3814 /// to be legal, as the non-promoted value would have had the same state. 3815 static bool isPromotedInstructionLegal(const TargetLowering &TLI, 3816 const DataLayout &DL, Value *Val) { 3817 Instruction *PromotedInst = dyn_cast<Instruction>(Val); 3818 if (!PromotedInst) 3819 return false; 3820 int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); 3821 // If the ISDOpcode is undefined, it was undefined before the promotion. 3822 if (!ISDOpcode) 3823 return true; 3824 // Otherwise, check if the promoted instruction is legal or not. 3825 return TLI.isOperationLegalOrCustom( 3826 ISDOpcode, TLI.getValueType(DL, PromotedInst->getType())); 3827 } 3828 3829 namespace { 3830 3831 /// Hepler class to perform type promotion. 3832 class TypePromotionHelper { 3833 /// Utility function to add a promoted instruction \p ExtOpnd to 3834 /// \p PromotedInsts and record the type of extension we have seen. 3835 static void addPromotedInst(InstrToOrigTy &PromotedInsts, 3836 Instruction *ExtOpnd, 3837 bool IsSExt) { 3838 ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension; 3839 InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd); 3840 if (It != PromotedInsts.end()) { 3841 // If the new extension is same as original, the information in 3842 // PromotedInsts[ExtOpnd] is still correct. 3843 if (It->second.getInt() == ExtTy) 3844 return; 3845 3846 // Now the new extension is different from old extension, we make 3847 // the type information invalid by setting extension type to 3848 // BothExtension. 3849 ExtTy = BothExtension; 3850 } 3851 PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy); 3852 } 3853 3854 /// Utility function to query the original type of instruction \p Opnd 3855 /// with a matched extension type. If the extension doesn't match, we 3856 /// cannot use the information we had on the original type. 3857 /// BothExtension doesn't match any extension type. 3858 static const Type *getOrigType(const InstrToOrigTy &PromotedInsts, 3859 Instruction *Opnd, 3860 bool IsSExt) { 3861 ExtType ExtTy = IsSExt ? 
SignExtension : ZeroExtension; 3862 InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); 3863 if (It != PromotedInsts.end() && It->second.getInt() == ExtTy) 3864 return It->second.getPointer(); 3865 return nullptr; 3866 } 3867 3868 /// Utility function to check whether or not a sign or zero extension 3869 /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by 3870 /// either using the operands of \p Inst or promoting \p Inst. 3871 /// The type of the extension is defined by \p IsSExt. 3872 /// In other words, check if: 3873 /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType. 3874 /// #1 Promotion applies: 3875 /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...). 3876 /// #2 Operand reuses: 3877 /// ext opnd1 to ConsideredExtType. 3878 /// \p PromotedInsts maps the instructions to their type before promotion. 3879 static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType, 3880 const InstrToOrigTy &PromotedInsts, bool IsSExt); 3881 3882 /// Utility function to determine if \p OpIdx should be promoted when 3883 /// promoting \p Inst. 3884 static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { 3885 return !(isa<SelectInst>(Inst) && OpIdx == 0); 3886 } 3887 3888 /// Utility function to promote the operand of \p Ext when this 3889 /// operand is a promotable trunc or sext or zext. 3890 /// \p PromotedInsts maps the instructions to their type before promotion. 3891 /// \p CreatedInstsCost[out] contains the cost of all instructions 3892 /// created to promote the operand of Ext. 3893 /// Newly added extensions are inserted in \p Exts. 3894 /// Newly added truncates are inserted in \p Truncs. 3895 /// Should never be called directly. 3896 /// \return The promoted value which is used instead of Ext. 3897 static Value *promoteOperandForTruncAndAnyExt( 3898 Instruction *Ext, TypePromotionTransaction &TPT, 3899 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3900 SmallVectorImpl<Instruction *> *Exts, 3901 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI); 3902 3903 /// Utility function to promote the operand of \p Ext when this 3904 /// operand is promotable and is not a supported trunc or sext. 3905 /// \p PromotedInsts maps the instructions to their type before promotion. 3906 /// \p CreatedInstsCost[out] contains the cost of all the instructions 3907 /// created to promote the operand of Ext. 3908 /// Newly added extensions are inserted in \p Exts. 3909 /// Newly added truncates are inserted in \p Truncs. 3910 /// Should never be called directly. 3911 /// \return The promoted value which is used instead of Ext. 3912 static Value *promoteOperandForOther(Instruction *Ext, 3913 TypePromotionTransaction &TPT, 3914 InstrToOrigTy &PromotedInsts, 3915 unsigned &CreatedInstsCost, 3916 SmallVectorImpl<Instruction *> *Exts, 3917 SmallVectorImpl<Instruction *> *Truncs, 3918 const TargetLowering &TLI, bool IsSExt); 3919 3920 /// \see promoteOperandForOther. 3921 static Value *signExtendOperandForOther( 3922 Instruction *Ext, TypePromotionTransaction &TPT, 3923 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3924 SmallVectorImpl<Instruction *> *Exts, 3925 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 3926 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, 3927 Exts, Truncs, TLI, true); 3928 } 3929 3930 /// \see promoteOperandForOther. 
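  /// (Illustrative: getAction() returns this wrapper when a ZExtInst feeds a
  /// promotable instruction that is not itself a trunc/ext, e.g. the zext of
  /// an "add nuw".)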
3931 static Value *zeroExtendOperandForOther( 3932 Instruction *Ext, TypePromotionTransaction &TPT, 3933 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3934 SmallVectorImpl<Instruction *> *Exts, 3935 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 3936 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, 3937 Exts, Truncs, TLI, false); 3938 } 3939 3940 public: 3941 /// Type for the utility function that promotes the operand of Ext. 3942 using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT, 3943 InstrToOrigTy &PromotedInsts, 3944 unsigned &CreatedInstsCost, 3945 SmallVectorImpl<Instruction *> *Exts, 3946 SmallVectorImpl<Instruction *> *Truncs, 3947 const TargetLowering &TLI); 3948 3949 /// Given a sign/zero extend instruction \p Ext, return the appropriate 3950 /// action to promote the operand of \p Ext instead of using Ext. 3951 /// \return NULL if no promotable action is possible with the current 3952 /// sign extension. 3953 /// \p InsertedInsts keeps track of all the instructions inserted by the 3954 /// other CodeGenPrepare optimizations. This information is important 3955 /// because we do not want to promote these instructions as CodeGenPrepare 3956 /// will reinsert them later. Thus creating an infinite loop: create/remove. 3957 /// \p PromotedInsts maps the instructions to their type before promotion. 3958 static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts, 3959 const TargetLowering &TLI, 3960 const InstrToOrigTy &PromotedInsts); 3961 }; 3962 3963 } // end anonymous namespace 3964 3965 bool TypePromotionHelper::canGetThrough(const Instruction *Inst, 3966 Type *ConsideredExtType, 3967 const InstrToOrigTy &PromotedInsts, 3968 bool IsSExt) { 3969 // The promotion helper does not know how to deal with vector types yet. 3970 // To be able to fix that, we would need to fix the places where we 3971 // statically extend, e.g., constants and such. 3972 if (Inst->getType()->isVectorTy()) 3973 return false; 3974 3975 // We can always get through zext. 3976 if (isa<ZExtInst>(Inst)) 3977 return true; 3978 3979 // sext(sext) is ok too. 3980 if (IsSExt && isa<SExtInst>(Inst)) 3981 return true; 3982 3983 // We can get through binary operator, if it is legal. In other words, the 3984 // binary operator must have a nuw or nsw flag. 3985 const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 3986 if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) && 3987 ((!IsSExt && BinOp->hasNoUnsignedWrap()) || 3988 (IsSExt && BinOp->hasNoSignedWrap()))) 3989 return true; 3990 3991 // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst)) 3992 if ((Inst->getOpcode() == Instruction::And || 3993 Inst->getOpcode() == Instruction::Or)) 3994 return true; 3995 3996 // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst)) 3997 if (Inst->getOpcode() == Instruction::Xor) { 3998 const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)); 3999 // Make sure it is not a NOT. 4000 if (Cst && !Cst->getValue().isAllOnesValue()) 4001 return true; 4002 } 4003 4004 // zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst)) 4005 // It may change a poisoned value into a regular value, like 4006 // zext i32 (shrl i8 %val, 12) --> shrl i32 (zext i8 %val), 12 4007 // poisoned value regular value 4008 // It should be OK since undef covers valid value. 
4009 if (Inst->getOpcode() == Instruction::LShr && !IsSExt) 4010 return true; 4011 4012 // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst) 4013 // It may change a poisoned value into a regular value, like 4014 // zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 12 4015 // poisoned value regular value 4016 // It should be OK since undef covers valid value. 4017 if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) { 4018 const auto *ExtInst = cast<const Instruction>(*Inst->user_begin()); 4019 if (ExtInst->hasOneUse()) { 4020 const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin()); 4021 if (AndInst && AndInst->getOpcode() == Instruction::And) { 4022 const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1)); 4023 if (Cst && 4024 Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth())) 4025 return true; 4026 } 4027 } 4028 } 4029 4030 // Check if we can do the following simplification. 4031 // ext(trunc(opnd)) --> ext(opnd) 4032 if (!isa<TruncInst>(Inst)) 4033 return false; 4034 4035 Value *OpndVal = Inst->getOperand(0); 4036 // Check if we can use this operand in the extension. 4037 // If the type is larger than the result type of the extension, we cannot. 4038 if (!OpndVal->getType()->isIntegerTy() || 4039 OpndVal->getType()->getIntegerBitWidth() > 4040 ConsideredExtType->getIntegerBitWidth()) 4041 return false; 4042 4043 // If the operand of the truncate is not an instruction, we will not have 4044 // any information on the dropped bits. 4045 // (Actually we could for constant but it is not worth the extra logic). 4046 Instruction *Opnd = dyn_cast<Instruction>(OpndVal); 4047 if (!Opnd) 4048 return false; 4049 4050 // Check if the source of the type is narrow enough. 4051 // I.e., check that trunc just drops extended bits of the same kind of 4052 // the extension. 4053 // #1 get the type of the operand and check the kind of the extended bits. 4054 const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt); 4055 if (OpndType) 4056 ; 4057 else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd))) 4058 OpndType = Opnd->getOperand(0)->getType(); 4059 else 4060 return false; 4061 4062 // #2 check that the truncate just drops extended bits. 4063 return Inst->getType()->getIntegerBitWidth() >= 4064 OpndType->getIntegerBitWidth(); 4065 } 4066 4067 TypePromotionHelper::Action TypePromotionHelper::getAction( 4068 Instruction *Ext, const SetOfInstrs &InsertedInsts, 4069 const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) { 4070 assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 4071 "Unexpected instruction type"); 4072 Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0)); 4073 Type *ExtTy = Ext->getType(); 4074 bool IsSExt = isa<SExtInst>(Ext); 4075 // If the operand of the extension is not an instruction, we cannot 4076 // get through. 4077 // If it, check we can get through. 4078 if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt)) 4079 return nullptr; 4080 4081 // Do not promote if the operand has been added by codegenprepare. 4082 // Otherwise, it means we are undoing an optimization that is likely to be 4083 // redone, thus causing potential infinite loop. 4084 if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd)) 4085 return nullptr; 4086 4087 // SExt or Trunc instructions. 4088 // Return the related handler. 
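  // Informal dispatch summary:
  //   ext(trunc/sext/zext x)            --> promoteOperandForTruncAndAnyExt
  //   ext(other promotable instruction) --> signExtendOperandForOther or
  //                                         zeroExtendOperandForOther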
4089 if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) || 4090 isa<ZExtInst>(ExtOpnd)) 4091 return promoteOperandForTruncAndAnyExt; 4092 4093 // Regular instruction. 4094 // Abort early if we will have to insert non-free instructions. 4095 if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType())) 4096 return nullptr; 4097 return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther; 4098 } 4099 4100 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( 4101 Instruction *SExt, TypePromotionTransaction &TPT, 4102 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 4103 SmallVectorImpl<Instruction *> *Exts, 4104 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 4105 // By construction, the operand of SExt is an instruction. Otherwise we cannot 4106 // get through it and this method should not be called. 4107 Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); 4108 Value *ExtVal = SExt; 4109 bool HasMergedNonFreeExt = false; 4110 if (isa<ZExtInst>(SExtOpnd)) { 4111 // Replace s|zext(zext(opnd)) 4112 // => zext(opnd). 4113 HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd); 4114 Value *ZExt = 4115 TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); 4116 TPT.replaceAllUsesWith(SExt, ZExt); 4117 TPT.eraseInstruction(SExt); 4118 ExtVal = ZExt; 4119 } else { 4120 // Replace z|sext(trunc(opnd)) or sext(sext(opnd)) 4121 // => z|sext(opnd). 4122 TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); 4123 } 4124 CreatedInstsCost = 0; 4125 4126 // Remove dead code. 4127 if (SExtOpnd->use_empty()) 4128 TPT.eraseInstruction(SExtOpnd); 4129 4130 // Check if the extension is still needed. 4131 Instruction *ExtInst = dyn_cast<Instruction>(ExtVal); 4132 if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { 4133 if (ExtInst) { 4134 if (Exts) 4135 Exts->push_back(ExtInst); 4136 CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt; 4137 } 4138 return ExtVal; 4139 } 4140 4141 // At this point we have: ext ty opnd to ty. 4142 // Reassign the uses of ExtInst to the opnd and remove ExtInst. 4143 Value *NextVal = ExtInst->getOperand(0); 4144 TPT.eraseInstruction(ExtInst, NextVal); 4145 return NextVal; 4146 } 4147 4148 Value *TypePromotionHelper::promoteOperandForOther( 4149 Instruction *Ext, TypePromotionTransaction &TPT, 4150 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 4151 SmallVectorImpl<Instruction *> *Exts, 4152 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI, 4153 bool IsSExt) { 4154 // By construction, the operand of Ext is an instruction. Otherwise we cannot 4155 // get through it and this method should not be called. 4156 Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0)); 4157 CreatedInstsCost = 0; 4158 if (!ExtOpnd->hasOneUse()) { 4159 // ExtOpnd will be promoted. 4160 // All its uses, but Ext, will need to use a truncated value of the 4161 // promoted version. 4162 // Create the truncate now. 4163 Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType()); 4164 if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) { 4165 // Insert it just after the definition. 4166 ITrunc->moveAfter(ExtOpnd); 4167 if (Truncs) 4168 Truncs->push_back(ITrunc); 4169 } 4170 4171 TPT.replaceAllUsesWith(ExtOpnd, Trunc); 4172 // Restore the operand of Ext (which has been replaced by the previous call 4173 // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. 4174 TPT.setOperand(Ext, 0, ExtOpnd); 4175 } 4176 4177 // Get through the Instruction: 4178 // 1. 
Update its type. 4179 // 2. Replace the uses of Ext by Inst. 4180 // 3. Extend each operand that needs to be extended. 4181 4182 // Remember the original type of the instruction before promotion. 4183 // This is useful to know that the high bits are sign extended bits. 4184 addPromotedInst(PromotedInsts, ExtOpnd, IsSExt); 4185 // Step #1. 4186 TPT.mutateType(ExtOpnd, Ext->getType()); 4187 // Step #2. 4188 TPT.replaceAllUsesWith(Ext, ExtOpnd); 4189 // Step #3. 4190 Instruction *ExtForOpnd = Ext; 4191 4192 LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n"); 4193 for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx; 4194 ++OpIdx) { 4195 LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n'); 4196 if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() || 4197 !shouldExtOperand(ExtOpnd, OpIdx)) { 4198 LLVM_DEBUG(dbgs() << "No need to propagate\n"); 4199 continue; 4200 } 4201 // Check if we can statically extend the operand. 4202 Value *Opnd = ExtOpnd->getOperand(OpIdx); 4203 if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) { 4204 LLVM_DEBUG(dbgs() << "Statically extend\n"); 4205 unsigned BitWidth = Ext->getType()->getIntegerBitWidth(); 4206 APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth) 4207 : Cst->getValue().zext(BitWidth); 4208 TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal)); 4209 continue; 4210 } 4211 // UndefValue are typed, so we have to statically sign extend them. 4212 if (isa<UndefValue>(Opnd)) { 4213 LLVM_DEBUG(dbgs() << "Statically extend\n"); 4214 TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType())); 4215 continue; 4216 } 4217 4218 // Otherwise we have to explicitly sign extend the operand. 4219 // Check if Ext was reused to extend an operand. 4220 if (!ExtForOpnd) { 4221 // If yes, create a new one. 4222 LLVM_DEBUG(dbgs() << "More operands to ext\n"); 4223 Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) 4224 : TPT.createZExt(Ext, Opnd, Ext->getType()); 4225 if (!isa<Instruction>(ValForExtOpnd)) { 4226 TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); 4227 continue; 4228 } 4229 ExtForOpnd = cast<Instruction>(ValForExtOpnd); 4230 } 4231 if (Exts) 4232 Exts->push_back(ExtForOpnd); 4233 TPT.setOperand(ExtForOpnd, 0, Opnd); 4234 4235 // Move the sign extension before the insertion point. 4236 TPT.moveBefore(ExtForOpnd, ExtOpnd); 4237 TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); 4238 CreatedInstsCost += !TLI.isExtFree(ExtForOpnd); 4239 // If more sext are required, new instructions will have to be created. 4240 ExtForOpnd = nullptr; 4241 } 4242 if (ExtForOpnd == Ext) { 4243 LLVM_DEBUG(dbgs() << "Extension is useless now\n"); 4244 TPT.eraseInstruction(Ext); 4245 } 4246 return ExtOpnd; 4247 } 4248 4249 /// Check whether or not promoting an instruction to a wider type is profitable. 4250 /// \p NewCost gives the cost of extension instructions created by the 4251 /// promotion. 4252 /// \p OldCost gives the cost of extension instructions before the promotion 4253 /// plus the number of instructions that have been 4254 /// matched in the addressing mode the promotion. 4255 /// \p PromotedOperand is the value that has been promoted. 4256 /// \return True if the promotion is profitable, false otherwise. 
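/// For instance (illustrative numbers): if the promotion created one non-free
/// extension, NewCost is 1; if the original extension was non-free (cost 1)
/// and one extra instruction became foldable into the addressing mode,
/// OldCost is 2, so the promotion is considered profitable.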
4257 bool AddressingModeMatcher::isPromotionProfitable( 4258 unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const { 4259 LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost 4260 << '\n'); 4261 // The cost of the new extensions is greater than the cost of the 4262 // old extension plus what we folded. 4263 // This is not profitable. 4264 if (NewCost > OldCost) 4265 return false; 4266 if (NewCost < OldCost) 4267 return true; 4268 // The promotion is neutral but it may help folding the sign extension in 4269 // loads for instance. 4270 // Check that we did not create an illegal instruction. 4271 return isPromotedInstructionLegal(TLI, DL, PromotedOperand); 4272 } 4273 4274 /// Given an instruction or constant expr, see if we can fold the operation 4275 /// into the addressing mode. If so, update the addressing mode and return 4276 /// true, otherwise return false without modifying AddrMode. 4277 /// If \p MovedAway is not NULL, it contains the information of whether or 4278 /// not AddrInst has to be folded into the addressing mode on success. 4279 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing 4280 /// because it has been moved away. 4281 /// Thus AddrInst must not be added in the matched instructions. 4282 /// This state can happen when AddrInst is a sext, since it may be moved away. 4283 /// Therefore, AddrInst may not be valid when MovedAway is true and it must 4284 /// not be referenced anymore. 4285 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, 4286 unsigned Depth, 4287 bool *MovedAway) { 4288 // Avoid exponential behavior on extremely deep expression trees. 4289 if (Depth >= 5) return false; 4290 4291 // By default, all matched instructions stay in place. 4292 if (MovedAway) 4293 *MovedAway = false; 4294 4295 switch (Opcode) { 4296 case Instruction::PtrToInt: 4297 // PtrToInt is always a noop, as we know that the int type is pointer sized. 4298 return matchAddr(AddrInst->getOperand(0), Depth); 4299 case Instruction::IntToPtr: { 4300 auto AS = AddrInst->getType()->getPointerAddressSpace(); 4301 auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); 4302 // This inttoptr is a no-op if the integer type is pointer sized. 4303 if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy) 4304 return matchAddr(AddrInst->getOperand(0), Depth); 4305 return false; 4306 } 4307 case Instruction::BitCast: 4308 // BitCast is always a noop, and we can handle it as long as it is 4309 // int->int or pointer->pointer (we don't want int<->fp or something). 4310 if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() && 4311 // Don't touch identity bitcasts. These were probably put here by LSR, 4312 // and we don't want to mess around with them. Assume it knows what it 4313 // is doing. 4314 AddrInst->getOperand(0)->getType() != AddrInst->getType()) 4315 return matchAddr(AddrInst->getOperand(0), Depth); 4316 return false; 4317 case Instruction::AddrSpaceCast: { 4318 unsigned SrcAS 4319 = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); 4320 unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); 4321 if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) 4322 return matchAddr(AddrInst->getOperand(0), Depth); 4323 return false; 4324 } 4325 case Instruction::Add: { 4326 // Check to see if we can merge in the RHS then the LHS. If so, we win. 4327 ExtAddrMode BackupAddrMode = AddrMode; 4328 unsigned OldSize = AddrModeInsts.size(); 4329 // Start a transaction at this point. 
4330 // The LHS may match but not the RHS. 4331 // Therefore, we need a higher level restoration point to undo partially 4332 // matched operation. 4333 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4334 TPT.getRestorationPoint(); 4335 4336 AddrMode.InBounds = false; 4337 if (matchAddr(AddrInst->getOperand(1), Depth+1) && 4338 matchAddr(AddrInst->getOperand(0), Depth+1)) 4339 return true; 4340 4341 // Restore the old addr mode info. 4342 AddrMode = BackupAddrMode; 4343 AddrModeInsts.resize(OldSize); 4344 TPT.rollback(LastKnownGood); 4345 4346 // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. 4347 if (matchAddr(AddrInst->getOperand(0), Depth+1) && 4348 matchAddr(AddrInst->getOperand(1), Depth+1)) 4349 return true; 4350 4351 // Otherwise we definitely can't merge the ADD in. 4352 AddrMode = BackupAddrMode; 4353 AddrModeInsts.resize(OldSize); 4354 TPT.rollback(LastKnownGood); 4355 break; 4356 } 4357 //case Instruction::Or: 4358 // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. 4359 //break; 4360 case Instruction::Mul: 4361 case Instruction::Shl: { 4362 // Can only handle X*C and X << C. 4363 AddrMode.InBounds = false; 4364 ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); 4365 if (!RHS || RHS->getBitWidth() > 64) 4366 return false; 4367 int64_t Scale = RHS->getSExtValue(); 4368 if (Opcode == Instruction::Shl) 4369 Scale = 1LL << Scale; 4370 4371 return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); 4372 } 4373 case Instruction::GetElementPtr: { 4374 // Scan the GEP. We check it if it contains constant offsets and at most 4375 // one variable offset. 4376 int VariableOperand = -1; 4377 unsigned VariableScale = 0; 4378 4379 int64_t ConstantOffset = 0; 4380 gep_type_iterator GTI = gep_type_begin(AddrInst); 4381 for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { 4382 if (StructType *STy = GTI.getStructTypeOrNull()) { 4383 const StructLayout *SL = DL.getStructLayout(STy); 4384 unsigned Idx = 4385 cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); 4386 ConstantOffset += SL->getElementOffset(Idx); 4387 } else { 4388 TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType()); 4389 if (TS.isNonZero()) { 4390 // The optimisations below currently only work for fixed offsets. 4391 if (TS.isScalable()) 4392 return false; 4393 int64_t TypeSize = TS.getFixedSize(); 4394 if (ConstantInt *CI = 4395 dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { 4396 const APInt &CVal = CI->getValue(); 4397 if (CVal.getMinSignedBits() <= 64) { 4398 ConstantOffset += CVal.getSExtValue() * TypeSize; 4399 continue; 4400 } 4401 } 4402 // We only allow one variable index at the moment. 4403 if (VariableOperand != -1) 4404 return false; 4405 4406 // Remember the variable index. 4407 VariableOperand = i; 4408 VariableScale = TypeSize; 4409 } 4410 } 4411 } 4412 4413 // A common case is for the GEP to only do a constant offset. In this case, 4414 // just add it to the disp field and check validity. 4415 if (VariableOperand == -1) { 4416 AddrMode.BaseOffs += ConstantOffset; 4417 if (ConstantOffset == 0 || 4418 TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) { 4419 // Check to see if we can fold the base pointer in too. 
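        // (Illustrative: for a GEP such as "gep inbounds %S, %S* %p, i64 0, i32 2"
        //  where field 2 happens to sit at byte offset 8, ConstantOffset is 8,
        //  there is no variable index, and the recursive call below tries to
        //  fold %p itself in as the base of the addressing mode.)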
4420 if (matchAddr(AddrInst->getOperand(0), Depth+1)) { 4421 if (!cast<GEPOperator>(AddrInst)->isInBounds()) 4422 AddrMode.InBounds = false; 4423 return true; 4424 } 4425 } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) && 4426 TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 && 4427 ConstantOffset > 0) { 4428 // Record GEPs with non-zero offsets as candidates for splitting in the 4429 // event that the offset cannot fit into the r+i addressing mode. 4430 // Simple and common case that only one GEP is used in calculating the 4431 // address for the memory access. 4432 Value *Base = AddrInst->getOperand(0); 4433 auto *BaseI = dyn_cast<Instruction>(Base); 4434 auto *GEP = cast<GetElementPtrInst>(AddrInst); 4435 if (isa<Argument>(Base) || isa<GlobalValue>(Base) || 4436 (BaseI && !isa<CastInst>(BaseI) && 4437 !isa<GetElementPtrInst>(BaseI))) { 4438 // Make sure the parent block allows inserting non-PHI instructions 4439 // before the terminator. 4440 BasicBlock *Parent = 4441 BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock(); 4442 if (!Parent->getTerminator()->isEHPad()) 4443 LargeOffsetGEP = std::make_pair(GEP, ConstantOffset); 4444 } 4445 } 4446 AddrMode.BaseOffs -= ConstantOffset; 4447 return false; 4448 } 4449 4450 // Save the valid addressing mode in case we can't match. 4451 ExtAddrMode BackupAddrMode = AddrMode; 4452 unsigned OldSize = AddrModeInsts.size(); 4453 4454 // See if the scale and offset amount is valid for this target. 4455 AddrMode.BaseOffs += ConstantOffset; 4456 if (!cast<GEPOperator>(AddrInst)->isInBounds()) 4457 AddrMode.InBounds = false; 4458 4459 // Match the base operand of the GEP. 4460 if (!matchAddr(AddrInst->getOperand(0), Depth+1)) { 4461 // If it couldn't be matched, just stuff the value in a register. 4462 if (AddrMode.HasBaseReg) { 4463 AddrMode = BackupAddrMode; 4464 AddrModeInsts.resize(OldSize); 4465 return false; 4466 } 4467 AddrMode.HasBaseReg = true; 4468 AddrMode.BaseReg = AddrInst->getOperand(0); 4469 } 4470 4471 // Match the remaining variable portion of the GEP. 4472 if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, 4473 Depth)) { 4474 // If it couldn't be matched, try stuffing the base into a register 4475 // instead of matching it, and retrying the match of the scale. 4476 AddrMode = BackupAddrMode; 4477 AddrModeInsts.resize(OldSize); 4478 if (AddrMode.HasBaseReg) 4479 return false; 4480 AddrMode.HasBaseReg = true; 4481 AddrMode.BaseReg = AddrInst->getOperand(0); 4482 AddrMode.BaseOffs += ConstantOffset; 4483 if (!matchScaledValue(AddrInst->getOperand(VariableOperand), 4484 VariableScale, Depth)) { 4485 // If even that didn't work, bail. 4486 AddrMode = BackupAddrMode; 4487 AddrModeInsts.resize(OldSize); 4488 return false; 4489 } 4490 } 4491 4492 return true; 4493 } 4494 case Instruction::SExt: 4495 case Instruction::ZExt: { 4496 Instruction *Ext = dyn_cast<Instruction>(AddrInst); 4497 if (!Ext) 4498 return false; 4499 4500 // Try to move this ext out of the way of the addressing mode. 4501 // Ask for a method for doing so. 4502 TypePromotionHelper::Action TPH = 4503 TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts); 4504 if (!TPH) 4505 return false; 4506 4507 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4508 TPT.getRestorationPoint(); 4509 unsigned CreatedInstsCost = 0; 4510 unsigned ExtCost = !TLI.isExtFree(Ext); 4511 Value *PromotedOperand = 4512 TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI); 4513 // SExt has been moved away. 
4514 // Thus either it will be rematched later in the recursive calls or it is 4515 // gone. Anyway, we must not fold it into the addressing mode at this point. 4516 // E.g., 4517 // op = add opnd, 1 4518 // idx = ext op 4519 // addr = gep base, idx 4520 // is now: 4521 // promotedOpnd = ext opnd <- no match here 4522 // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls) 4523 // addr = gep base, op <- match 4524 if (MovedAway) 4525 *MovedAway = true; 4526 4527 assert(PromotedOperand && 4528 "TypePromotionHelper should have filtered out those cases"); 4529 4530 ExtAddrMode BackupAddrMode = AddrMode; 4531 unsigned OldSize = AddrModeInsts.size(); 4532 4533 if (!matchAddr(PromotedOperand, Depth) || 4534 // The total of the new cost is equal to the cost of the created 4535 // instructions. 4536 // The total of the old cost is equal to the cost of the extension plus 4537 // what we have saved in the addressing mode. 4538 !isPromotionProfitable(CreatedInstsCost, 4539 ExtCost + (AddrModeInsts.size() - OldSize), 4540 PromotedOperand)) { 4541 AddrMode = BackupAddrMode; 4542 AddrModeInsts.resize(OldSize); 4543 LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n"); 4544 TPT.rollback(LastKnownGood); 4545 return false; 4546 } 4547 return true; 4548 } 4549 } 4550 return false; 4551 } 4552 4553 /// If we can, try to add the value of 'Addr' into the current addressing mode. 4554 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode 4555 /// unmodified. This assumes that Addr is either a pointer type or intptr_t 4556 /// for the target. 4557 /// 4558 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { 4559 // Start a transaction at this point that we will rollback if the matching 4560 // fails. 4561 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4562 TPT.getRestorationPoint(); 4563 if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { 4564 if (CI->getValue().isSignedIntN(64)) { 4565 // Fold in immediates if legal for the target. 4566 AddrMode.BaseOffs += CI->getSExtValue(); 4567 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4568 return true; 4569 AddrMode.BaseOffs -= CI->getSExtValue(); 4570 } 4571 } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { 4572 // If this is a global variable, try to fold it into the addressing mode. 4573 if (!AddrMode.BaseGV) { 4574 AddrMode.BaseGV = GV; 4575 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4576 return true; 4577 AddrMode.BaseGV = nullptr; 4578 } 4579 } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { 4580 ExtAddrMode BackupAddrMode = AddrMode; 4581 unsigned OldSize = AddrModeInsts.size(); 4582 4583 // Check to see if it is possible to fold this operation. 4584 bool MovedAway = false; 4585 if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { 4586 // This instruction may have been moved away. If so, there is nothing 4587 // to check here. 4588 if (MovedAway) 4589 return true; 4590 // Okay, it's possible to fold this. Check to see if it is actually 4591 // *profitable* to do so. We use a simple cost model to avoid increasing 4592 // register pressure too much. 4593 if (I->hasOneUse() || 4594 isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { 4595 AddrModeInsts.push_back(I); 4596 return true; 4597 } 4598 4599 // It isn't profitable to do this, roll back. 
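      // (Note: the rollback below also undoes any type promotions performed
      //  while exploring this operand, since LastKnownGood was captured before
      //  matchOperationAddr ran.)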
4600 //cerr << "NOT FOLDING: " << *I; 4601 AddrMode = BackupAddrMode; 4602 AddrModeInsts.resize(OldSize); 4603 TPT.rollback(LastKnownGood); 4604 } 4605 } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { 4606 if (matchOperationAddr(CE, CE->getOpcode(), Depth)) 4607 return true; 4608 TPT.rollback(LastKnownGood); 4609 } else if (isa<ConstantPointerNull>(Addr)) { 4610 // Null pointer gets folded without affecting the addressing mode. 4611 return true; 4612 } 4613 4614 // Worse case, the target should support [reg] addressing modes. :) 4615 if (!AddrMode.HasBaseReg) { 4616 AddrMode.HasBaseReg = true; 4617 AddrMode.BaseReg = Addr; 4618 // Still check for legality in case the target supports [imm] but not [i+r]. 4619 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4620 return true; 4621 AddrMode.HasBaseReg = false; 4622 AddrMode.BaseReg = nullptr; 4623 } 4624 4625 // If the base register is already taken, see if we can do [r+r]. 4626 if (AddrMode.Scale == 0) { 4627 AddrMode.Scale = 1; 4628 AddrMode.ScaledReg = Addr; 4629 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4630 return true; 4631 AddrMode.Scale = 0; 4632 AddrMode.ScaledReg = nullptr; 4633 } 4634 // Couldn't match. 4635 TPT.rollback(LastKnownGood); 4636 return false; 4637 } 4638 4639 /// Check to see if all uses of OpVal by the specified inline asm call are due 4640 /// to memory operands. If so, return true, otherwise return false. 4641 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, 4642 const TargetLowering &TLI, 4643 const TargetRegisterInfo &TRI) { 4644 const Function *F = CI->getFunction(); 4645 TargetLowering::AsmOperandInfoVector TargetConstraints = 4646 TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI); 4647 4648 for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { 4649 TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; 4650 4651 // Compute the constraint code and ConstraintType to use. 4652 TLI.ComputeConstraintToUse(OpInfo, SDValue()); 4653 4654 // If this asm operand is our Value*, and if it isn't an indirect memory 4655 // operand, we can't fold it! 4656 if (OpInfo.CallOperandVal == OpVal && 4657 (OpInfo.ConstraintType != TargetLowering::C_Memory || 4658 !OpInfo.isIndirect)) 4659 return false; 4660 } 4661 4662 return true; 4663 } 4664 4665 // Max number of memory uses to look at before aborting the search to conserve 4666 // compile time. 4667 static constexpr int MaxMemoryUsesToScan = 20; 4668 4669 /// Recursively walk all the uses of I until we find a memory use. 4670 /// If we find an obviously non-foldable instruction, return true. 4671 /// Add the ultimately found memory instructions to MemoryUses. 4672 static bool FindAllMemoryUses( 4673 Instruction *I, 4674 SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, 4675 SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI, 4676 const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI, 4677 BlockFrequencyInfo *BFI, int SeenInsts = 0) { 4678 // If we already considered this instruction, we're done. 4679 if (!ConsideredInsts.insert(I).second) 4680 return false; 4681 4682 // If this is an obviously unfoldable instruction, bail out. 4683 if (!MightBeFoldableInst(I)) 4684 return true; 4685 4686 // Loop over all the uses, recursively processing them. 4687 for (Use &U : I->uses()) { 4688 // Conservatively return true if we're seeing a large number or a deep chain 4689 // of users. 
This avoids excessive compilation times in pathological cases. 4690 if (SeenInsts++ >= MaxMemoryUsesToScan) 4691 return true; 4692 4693 Instruction *UserI = cast<Instruction>(U.getUser()); 4694 if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) { 4695 MemoryUses.push_back(std::make_pair(LI, U.getOperandNo())); 4696 continue; 4697 } 4698 4699 if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { 4700 unsigned opNo = U.getOperandNo(); 4701 if (opNo != StoreInst::getPointerOperandIndex()) 4702 return true; // Storing addr, not into addr. 4703 MemoryUses.push_back(std::make_pair(SI, opNo)); 4704 continue; 4705 } 4706 4707 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { 4708 unsigned opNo = U.getOperandNo(); 4709 if (opNo != AtomicRMWInst::getPointerOperandIndex()) 4710 return true; // Storing addr, not into addr. 4711 MemoryUses.push_back(std::make_pair(RMW, opNo)); 4712 continue; 4713 } 4714 4715 if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { 4716 unsigned opNo = U.getOperandNo(); 4717 if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) 4718 return true; // Storing addr, not into addr. 4719 MemoryUses.push_back(std::make_pair(CmpX, opNo)); 4720 continue; 4721 } 4722 4723 if (CallInst *CI = dyn_cast<CallInst>(UserI)) { 4724 if (CI->hasFnAttr(Attribute::Cold)) { 4725 // If this is a cold call, we can sink the addressing calculation into 4726 // the cold path. See optimizeCallInst 4727 bool OptForSize = OptSize || 4728 llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); 4729 if (!OptForSize) 4730 continue; 4731 } 4732 4733 InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand()); 4734 if (!IA) return true; 4735 4736 // If this is a memory operand, we're cool, otherwise bail out. 4737 if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) 4738 return true; 4739 continue; 4740 } 4741 4742 if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, 4743 PSI, BFI, SeenInsts)) 4744 return true; 4745 } 4746 4747 return false; 4748 } 4749 4750 /// Return true if Val is already known to be live at the use site that we're 4751 /// folding it into. If so, there is no cost to include it in the addressing 4752 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the 4753 /// instruction already. 4754 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, 4755 Value *KnownLive2) { 4756 // If Val is either of the known-live values, we know it is live! 4757 if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) 4758 return true; 4759 4760 // All values other than instructions and arguments (e.g. constants) are live. 4761 if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; 4762 4763 // If Val is a constant sized alloca in the entry block, it is live, this is 4764 // true because it is just a reference to the stack/frame pointer, which is 4765 // live for the whole function. 4766 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) 4767 if (AI->isStaticAlloca()) 4768 return true; 4769 4770 // Check to see if this value is already used in the memory instruction's 4771 // block. If so, it's already live into the block at the very least, so we 4772 // can reasonably fold it. 4773 return Val->isUsedInBasicBlock(MemoryInst->getParent()); 4774 } 4775 4776 /// It is possible for the addressing mode of the machine to fold the specified 4777 /// instruction into a load or store that ultimately uses it. 4778 /// However, the specified instruction has multiple uses. 
4779 /// Given this, it may actually increase register pressure to fold it 4780 /// into the load. For example, consider this code: 4781 /// 4782 /// X = ... 4783 /// Y = X+1 4784 /// use(Y) -> nonload/store 4785 /// Z = Y+1 4786 /// load Z 4787 /// 4788 /// In this case, Y has multiple uses, and can be folded into the load of Z 4789 /// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to 4790 /// be live at the use(Y) line. If we don't fold Y into load Z, we use one 4791 /// fewer register. Since Y can't be folded into "use(Y)" we don't increase the 4792 /// number of computations either. 4793 /// 4794 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If 4795 /// X was live across 'load Z' for other reasons, we actually *would* want to 4796 /// fold the addressing mode in the Z case. This would make Y die earlier. 4797 bool AddressingModeMatcher:: 4798 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, 4799 ExtAddrMode &AMAfter) { 4800 if (IgnoreProfitability) return true; 4801 4802 // AMBefore is the addressing mode before this instruction was folded into it, 4803 // and AMAfter is the addressing mode after the instruction was folded. Get 4804 // the set of registers referenced by AMAfter and subtract out those 4805 // referenced by AMBefore: this is the set of values which folding in this 4806 // address extends the lifetime of. 4807 // 4808 // Note that there are only two potential values being referenced here, 4809 // BaseReg and ScaleReg (global addresses are always available, as are any 4810 // folded immediates). 4811 Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; 4812 4813 // If the BaseReg or ScaledReg was referenced by the previous addrmode, their 4814 // lifetime wasn't extended by adding this instruction. 4815 if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) 4816 BaseReg = nullptr; 4817 if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) 4818 ScaledReg = nullptr; 4819 4820 // If folding this instruction (and it's subexprs) didn't extend any live 4821 // ranges, we're ok with it. 4822 if (!BaseReg && !ScaledReg) 4823 return true; 4824 4825 // If all uses of this instruction can have the address mode sunk into them, 4826 // we can remove the addressing mode and effectively trade one live register 4827 // for another (at worst.) In this context, folding an addressing mode into 4828 // the use is just a particularly nice way of sinking it. 4829 SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; 4830 SmallPtrSet<Instruction*, 16> ConsideredInsts; 4831 if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, 4832 PSI, BFI)) 4833 return false; // Has a non-memory, non-foldable use! 4834 4835 // Now that we know that all uses of this instruction are part of a chain of 4836 // computation involving only operations that could theoretically be folded 4837 // into a memory use, loop over each of these memory operation uses and see 4838 // if they could *actually* fold the instruction. The assumption is that 4839 // addressing modes are cheap and that duplicating the computation involved 4840 // many times is worthwhile, even on a fastpath. For sinking candidates 4841 // (i.e. cold call sites), this serves as a way to prevent excessive code 4842 // growth since most architectures have some reasonable small and fast way to 4843 // compute an effective address. 
(i.e LEA on x86) 4844 SmallVector<Instruction*, 32> MatchedAddrModeInsts; 4845 for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { 4846 Instruction *User = MemoryUses[i].first; 4847 unsigned OpNo = MemoryUses[i].second; 4848 4849 // Get the access type of this use. If the use isn't a pointer, we don't 4850 // know what it accesses. 4851 Value *Address = User->getOperand(OpNo); 4852 PointerType *AddrTy = dyn_cast<PointerType>(Address->getType()); 4853 if (!AddrTy) 4854 return false; 4855 Type *AddressAccessTy = AddrTy->getElementType(); 4856 unsigned AS = AddrTy->getAddressSpace(); 4857 4858 // Do a match against the root of this address, ignoring profitability. This 4859 // will tell us if the addressing mode for the memory operation will 4860 // *actually* cover the shared instruction. 4861 ExtAddrMode Result; 4862 std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr, 4863 0); 4864 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4865 TPT.getRestorationPoint(); 4866 AddressingModeMatcher Matcher( 4867 MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, 4868 InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); 4869 Matcher.IgnoreProfitability = true; 4870 bool Success = Matcher.matchAddr(Address, 0); 4871 (void)Success; assert(Success && "Couldn't select *anything*?"); 4872 4873 // The match was to check the profitability, the changes made are not 4874 // part of the original matcher. Therefore, they should be dropped 4875 // otherwise the original matcher will not present the right state. 4876 TPT.rollback(LastKnownGood); 4877 4878 // If the match didn't cover I, then it won't be shared by it. 4879 if (!is_contained(MatchedAddrModeInsts, I)) 4880 return false; 4881 4882 MatchedAddrModeInsts.clear(); 4883 } 4884 4885 return true; 4886 } 4887 4888 /// Return true if the specified values are defined in a 4889 /// different basic block than BB. 4890 static bool IsNonLocalValue(Value *V, BasicBlock *BB) { 4891 if (Instruction *I = dyn_cast<Instruction>(V)) 4892 return I->getParent() != BB; 4893 return false; 4894 } 4895 4896 /// Sink addressing mode computation immediate before MemoryInst if doing so 4897 /// can be done without increasing register pressure. The need for the 4898 /// register pressure constraint means this can end up being an all or nothing 4899 /// decision for all uses of the same addressing computation. 4900 /// 4901 /// Load and Store Instructions often have addressing modes that can do 4902 /// significant amounts of computation. As such, instruction selection will try 4903 /// to get the load or store to do as much computation as possible for the 4904 /// program. The problem is that isel can only see within a single block. As 4905 /// such, we sink as much legal addressing mode work into the block as possible. 4906 /// 4907 /// This method is used to optimize both load/store and inline asms with memory 4908 /// operands. It's also used to sink addressing computations feeding into cold 4909 /// call sites into their (cold) basic block. 4910 /// 4911 /// The motivation for handling sinking into cold blocks is that doing so can 4912 /// both enable other address mode sinking (by satisfying the register pressure 4913 /// constraint above), and reduce register pressure globally (by removing the 4914 /// addressing mode computation from the fast path entirely.). 
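///
/// As a rough illustration (hypothetical, simplified IR; the exact shape of
/// the sunk address depends on the target and on the GEP-vs-inttoptr
/// strategies below):
/// \code
///   entry:
///     %addr = getelementptr i32, i32* %base, i64 %idx
///     br i1 %cond, label %if.then, label %exit
///   if.then:
///     %v = load i32, i32* %addr
/// \endcode
/// becomes:
/// \code
///   if.then:
///     %sunkaddr = getelementptr i32, i32* %base, i64 %idx
///     %v = load i32, i32* %sunkaddr
/// \endcode
/// so that instruction selection can fold the whole address computation into
/// the load's addressing mode.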
4915 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, 4916 Type *AccessTy, unsigned AddrSpace) { 4917 Value *Repl = Addr; 4918 4919 // Try to collapse single-value PHI nodes. This is necessary to undo 4920 // unprofitable PRE transformations. 4921 SmallVector<Value*, 8> worklist; 4922 SmallPtrSet<Value*, 16> Visited; 4923 worklist.push_back(Addr); 4924 4925 // Use a worklist to iteratively look through PHI and select nodes, and 4926 // ensure that the addressing modes obtained from the non-PHI/select roots of 4927 // the graph are compatible. 4928 bool PhiOrSelectSeen = false; 4929 SmallVector<Instruction*, 16> AddrModeInsts; 4930 const SimplifyQuery SQ(*DL, TLInfo); 4931 AddressingModeCombiner AddrModes(SQ, Addr); 4932 TypePromotionTransaction TPT(RemovedInsts); 4933 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4934 TPT.getRestorationPoint(); 4935 while (!worklist.empty()) { 4936 Value *V = worklist.back(); 4937 worklist.pop_back(); 4938 4939 // We allow traversing cyclic Phi nodes. 4940 // In case of success after this loop we ensure that traversing through 4941 // Phi nodes ends up with all cases to compute address of the form 4942 // BaseGV + Base + Scale * Index + Offset 4943 // where Scale and Offset are constants and BaseGV, Base and Index 4944 // are exactly the same Values in all cases. 4945 // It means that BaseGV, Scale and Offset dominate our memory instruction 4946 // and have the same value as they had in address computation represented 4947 // as Phi. So we can safely sink address computation to memory instruction. 4948 if (!Visited.insert(V).second) 4949 continue; 4950 4951 // For a PHI node, push all of its incoming values. 4952 if (PHINode *P = dyn_cast<PHINode>(V)) { 4953 for (Value *IncValue : P->incoming_values()) 4954 worklist.push_back(IncValue); 4955 PhiOrSelectSeen = true; 4956 continue; 4957 } 4958 // Similar for select. 4959 if (SelectInst *SI = dyn_cast<SelectInst>(V)) { 4960 worklist.push_back(SI->getFalseValue()); 4961 worklist.push_back(SI->getTrueValue()); 4962 PhiOrSelectSeen = true; 4963 continue; 4964 } 4965 4966 // For non-PHIs, determine the addressing mode being computed. Note that 4967 // the result may differ depending on what other uses our candidate 4968 // addressing instructions might have. 4969 AddrModeInsts.clear(); 4970 std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr, 4971 0); 4972 ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( 4973 V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, 4974 InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, 4975 BFI.get()); 4976 4977 GetElementPtrInst *GEP = LargeOffsetGEP.first; 4978 if (GEP && !NewGEPBases.count(GEP)) { 4979 // If splitting the underlying data structure can reduce the offset of a 4980 // GEP, collect the GEP. Skip the GEPs that are the new bases of 4981 // previously split data structures. 4982 LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP); 4983 if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end()) 4984 LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size(); 4985 } 4986 4987 NewAddrMode.OriginalValue = V; 4988 if (!AddrModes.addNewAddrMode(NewAddrMode)) 4989 break; 4990 } 4991 4992 // Try to combine the AddrModes we've collected. If we couldn't collect any, 4993 // or we have multiple but either couldn't combine them or combining them 4994 // wouldn't do anything useful, bail out now.
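  // For instance (an illustrative sketch): if the incoming addresses differ
  // in more than one component (say, different base registers *and* different
  // constant offsets), combineAddrModes() gives up; if they differ in exactly
  // one component, it can still merge them by materializing a phi or select
  // for just that component.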
4995 if (!AddrModes.combineAddrModes()) { 4996 TPT.rollback(LastKnownGood); 4997 return false; 4998 } 4999 bool Modified = TPT.commit(); 5000 5001 // Get the combined AddrMode (or the only AddrMode, if we only had one). 5002 ExtAddrMode AddrMode = AddrModes.getAddrMode(); 5003 5004 // If all the instructions matched are already in this BB, don't do anything. 5005 // If we saw a Phi node then it is not local definitely, and if we saw a select 5006 // then we want to push the address calculation past it even if it's already 5007 // in this BB. 5008 if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) { 5009 return IsNonLocalValue(V, MemoryInst->getParent()); 5010 })) { 5011 LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode 5012 << "\n"); 5013 return Modified; 5014 } 5015 5016 // Insert this computation right after this user. Since our caller is 5017 // scanning from the top of the BB to the bottom, reuse of the expr are 5018 // guaranteed to happen later. 5019 IRBuilder<> Builder(MemoryInst); 5020 5021 // Now that we determined the addressing expression we want to use and know 5022 // that we have to sink it into this block. Check to see if we have already 5023 // done this for some other load/store instr in this block. If so, reuse 5024 // the computation. Before attempting reuse, check if the address is valid 5025 // as it may have been erased. 5026 5027 WeakTrackingVH SunkAddrVH = SunkAddrs[Addr]; 5028 5029 Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; 5030 if (SunkAddr) { 5031 LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode 5032 << " for " << *MemoryInst << "\n"); 5033 if (SunkAddr->getType() != Addr->getType()) 5034 SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); 5035 } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && 5036 SubtargetInfo->addrSinkUsingGEPs())) { 5037 // By default, we use the GEP-based method when AA is used later. This 5038 // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. 5039 LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode 5040 << " for " << *MemoryInst << "\n"); 5041 Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); 5042 Value *ResultPtr = nullptr, *ResultIndex = nullptr; 5043 5044 // First, find the pointer. 5045 if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) { 5046 ResultPtr = AddrMode.BaseReg; 5047 AddrMode.BaseReg = nullptr; 5048 } 5049 5050 if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) { 5051 // We can't add more than one pointer together, nor can we scale a 5052 // pointer (both of which seem meaningless). 5053 if (ResultPtr || AddrMode.Scale != 1) 5054 return Modified; 5055 5056 ResultPtr = AddrMode.ScaledReg; 5057 AddrMode.Scale = 0; 5058 } 5059 5060 // It is only safe to sign extend the BaseReg if we know that the math 5061 // required to create it did not overflow before we extend it. Since 5062 // the original IR value was tossed in favor of a constant back when 5063 // the AddrMode was created we need to bail out gracefully if widths 5064 // do not match instead of extending it. 5065 // 5066 // (See below for code to add the scale.) 
5067 if (AddrMode.Scale) { 5068 Type *ScaledRegTy = AddrMode.ScaledReg->getType(); 5069 if (cast<IntegerType>(IntPtrTy)->getBitWidth() > 5070 cast<IntegerType>(ScaledRegTy)->getBitWidth()) 5071 return Modified; 5072 } 5073 5074 if (AddrMode.BaseGV) { 5075 if (ResultPtr) 5076 return Modified; 5077 5078 ResultPtr = AddrMode.BaseGV; 5079 } 5080 5081 // If the real base value actually came from an inttoptr, then the matcher 5082 // will look through it and provide only the integer value. In that case, 5083 // use it here. 5084 if (!DL->isNonIntegralPointerType(Addr->getType())) { 5085 if (!ResultPtr && AddrMode.BaseReg) { 5086 ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), 5087 "sunkaddr"); 5088 AddrMode.BaseReg = nullptr; 5089 } else if (!ResultPtr && AddrMode.Scale == 1) { 5090 ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), 5091 "sunkaddr"); 5092 AddrMode.Scale = 0; 5093 } 5094 } 5095 5096 if (!ResultPtr && 5097 !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { 5098 SunkAddr = Constant::getNullValue(Addr->getType()); 5099 } else if (!ResultPtr) { 5100 return Modified; 5101 } else { 5102 Type *I8PtrTy = 5103 Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); 5104 Type *I8Ty = Builder.getInt8Ty(); 5105 5106 // Start with the base register. Do this first so that subsequent address 5107 // matching finds it last, which will prevent it from trying to match it 5108 // as the scaled value in case it happens to be a mul. That would be 5109 // problematic if we've sunk a different mul for the scale, because then 5110 // we'd end up sinking both muls. 5111 if (AddrMode.BaseReg) { 5112 Value *V = AddrMode.BaseReg; 5113 if (V->getType() != IntPtrTy) 5114 V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); 5115 5116 ResultIndex = V; 5117 } 5118 5119 // Add the scale value. 5120 if (AddrMode.Scale) { 5121 Value *V = AddrMode.ScaledReg; 5122 if (V->getType() == IntPtrTy) { 5123 // done. 5124 } else { 5125 assert(cast<IntegerType>(IntPtrTy)->getBitWidth() < 5126 cast<IntegerType>(V->getType())->getBitWidth() && 5127 "We can't transform if ScaledReg is too narrow"); 5128 V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); 5129 } 5130 5131 if (AddrMode.Scale != 1) 5132 V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), 5133 "sunkaddr"); 5134 if (ResultIndex) 5135 ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr"); 5136 else 5137 ResultIndex = V; 5138 } 5139 5140 // Add in the Base Offset if present. 5141 if (AddrMode.BaseOffs) { 5142 Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); 5143 if (ResultIndex) { 5144 // We need to add this separately from the scale above to help with 5145 // SDAG consecutive load/store merging. 5146 if (ResultPtr->getType() != I8PtrTy) 5147 ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); 5148 ResultPtr = 5149 AddrMode.InBounds 5150 ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, 5151 "sunkaddr") 5152 : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); 5153 } 5154 5155 ResultIndex = V; 5156 } 5157 5158 if (!ResultIndex) { 5159 SunkAddr = ResultPtr; 5160 } else { 5161 if (ResultPtr->getType() != I8PtrTy) 5162 ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); 5163 SunkAddr = 5164 AddrMode.InBounds 5165 ? 
Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, 5166 "sunkaddr") 5167 : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); 5168 } 5169 5170 if (SunkAddr->getType() != Addr->getType()) 5171 SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); 5172 } 5173 } else { 5174 // We'd require a ptrtoint/inttoptr down the line, which we can't do for 5175 // non-integral pointers, so in that case bail out now. 5176 Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr; 5177 Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr; 5178 PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); 5179 PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); 5180 if (DL->isNonIntegralPointerType(Addr->getType()) || 5181 (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) || 5182 (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || 5183 (AddrMode.BaseGV && 5184 DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) 5185 return Modified; 5186 5187 LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode 5188 << " for " << *MemoryInst << "\n"); 5189 Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); 5190 Value *Result = nullptr; 5191 5192 // Start with the base register. Do this first so that subsequent address 5193 // matching finds it last, which will prevent it from trying to match it 5194 // as the scaled value in case it happens to be a mul. That would be 5195 // problematic if we've sunk a different mul for the scale, because then 5196 // we'd end up sinking both muls. 5197 if (AddrMode.BaseReg) { 5198 Value *V = AddrMode.BaseReg; 5199 if (V->getType()->isPointerTy()) 5200 V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); 5201 if (V->getType() != IntPtrTy) 5202 V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); 5203 Result = V; 5204 } 5205 5206 // Add the scale value. 5207 if (AddrMode.Scale) { 5208 Value *V = AddrMode.ScaledReg; 5209 if (V->getType() == IntPtrTy) { 5210 // done. 5211 } else if (V->getType()->isPointerTy()) { 5212 V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); 5213 } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < 5214 cast<IntegerType>(V->getType())->getBitWidth()) { 5215 V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); 5216 } else { 5217 // It is only safe to sign extend the BaseReg if we know that the math 5218 // required to create it did not overflow before we extend it. Since 5219 // the original IR value was tossed in favor of a constant back when 5220 // the AddrMode was created we need to bail out gracefully if widths 5221 // do not match instead of extending it. 5222 Instruction *I = dyn_cast_or_null<Instruction>(Result); 5223 if (I && (Result != AddrMode.BaseReg)) 5224 I->eraseFromParent(); 5225 return Modified; 5226 } 5227 if (AddrMode.Scale != 1) 5228 V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), 5229 "sunkaddr"); 5230 if (Result) 5231 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 5232 else 5233 Result = V; 5234 } 5235 5236 // Add in the BaseGV if present. 5237 if (AddrMode.BaseGV) { 5238 Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); 5239 if (Result) 5240 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 5241 else 5242 Result = V; 5243 } 5244 5245 // Add in the Base Offset if present. 
5246 if (AddrMode.BaseOffs) { 5247 Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); 5248 if (Result) 5249 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 5250 else 5251 Result = V; 5252 } 5253 5254 if (!Result) 5255 SunkAddr = Constant::getNullValue(Addr->getType()); 5256 else 5257 SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); 5258 } 5259 5260 MemoryInst->replaceUsesOfWith(Repl, SunkAddr); 5261 // Store the newly computed address into the cache. In the case we reused a 5262 // value, this should be idempotent. 5263 SunkAddrs[Addr] = WeakTrackingVH(SunkAddr); 5264 5265 // If we have no uses, recursively delete the value and all dead instructions 5266 // using it. 5267 if (Repl->use_empty()) { 5268 // This can cause recursive deletion, which can invalidate our iterator. 5269 // Use a WeakTrackingVH to hold onto it in case this happens. 5270 Value *CurValue = &*CurInstIterator; 5271 WeakTrackingVH IterHandle(CurValue); 5272 BasicBlock *BB = CurInstIterator->getParent(); 5273 5274 RecursivelyDeleteTriviallyDeadInstructions( 5275 Repl, TLInfo, nullptr, 5276 [&](Value *V) { removeAllAssertingVHReferences(V); }); 5277 5278 if (IterHandle != CurValue) { 5279 // If the iterator instruction was recursively deleted, start over at the 5280 // start of the block. 5281 CurInstIterator = BB->begin(); 5282 SunkAddrs.clear(); 5283 } 5284 } 5285 ++NumMemoryInsts; 5286 return true; 5287 } 5288 5289 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find 5290 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can 5291 /// only handle a 2 operand GEP in the same basic block or a splat constant 5292 /// vector. The 2 operands to the GEP must have a scalar pointer and a vector 5293 /// index. 5294 /// 5295 /// If the existing GEP has a vector base pointer that is splat, we can look 5296 /// through the splat to find the scalar pointer. If we can't find a scalar 5297 /// pointer there's nothing we can do. 5298 /// 5299 /// If we have a GEP with more than 2 indices where the middle indices are all 5300 /// zeroes, we can replace it with 2 GEPs where the second has 2 operands. 5301 /// 5302 /// If the final index isn't a vector or is a splat, we can emit a scalar GEP 5303 /// followed by a GEP with an all zeroes vector index. This will enable 5304 /// SelectionDAGBuilder to use a the scalar GEP as the uniform base and have a 5305 /// zero index. 5306 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, 5307 Value *Ptr) { 5308 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 5309 if (!GEP || !GEP->hasIndices()) 5310 return false; 5311 5312 // If the GEP and the gather/scatter aren't in the same BB, don't optimize. 5313 // FIXME: We should support this by sinking the GEP. 5314 if (MemoryInst->getParent() != GEP->getParent()) 5315 return false; 5316 5317 SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end()); 5318 5319 bool RewriteGEP = false; 5320 5321 if (Ops[0]->getType()->isVectorTy()) { 5322 Ops[0] = const_cast<Value *>(getSplatValue(Ops[0])); 5323 if (!Ops[0]) 5324 return false; 5325 RewriteGEP = true; 5326 } 5327 5328 unsigned FinalIndex = Ops.size() - 1; 5329 5330 // Ensure all but the last index is 0. 5331 // FIXME: This isn't strictly required. All that's required is that they are 5332 // all scalars or splats. 
5333 for (unsigned i = 1; i < FinalIndex; ++i) { 5334 auto *C = dyn_cast<Constant>(Ops[i]); 5335 if (!C) 5336 return false; 5337 if (isa<VectorType>(C->getType())) 5338 C = C->getSplatValue(); 5339 auto *CI = dyn_cast_or_null<ConstantInt>(C); 5340 if (!CI || !CI->isZero()) 5341 return false; 5342 // Scalarize the index if needed. 5343 Ops[i] = CI; 5344 } 5345 5346 // Try to scalarize the final index. 5347 if (Ops[FinalIndex]->getType()->isVectorTy()) { 5348 if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) { 5349 auto *C = dyn_cast<ConstantInt>(V); 5350 // Don't scalarize all zeros vector. 5351 if (!C || !C->isZero()) { 5352 Ops[FinalIndex] = V; 5353 RewriteGEP = true; 5354 } 5355 } 5356 } 5357 5358 // If we made any changes or the we have extra operands, we need to generate 5359 // new instructions. 5360 if (!RewriteGEP && Ops.size() == 2) 5361 return false; 5362 5363 unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements(); 5364 5365 IRBuilder<> Builder(MemoryInst); 5366 5367 Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType()); 5368 5369 Value *NewAddr; 5370 5371 // If the final index isn't a vector, emit a scalar GEP containing all ops 5372 // and a vector GEP with all zeroes final index. 5373 if (!Ops[FinalIndex]->getType()->isVectorTy()) { 5374 NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front()); 5375 auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts); 5376 NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy)); 5377 } else { 5378 Value *Base = Ops[0]; 5379 Value *Index = Ops[FinalIndex]; 5380 5381 // Create a scalar GEP if there are more than 2 operands. 5382 if (Ops.size() != 2) { 5383 // Replace the last index with 0. 5384 Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy); 5385 Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front()); 5386 } 5387 5388 // Now create the GEP with scalar pointer and vector index. 5389 NewAddr = Builder.CreateGEP(Base, Index); 5390 } 5391 5392 MemoryInst->replaceUsesOfWith(Ptr, NewAddr); 5393 5394 // If we have no uses, recursively delete the value and all dead instructions 5395 // using it. 5396 if (Ptr->use_empty()) 5397 RecursivelyDeleteTriviallyDeadInstructions( 5398 Ptr, TLInfo, nullptr, 5399 [&](Value *V) { removeAllAssertingVHReferences(V); }); 5400 5401 return true; 5402 } 5403 5404 /// If there are any memory operands, use OptimizeMemoryInst to sink their 5405 /// address computing into the block when possible / profitable. 5406 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { 5407 bool MadeChange = false; 5408 5409 const TargetRegisterInfo *TRI = 5410 TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); 5411 TargetLowering::AsmOperandInfoVector TargetConstraints = 5412 TLI->ParseConstraints(*DL, TRI, *CS); 5413 unsigned ArgNo = 0; 5414 for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { 5415 TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; 5416 5417 // Compute the constraint code and ConstraintType to use. 5418 TLI->ComputeConstraintToUse(OpInfo, SDValue()); 5419 5420 if (OpInfo.ConstraintType == TargetLowering::C_Memory && 5421 OpInfo.isIndirect) { 5422 Value *OpVal = CS->getArgOperand(ArgNo++); 5423 MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u); 5424 } else if (OpInfo.Type == InlineAsm::isInput) 5425 ArgNo++; 5426 } 5427 5428 return MadeChange; 5429 } 5430 5431 /// Check if all the uses of \p Val are equivalent (or free) zero or 5432 /// sign extensions. 
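///
/// For instance (illustrative only), if every user of \p Val is a zero
/// extension:
/// \code
///   %e1 = zext i8 %val to i32
///   %e2 = zext i8 %val to i64
/// \endcode
/// the uses are treated as equivalent when the target reports that zero
/// extending from i32 to i64 is free (TLI.isZExtFree), since the wider value
/// can then be recreated from the narrower one at no cost. Mixing sext and
/// zext users, or sext users with different types, is never considered
/// equivalent here.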
5433 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { 5434 assert(!Val->use_empty() && "Input must have at least one use"); 5435 const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); 5436 bool IsSExt = isa<SExtInst>(FirstUser); 5437 Type *ExtTy = FirstUser->getType(); 5438 for (const User *U : Val->users()) { 5439 const Instruction *UI = cast<Instruction>(U); 5440 if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) 5441 return false; 5442 Type *CurTy = UI->getType(); 5443 // Same input and output types: Same instruction after CSE. 5444 if (CurTy == ExtTy) 5445 continue; 5446 5447 // If IsSExt is true, we are in this situation: 5448 // a = Val 5449 // b = sext ty1 a to ty2 5450 // c = sext ty1 a to ty3 5451 // Assuming ty2 is shorter than ty3, this could be turned into: 5452 // a = Val 5453 // b = sext ty1 a to ty2 5454 // c = sext ty2 b to ty3 5455 // However, the last sext is not free. 5456 if (IsSExt) 5457 return false; 5458 5459 // This is a ZExt, maybe this is free to extend from one type to another. 5460 // In that case, we would not account for a different use. 5461 Type *NarrowTy; 5462 Type *LargeTy; 5463 if (ExtTy->getScalarType()->getIntegerBitWidth() > 5464 CurTy->getScalarType()->getIntegerBitWidth()) { 5465 NarrowTy = CurTy; 5466 LargeTy = ExtTy; 5467 } else { 5468 NarrowTy = ExtTy; 5469 LargeTy = CurTy; 5470 } 5471 5472 if (!TLI.isZExtFree(NarrowTy, LargeTy)) 5473 return false; 5474 } 5475 // All uses are the same or can be derived from one another for free. 5476 return true; 5477 } 5478 5479 /// Try to speculatively promote extensions in \p Exts and continue 5480 /// promoting through newly promoted operands recursively as far as doing so is 5481 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. 5482 /// When some promotion happened, \p TPT contains the proper state to revert 5483 /// them. 5484 /// 5485 /// \return true if some promotion happened, false otherwise. 5486 bool CodeGenPrepare::tryToPromoteExts( 5487 TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, 5488 SmallVectorImpl<Instruction *> &ProfitablyMovedExts, 5489 unsigned CreatedInstsCost) { 5490 bool Promoted = false; 5491 5492 // Iterate over all the extensions to try to promote them. 5493 for (auto *I : Exts) { 5494 // Early check if we directly have ext(load). 5495 if (isa<LoadInst>(I->getOperand(0))) { 5496 ProfitablyMovedExts.push_back(I); 5497 continue; 5498 } 5499 5500 // Check whether or not we want to do any promotion. The reason we have 5501 // this check inside the for loop is to catch the case where an extension 5502 // is directly fed by a load because in such case the extension can be moved 5503 // up without any promotion on its operands. 5504 if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion) 5505 return false; 5506 5507 // Get the action to perform the promotion. 5508 TypePromotionHelper::Action TPH = 5509 TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); 5510 // Check if we can promote. 5511 if (!TPH) { 5512 // Save the current extension as we cannot move up through its operand. 5513 ProfitablyMovedExts.push_back(I); 5514 continue; 5515 } 5516 5517 // Save the current state. 5518 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 5519 TPT.getRestorationPoint(); 5520 SmallVector<Instruction *, 4> NewExts; 5521 unsigned NewCreatedInstsCost = 0; 5522 unsigned ExtCost = !TLI->isExtFree(I); 5523 // Promote. 
5524 Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost, 5525 &NewExts, nullptr, *TLI); 5526 assert(PromotedVal && 5527 "TypePromotionHelper should have filtered out those cases"); 5528 5529 // We would be able to merge only one extension in a load. 5530 // Therefore, if we have more than 1 new extension we heuristically 5531 // cut this search path, because it means we degrade the code quality. 5532 // With exactly 2, the transformation is neutral, because we will merge 5533 // one extension but leave one. However, we optimistically keep going, 5534 // because the new extension may be removed too. 5535 long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; 5536 // FIXME: It would be possible to propagate a negative value instead of 5537 // conservatively ceiling it to 0. 5538 TotalCreatedInstsCost = 5539 std::max((long long)0, (TotalCreatedInstsCost - ExtCost)); 5540 if (!StressExtLdPromotion && 5541 (TotalCreatedInstsCost > 1 || 5542 !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { 5543 // This promotion is not profitable, rollback to the previous state, and 5544 // save the current extension in ProfitablyMovedExts as the latest 5545 // speculative promotion turned out to be unprofitable. 5546 TPT.rollback(LastKnownGood); 5547 ProfitablyMovedExts.push_back(I); 5548 continue; 5549 } 5550 // Continue promoting NewExts as far as doing so is profitable. 5551 SmallVector<Instruction *, 2> NewlyMovedExts; 5552 (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); 5553 bool NewPromoted = false; 5554 for (auto *ExtInst : NewlyMovedExts) { 5555 Instruction *MovedExt = cast<Instruction>(ExtInst); 5556 Value *ExtOperand = MovedExt->getOperand(0); 5557 // If we have reached to a load, we need this extra profitability check 5558 // as it could potentially be merged into an ext(load). 5559 if (isa<LoadInst>(ExtOperand) && 5560 !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || 5561 (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) 5562 continue; 5563 5564 ProfitablyMovedExts.push_back(MovedExt); 5565 NewPromoted = true; 5566 } 5567 5568 // If none of speculative promotions for NewExts is profitable, rollback 5569 // and save the current extension (I) as the last profitable extension. 5570 if (!NewPromoted) { 5571 TPT.rollback(LastKnownGood); 5572 ProfitablyMovedExts.push_back(I); 5573 continue; 5574 } 5575 // The promotion is profitable. 5576 Promoted = true; 5577 } 5578 return Promoted; 5579 } 5580 5581 /// Merging redundant sexts when one is dominating the other. 5582 bool CodeGenPrepare::mergeSExts(Function &F) { 5583 bool Changed = false; 5584 for (auto &Entry : ValToSExtendedUses) { 5585 SExts &Insts = Entry.second; 5586 SExts CurPts; 5587 for (Instruction *Inst : Insts) { 5588 if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || 5589 Inst->getOperand(0) != Entry.first) 5590 continue; 5591 bool inserted = false; 5592 for (auto &Pt : CurPts) { 5593 if (getDT(F).dominates(Inst, Pt)) { 5594 Pt->replaceAllUsesWith(Inst); 5595 RemovedInsts.insert(Pt); 5596 Pt->removeFromParent(); 5597 Pt = Inst; 5598 inserted = true; 5599 Changed = true; 5600 break; 5601 } 5602 if (!getDT(F).dominates(Pt, Inst)) 5603 // Give up if we need to merge in a common dominator as the 5604 // experiments show it is not profitable. 
5605 continue; 5606 Inst->replaceAllUsesWith(Pt); 5607 RemovedInsts.insert(Inst); 5608 Inst->removeFromParent(); 5609 inserted = true; 5610 Changed = true; 5611 break; 5612 } 5613 if (!inserted) 5614 CurPts.push_back(Inst); 5615 } 5616 } 5617 return Changed; 5618 } 5619 5620 // Splitting large data structures so that the GEPs accessing them can have 5621 // smaller offsets so that they can be sunk to the same blocks as their users. 5622 // For example, a large struct starting from %base is split into two parts 5623 // where the second part starts from %new_base. 5624 // 5625 // Before: 5626 // BB0: 5627 // %base = 5628 // 5629 // BB1: 5630 // %gep0 = gep %base, off0 5631 // %gep1 = gep %base, off1 5632 // %gep2 = gep %base, off2 5633 // 5634 // BB2: 5635 // %load1 = load %gep0 5636 // %load2 = load %gep1 5637 // %load3 = load %gep2 5638 // 5639 // After: 5640 // BB0: 5641 // %base = 5642 // %new_base = gep %base, off0 5643 // 5644 // BB1: 5645 // %new_gep0 = %new_base 5646 // %new_gep1 = gep %new_base, off1 - off0 5647 // %new_gep2 = gep %new_base, off2 - off0 5648 // 5649 // BB2: 5650 // %load1 = load i32, i32* %new_gep0 5651 // %load2 = load i32, i32* %new_gep1 5652 // %load3 = load i32, i32* %new_gep2 5653 // 5654 // %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because 5655 // their offsets are smaller enough to fit into the addressing mode. 5656 bool CodeGenPrepare::splitLargeGEPOffsets() { 5657 bool Changed = false; 5658 for (auto &Entry : LargeOffsetGEPMap) { 5659 Value *OldBase = Entry.first; 5660 SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>> 5661 &LargeOffsetGEPs = Entry.second; 5662 auto compareGEPOffset = 5663 [&](const std::pair<GetElementPtrInst *, int64_t> &LHS, 5664 const std::pair<GetElementPtrInst *, int64_t> &RHS) { 5665 if (LHS.first == RHS.first) 5666 return false; 5667 if (LHS.second != RHS.second) 5668 return LHS.second < RHS.second; 5669 return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first]; 5670 }; 5671 // Sorting all the GEPs of the same data structures based on the offsets. 5672 llvm::sort(LargeOffsetGEPs, compareGEPOffset); 5673 LargeOffsetGEPs.erase( 5674 std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()), 5675 LargeOffsetGEPs.end()); 5676 // Skip if all the GEPs have the same offsets. 5677 if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second) 5678 continue; 5679 GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first; 5680 int64_t BaseOffset = LargeOffsetGEPs.begin()->second; 5681 Value *NewBaseGEP = nullptr; 5682 5683 auto *LargeOffsetGEP = LargeOffsetGEPs.begin(); 5684 while (LargeOffsetGEP != LargeOffsetGEPs.end()) { 5685 GetElementPtrInst *GEP = LargeOffsetGEP->first; 5686 int64_t Offset = LargeOffsetGEP->second; 5687 if (Offset != BaseOffset) { 5688 TargetLowering::AddrMode AddrMode; 5689 AddrMode.BaseOffs = Offset - BaseOffset; 5690 // The result type of the GEP might not be the type of the memory 5691 // access. 5692 if (!TLI->isLegalAddressingMode(*DL, AddrMode, 5693 GEP->getResultElementType(), 5694 GEP->getAddressSpace())) { 5695 // We need to create a new base if the offset to the current base is 5696 // too large to fit into the addressing mode. So, a very large struct 5697 // may be split into several parts. 5698 BaseGEP = GEP; 5699 BaseOffset = Offset; 5700 NewBaseGEP = nullptr; 5701 } 5702 } 5703 5704 // Generate a new GEP to replace the current one. 
5705 LLVMContext &Ctx = GEP->getContext(); 5706 Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); 5707 Type *I8PtrTy = 5708 Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace()); 5709 Type *I8Ty = Type::getInt8Ty(Ctx); 5710 5711 if (!NewBaseGEP) { 5712 // Create a new base if we don't have one yet. Find the insertion 5713 // pointer for the new base first. 5714 BasicBlock::iterator NewBaseInsertPt; 5715 BasicBlock *NewBaseInsertBB; 5716 if (auto *BaseI = dyn_cast<Instruction>(OldBase)) { 5717 // If the base of the struct is an instruction, the new base will be 5718 // inserted close to it. 5719 NewBaseInsertBB = BaseI->getParent(); 5720 if (isa<PHINode>(BaseI)) 5721 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); 5722 else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) { 5723 NewBaseInsertBB = 5724 SplitEdge(NewBaseInsertBB, Invoke->getNormalDest()); 5725 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); 5726 } else 5727 NewBaseInsertPt = std::next(BaseI->getIterator()); 5728 } else { 5729 // If the current base is an argument or global value, the new base 5730 // will be inserted into the entry block. 5731 NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock(); 5732 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); 5733 } 5734 IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt); 5735 // Create a new base. 5736 Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset); 5737 NewBaseGEP = OldBase; 5738 if (NewBaseGEP->getType() != I8PtrTy) 5739 NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy); 5740 NewBaseGEP = 5741 NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep"); 5742 NewGEPBases.insert(NewBaseGEP); 5743 } 5744 5745 IRBuilder<> Builder(GEP); 5746 Value *NewGEP = NewBaseGEP; 5747 if (Offset == BaseOffset) { 5748 if (GEP->getType() != I8PtrTy) 5749 NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); 5750 } else { 5751 // Calculate the new offset for the new GEP. 5752 Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset); 5753 NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index); 5754 5755 if (GEP->getType() != I8PtrTy) 5756 NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); 5757 } 5758 GEP->replaceAllUsesWith(NewGEP); 5759 LargeOffsetGEPID.erase(GEP); 5760 LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP); 5761 GEP->eraseFromParent(); 5762 Changed = true; 5763 } 5764 } 5765 return Changed; 5766 } 5767 5768 bool CodeGenPrepare::optimizePhiType( 5769 PHINode *I, SmallPtrSetImpl<PHINode *> &Visited, 5770 SmallPtrSetImpl<Instruction *> &DeletedInstrs) { 5771 // We are looking for a collection of interconnected phi nodes that together 5772 // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts 5773 // are of the same type. Convert the whole set of nodes to the type of the 5774 // bitcast.
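  // For example (an illustrative sketch; whether the conversion is performed
  // is decided by TLI->shouldConvertPhiType), a float phi fed only by
  // bitcasts of i32 values and consumed only by bitcasts back to i32:
  //   %f1  = bitcast i32 %ld1 to float
  //   %f2  = bitcast i32 %ld2 to float
  //   %phi = phi float [ %f1, %bb0 ], [ %f2, %bb1 ]
  //   %b   = bitcast float %phi to i32
  // can be rewritten as an i32 phi, leaving the bitcasts dead:
  //   %phi.tc = phi i32 [ %ld1, %bb0 ], [ %ld2, %bb1 ]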
5775 Type *PhiTy = I->getType(); 5776 Type *ConvertTy = nullptr; 5777 if (Visited.count(I) || 5778 (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy())) 5779 return false; 5780 5781 SmallVector<Instruction *, 4> Worklist; 5782 Worklist.push_back(cast<Instruction>(I)); 5783 SmallPtrSet<PHINode *, 4> PhiNodes; 5784 PhiNodes.insert(I); 5785 Visited.insert(I); 5786 SmallPtrSet<Instruction *, 4> Defs; 5787 SmallPtrSet<Instruction *, 4> Uses; 5788 5789 while (!Worklist.empty()) { 5790 Instruction *II = Worklist.pop_back_val(); 5791 5792 if (auto *Phi = dyn_cast<PHINode>(II)) { 5793 // Handle Defs, which might also be PHI's 5794 for (Value *V : Phi->incoming_values()) { 5795 if (auto *OpPhi = dyn_cast<PHINode>(V)) { 5796 if (!PhiNodes.count(OpPhi)) { 5797 if (Visited.count(OpPhi)) 5798 return false; 5799 PhiNodes.insert(OpPhi); 5800 Visited.insert(OpPhi); 5801 Worklist.push_back(OpPhi); 5802 } 5803 } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) { 5804 if (!Defs.count(OpLoad)) { 5805 Defs.insert(OpLoad); 5806 Worklist.push_back(OpLoad); 5807 } 5808 } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) { 5809 if (!Defs.count(OpEx)) { 5810 Defs.insert(OpEx); 5811 Worklist.push_back(OpEx); 5812 } 5813 } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { 5814 if (!ConvertTy) 5815 ConvertTy = OpBC->getOperand(0)->getType(); 5816 if (OpBC->getOperand(0)->getType() != ConvertTy) 5817 return false; 5818 if (!Defs.count(OpBC)) { 5819 Defs.insert(OpBC); 5820 Worklist.push_back(OpBC); 5821 } 5822 } else if (!isa<UndefValue>(V)) 5823 return false; 5824 } 5825 } 5826 5827 // Handle uses which might also be phi's 5828 for (User *V : II->users()) { 5829 if (auto *OpPhi = dyn_cast<PHINode>(V)) { 5830 if (!PhiNodes.count(OpPhi)) { 5831 if (Visited.count(OpPhi)) 5832 return false; 5833 PhiNodes.insert(OpPhi); 5834 Visited.insert(OpPhi); 5835 Worklist.push_back(OpPhi); 5836 } 5837 } else if (auto *OpStore = dyn_cast<StoreInst>(V)) { 5838 if (OpStore->getOperand(0) != II) 5839 return false; 5840 Uses.insert(OpStore); 5841 } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { 5842 if (!ConvertTy) 5843 ConvertTy = OpBC->getType(); 5844 if (OpBC->getType() != ConvertTy) 5845 return false; 5846 Uses.insert(OpBC); 5847 } else 5848 return false; 5849 } 5850 } 5851 5852 if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy)) 5853 return false; 5854 5855 LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to " 5856 << *ConvertTy << "\n"); 5857 5858 // Create all the new phi nodes of the new type, and bitcast any loads to the 5859 // correct type. 5860 ValueToValueMap ValMap; 5861 ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy); 5862 for (Instruction *D : Defs) { 5863 if (isa<BitCastInst>(D)) 5864 ValMap[D] = D->getOperand(0); 5865 else 5866 ValMap[D] = 5867 new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode()); 5868 } 5869 for (PHINode *Phi : PhiNodes) 5870 ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(), 5871 Phi->getName() + ".tc", Phi); 5872 // Pipe together all the PhiNodes. 
5873 for (PHINode *Phi : PhiNodes) { 5874 PHINode *NewPhi = cast<PHINode>(ValMap[Phi]); 5875 for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++) 5876 NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)], 5877 Phi->getIncomingBlock(i)); 5878 } 5879 // And finally pipe up the stores and bitcasts 5880 for (Instruction *U : Uses) { 5881 if (isa<BitCastInst>(U)) { 5882 DeletedInstrs.insert(U); 5883 U->replaceAllUsesWith(ValMap[U->getOperand(0)]); 5884 } else 5885 U->setOperand(0, 5886 new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U)); 5887 } 5888 5889 // Save the removed phis to be deleted later. 5890 for (PHINode *Phi : PhiNodes) 5891 DeletedInstrs.insert(Phi); 5892 return true; 5893 } 5894 5895 bool CodeGenPrepare::optimizePhiTypes(Function &F) { 5896 if (!OptimizePhiTypes) 5897 return false; 5898 5899 bool Changed = false; 5900 SmallPtrSet<PHINode *, 4> Visited; 5901 SmallPtrSet<Instruction *, 4> DeletedInstrs; 5902 5903 // Attempt to optimize all the phis in the functions to the correct type. 5904 for (auto &BB : F) 5905 for (auto &Phi : BB.phis()) 5906 Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs); 5907 5908 // Remove any old phi's that have been converted. 5909 for (auto *I : DeletedInstrs) { 5910 I->replaceAllUsesWith(UndefValue::get(I->getType())); 5911 I->eraseFromParent(); 5912 } 5913 5914 return Changed; 5915 } 5916 5917 /// Return true, if an ext(load) can be formed from an extension in 5918 /// \p MovedExts. 5919 bool CodeGenPrepare::canFormExtLd( 5920 const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, 5921 Instruction *&Inst, bool HasPromoted) { 5922 for (auto *MovedExtInst : MovedExts) { 5923 if (isa<LoadInst>(MovedExtInst->getOperand(0))) { 5924 LI = cast<LoadInst>(MovedExtInst->getOperand(0)); 5925 Inst = MovedExtInst; 5926 break; 5927 } 5928 } 5929 if (!LI) 5930 return false; 5931 5932 // If they're already in the same block, there's nothing to do. 5933 // Make the cheap checks first if we did not promote. 5934 // If we promoted, we need to check if it is indeed profitable. 5935 if (!HasPromoted && LI->getParent() == Inst->getParent()) 5936 return false; 5937 5938 return TLI->isExtLoad(LI, Inst, *DL); 5939 } 5940 5941 /// Move a zext or sext fed by a load into the same basic block as the load, 5942 /// unless conditions are unfavorable. This allows SelectionDAG to fold the 5943 /// extend into the load. 5944 /// 5945 /// E.g., 5946 /// \code 5947 /// %ld = load i32* %addr 5948 /// %add = add nuw i32 %ld, 4 5949 /// %zext = zext i32 %add to i64 5950 // \endcode 5951 /// => 5952 /// \code 5953 /// %ld = load i32* %addr 5954 /// %zext = zext i32 %ld to i64 5955 /// %add = add nuw i64 %zext, 4 5956 /// \encode 5957 /// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which 5958 /// allow us to match zext(load i32*) to i64. 5959 /// 5960 /// Also, try to promote the computations used to obtain a sign extended 5961 /// value used into memory accesses. 5962 /// E.g., 5963 /// \code 5964 /// a = add nsw i32 b, 3 5965 /// d = sext i32 a to i64 5966 /// e = getelementptr ..., i64 d 5967 /// \endcode 5968 /// => 5969 /// \code 5970 /// f = sext i32 b to i64 5971 /// a = add nsw i64 f, 3 5972 /// e = getelementptr ..., i64 a 5973 /// \endcode 5974 /// 5975 /// \p Inst[in/out] the extension may be modified during the process if some 5976 /// promotions apply. 
5977 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { 5978 bool AllowPromotionWithoutCommonHeader = false; 5979 /// See if it is an interesting sext operations for the address type 5980 /// promotion before trying to promote it, e.g., the ones with the right 5981 /// type and used in memory accesses. 5982 bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( 5983 *Inst, AllowPromotionWithoutCommonHeader); 5984 TypePromotionTransaction TPT(RemovedInsts); 5985 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 5986 TPT.getRestorationPoint(); 5987 SmallVector<Instruction *, 1> Exts; 5988 SmallVector<Instruction *, 2> SpeculativelyMovedExts; 5989 Exts.push_back(Inst); 5990 5991 bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); 5992 5993 // Look for a load being extended. 5994 LoadInst *LI = nullptr; 5995 Instruction *ExtFedByLoad; 5996 5997 // Try to promote a chain of computation if it allows to form an extended 5998 // load. 5999 if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { 6000 assert(LI && ExtFedByLoad && "Expect a valid load and extension"); 6001 TPT.commit(); 6002 // Move the extend into the same block as the load. 6003 ExtFedByLoad->moveAfter(LI); 6004 ++NumExtsMoved; 6005 Inst = ExtFedByLoad; 6006 return true; 6007 } 6008 6009 // Continue promoting SExts if known as considerable depending on targets. 6010 if (ATPConsiderable && 6011 performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, 6012 HasPromoted, TPT, SpeculativelyMovedExts)) 6013 return true; 6014 6015 TPT.rollback(LastKnownGood); 6016 return false; 6017 } 6018 6019 // Perform address type promotion if doing so is profitable. 6020 // If AllowPromotionWithoutCommonHeader == false, we should find other sext 6021 // instructions that sign extended the same initial value. However, if 6022 // AllowPromotionWithoutCommonHeader == true, we expect promoting the 6023 // extension is just profitable. 6024 bool CodeGenPrepare::performAddressTypePromotion( 6025 Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, 6026 bool HasPromoted, TypePromotionTransaction &TPT, 6027 SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { 6028 bool Promoted = false; 6029 SmallPtrSet<Instruction *, 1> UnhandledExts; 6030 bool AllSeenFirst = true; 6031 for (auto *I : SpeculativelyMovedExts) { 6032 Value *HeadOfChain = I->getOperand(0); 6033 DenseMap<Value *, Instruction *>::iterator AlreadySeen = 6034 SeenChainsForSExt.find(HeadOfChain); 6035 // If there is an unhandled SExt which has the same header, try to promote 6036 // it as well. 6037 if (AlreadySeen != SeenChainsForSExt.end()) { 6038 if (AlreadySeen->second != nullptr) 6039 UnhandledExts.insert(AlreadySeen->second); 6040 AllSeenFirst = false; 6041 } 6042 } 6043 6044 if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && 6045 SpeculativelyMovedExts.size() == 1)) { 6046 TPT.commit(); 6047 if (HasPromoted) 6048 Promoted = true; 6049 for (auto *I : SpeculativelyMovedExts) { 6050 Value *HeadOfChain = I->getOperand(0); 6051 SeenChainsForSExt[HeadOfChain] = nullptr; 6052 ValToSExtendedUses[HeadOfChain].push_back(I); 6053 } 6054 // Update Inst as promotion happen. 6055 Inst = SpeculativelyMovedExts.pop_back_val(); 6056 } else { 6057 // This is the first chain visited from the header, keep the current chain 6058 // as unhandled. Defer to promote this until we encounter another SExt 6059 // chain derived from the same header. 
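      // For instance (an illustrative sketch): with two GEPs whose indices
      // are separate 'sext i32 %i to i64' instructions, the first chain only
      // records itself in SeenChainsForSExt[%i]; once the second chain with
      // the same header %i is visited, both chains are promoted, and
      // mergeSExts() can later fold the now-redundant sign extensions.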
6060 for (auto *I : SpeculativelyMovedExts) { 6061 Value *HeadOfChain = I->getOperand(0); 6062 SeenChainsForSExt[HeadOfChain] = Inst; 6063 } 6064 return false; 6065 } 6066 6067 if (!AllSeenFirst && !UnhandledExts.empty()) 6068 for (auto *VisitedSExt : UnhandledExts) { 6069 if (RemovedInsts.count(VisitedSExt)) 6070 continue; 6071 TypePromotionTransaction TPT(RemovedInsts); 6072 SmallVector<Instruction *, 1> Exts; 6073 SmallVector<Instruction *, 2> Chains; 6074 Exts.push_back(VisitedSExt); 6075 bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); 6076 TPT.commit(); 6077 if (HasPromoted) 6078 Promoted = true; 6079 for (auto *I : Chains) { 6080 Value *HeadOfChain = I->getOperand(0); 6081 // Mark this as handled. 6082 SeenChainsForSExt[HeadOfChain] = nullptr; 6083 ValToSExtendedUses[HeadOfChain].push_back(I); 6084 } 6085 } 6086 return Promoted; 6087 } 6088 6089 bool CodeGenPrepare::optimizeExtUses(Instruction *I) { 6090 BasicBlock *DefBB = I->getParent(); 6091 6092 // If the result of a {s|z}ext and its source are both live out, rewrite all 6093 // other uses of the source with result of extension. 6094 Value *Src = I->getOperand(0); 6095 if (Src->hasOneUse()) 6096 return false; 6097 6098 // Only do this xform if truncating is free. 6099 if (!TLI->isTruncateFree(I->getType(), Src->getType())) 6100 return false; 6101 6102 // Only safe to perform the optimization if the source is also defined in 6103 // this block. 6104 if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent()) 6105 return false; 6106 6107 bool DefIsLiveOut = false; 6108 for (User *U : I->users()) { 6109 Instruction *UI = cast<Instruction>(U); 6110 6111 // Figure out which BB this ext is used in. 6112 BasicBlock *UserBB = UI->getParent(); 6113 if (UserBB == DefBB) continue; 6114 DefIsLiveOut = true; 6115 break; 6116 } 6117 if (!DefIsLiveOut) 6118 return false; 6119 6120 // Make sure none of the uses are PHI nodes. 6121 for (User *U : Src->users()) { 6122 Instruction *UI = cast<Instruction>(U); 6123 BasicBlock *UserBB = UI->getParent(); 6124 if (UserBB == DefBB) continue; 6125 // Be conservative. We don't want this xform to end up introducing 6126 // reloads just before load / store instructions. 6127 if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI)) 6128 return false; 6129 } 6130 6131 // InsertedTruncs - Only insert one trunc in each block once. 6132 DenseMap<BasicBlock*, Instruction*> InsertedTruncs; 6133 6134 bool MadeChange = false; 6135 for (Use &U : Src->uses()) { 6136 Instruction *User = cast<Instruction>(U.getUser()); 6137 6138 // Figure out which BB this ext is used in. 6139 BasicBlock *UserBB = User->getParent(); 6140 if (UserBB == DefBB) continue; 6141 6142 // Both src and def are live in this block. Rewrite the use. 6143 Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; 6144 6145 if (!InsertedTrunc) { 6146 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); 6147 assert(InsertPt != UserBB->end()); 6148 InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); 6149 InsertedInsts.insert(InsertedTrunc); 6150 } 6151 6152 // Replace a use of the {s|z}ext source with a use of the result. 6153 U = InsertedTrunc; 6154 ++NumExtUses; 6155 MadeChange = true; 6156 } 6157 6158 return MadeChange; 6159 } 6160 6161 // Find loads whose uses only use some of the loaded value's bits. 
Add an "and" 6162 // just after the load if the target can fold this into one extload instruction, 6163 // with the hope of eliminating some of the other later "and" instructions using 6164 // the loaded value. "and"s that are made trivially redundant by the insertion 6165 // of the new "and" are removed by this function, while others (e.g. those whose 6166 // path from the load goes through a phi) are left for isel to potentially 6167 // remove. 6168 // 6169 // For example: 6170 // 6171 // b0: 6172 // x = load i32 6173 // ... 6174 // b1: 6175 // y = and x, 0xff 6176 // z = use y 6177 // 6178 // becomes: 6179 // 6180 // b0: 6181 // x = load i32 6182 // x' = and x, 0xff 6183 // ... 6184 // b1: 6185 // z = use x' 6186 // 6187 // whereas: 6188 // 6189 // b0: 6190 // x1 = load i32 6191 // ... 6192 // b1: 6193 // x2 = load i32 6194 // ... 6195 // b2: 6196 // x = phi x1, x2 6197 // y = and x, 0xff 6198 // 6199 // becomes (after a call to optimizeLoadExt for each load): 6200 // 6201 // b0: 6202 // x1 = load i32 6203 // x1' = and x1, 0xff 6204 // ... 6205 // b1: 6206 // x2 = load i32 6207 // x2' = and x2, 0xff 6208 // ... 6209 // b2: 6210 // x = phi x1', x2' 6211 // y = and x, 0xff 6212 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { 6213 if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy()) 6214 return false; 6215 6216 // Skip loads we've already transformed. 6217 if (Load->hasOneUse() && 6218 InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) 6219 return false; 6220 6221 // Look at all uses of Load, looking through phis, to determine how many bits 6222 // of the loaded value are needed. 6223 SmallVector<Instruction *, 8> WorkList; 6224 SmallPtrSet<Instruction *, 16> Visited; 6225 SmallVector<Instruction *, 8> AndsToMaybeRemove; 6226 for (auto *U : Load->users()) 6227 WorkList.push_back(cast<Instruction>(U)); 6228 6229 EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); 6230 unsigned BitWidth = LoadResultVT.getSizeInBits(); 6231 APInt DemandBits(BitWidth, 0); 6232 APInt WidestAndBits(BitWidth, 0); 6233 6234 while (!WorkList.empty()) { 6235 Instruction *I = WorkList.back(); 6236 WorkList.pop_back(); 6237 6238 // Break use-def graph loops. 6239 if (!Visited.insert(I).second) 6240 continue; 6241 6242 // For a PHI node, push all of its users. 6243 if (auto *Phi = dyn_cast<PHINode>(I)) { 6244 for (auto *U : Phi->users()) 6245 WorkList.push_back(cast<Instruction>(U)); 6246 continue; 6247 } 6248 6249 switch (I->getOpcode()) { 6250 case Instruction::And: { 6251 auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1)); 6252 if (!AndC) 6253 return false; 6254 APInt AndBits = AndC->getValue(); 6255 DemandBits |= AndBits; 6256 // Keep track of the widest and mask we see. 
6257 if (AndBits.ugt(WidestAndBits)) 6258 WidestAndBits = AndBits; 6259 if (AndBits == WidestAndBits && I->getOperand(0) == Load) 6260 AndsToMaybeRemove.push_back(I); 6261 break; 6262 } 6263 6264 case Instruction::Shl: { 6265 auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1)); 6266 if (!ShlC) 6267 return false; 6268 uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); 6269 DemandBits.setLowBits(BitWidth - ShiftAmt); 6270 break; 6271 } 6272 6273 case Instruction::Trunc: { 6274 EVT TruncVT = TLI->getValueType(*DL, I->getType()); 6275 unsigned TruncBitWidth = TruncVT.getSizeInBits(); 6276 DemandBits.setLowBits(TruncBitWidth); 6277 break; 6278 } 6279 6280 default: 6281 return false; 6282 } 6283 } 6284 6285 uint32_t ActiveBits = DemandBits.getActiveBits(); 6286 // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the 6287 // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, 6288 // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but 6289 // (and (load x) 1) is not matched as a single instruction, rather as a LDR 6290 // followed by an AND. 6291 // TODO: Look into removing this restriction by fixing backends to either 6292 // return false for isLoadExtLegal for i1 or have them select this pattern to 6293 // a single instruction. 6294 // 6295 // Also avoid hoisting if we didn't see any ands with the exact DemandBits 6296 // mask, since these are the only ands that will be removed by isel. 6297 if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) || 6298 WidestAndBits != DemandBits) 6299 return false; 6300 6301 LLVMContext &Ctx = Load->getType()->getContext(); 6302 Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); 6303 EVT TruncVT = TLI->getValueType(*DL, TruncTy); 6304 6305 // Reject cases that won't be matched as extloads. 6306 if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || 6307 !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) 6308 return false; 6309 6310 IRBuilder<> Builder(Load->getNextNode()); 6311 auto *NewAnd = cast<Instruction>( 6312 Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); 6313 // Mark this instruction as "inserted by CGP", so that other 6314 // optimizations don't touch it. 6315 InsertedInsts.insert(NewAnd); 6316 6317 // Replace all uses of load with new and (except for the use of load in the 6318 // new and itself). 6319 Load->replaceAllUsesWith(NewAnd); 6320 NewAnd->setOperand(0, Load); 6321 6322 // Remove any and instructions that are now redundant. 6323 for (auto *And : AndsToMaybeRemove) 6324 // Check that the and mask is the same as the one we decided to put on the 6325 // new and. 6326 if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) { 6327 And->replaceAllUsesWith(NewAnd); 6328 if (&*CurInstIterator == And) 6329 CurInstIterator = std::next(And->getIterator()); 6330 And->eraseFromParent(); 6331 ++NumAndUses; 6332 } 6333 6334 ++NumAndsAdded; 6335 return true; 6336 } 6337 6338 /// Check if V (an operand of a select instruction) is an expensive instruction 6339 /// that is only used once. 6340 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { 6341 auto *I = dyn_cast<Instruction>(V); 6342 // If it's safe to speculatively execute, then it should not have side 6343 // effects; therefore, it's safe to sink and possibly *not* execute. 
6344 return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && 6345 TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >= 6346 TargetTransformInfo::TCC_Expensive; 6347 } 6348 6349 /// Returns true if a SelectInst should be turned into an explicit branch. 6350 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, 6351 const TargetLowering *TLI, 6352 SelectInst *SI) { 6353 // If even a predictable select is cheap, then a branch can't be cheaper. 6354 if (!TLI->isPredictableSelectExpensive()) 6355 return false; 6356 6357 // FIXME: This should use the same heuristics as IfConversion to determine 6358 // whether a select is better represented as a branch. 6359 6360 // If metadata tells us that the select condition is obviously predictable, 6361 // then we want to replace the select with a branch. 6362 uint64_t TrueWeight, FalseWeight; 6363 if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { 6364 uint64_t Max = std::max(TrueWeight, FalseWeight); 6365 uint64_t Sum = TrueWeight + FalseWeight; 6366 if (Sum != 0) { 6367 auto Probability = BranchProbability::getBranchProbability(Max, Sum); 6368 if (Probability > TLI->getPredictableBranchThreshold()) 6369 return true; 6370 } 6371 } 6372 6373 CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); 6374 6375 // If a branch is predictable, an out-of-order CPU can avoid blocking on its 6376 // comparison condition. If the compare has more than one use, there's 6377 // probably another cmov or setcc around, so it's not worth emitting a branch. 6378 if (!Cmp || !Cmp->hasOneUse()) 6379 return false; 6380 6381 // If either operand of the select is expensive and only needed on one side 6382 // of the select, we should form a branch. 6383 if (sinkSelectOperand(TTI, SI->getTrueValue()) || 6384 sinkSelectOperand(TTI, SI->getFalseValue())) 6385 return true; 6386 6387 return false; 6388 } 6389 6390 /// If \p isTrue is true, return the true value of \p SI, otherwise return 6391 /// false value of \p SI. If the true/false value of \p SI is defined by any 6392 /// select instructions in \p Selects, look through the defining select 6393 /// instruction until the true/false value is not defined in \p Selects. 6394 static Value *getTrueOrFalseValue( 6395 SelectInst *SI, bool isTrue, 6396 const SmallPtrSet<const Instruction *, 2> &Selects) { 6397 Value *V = nullptr; 6398 6399 for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI); 6400 DefSI = dyn_cast<SelectInst>(V)) { 6401 assert(DefSI->getCondition() == SI->getCondition() && 6402 "The condition of DefSI does not match with SI"); 6403 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); 6404 } 6405 6406 assert(V && "Failed to get select true/false value"); 6407 return V; 6408 } 6409 6410 bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { 6411 assert(Shift->isShift() && "Expected a shift"); 6412 6413 // If this is (1) a vector shift, (2) shifts by scalars are cheaper than 6414 // general vector shifts, and (3) the shift amount is a select-of-splatted 6415 // values, hoist the shifts before the select: 6416 // shift Op0, (select Cond, TVal, FVal) --> 6417 // select Cond, (shift Op0, TVal), (shift Op0, FVal) 6418 // 6419 // This is inverting a generic IR transform when we know that the cost of a 6420 // general vector shift is more than the cost of 2 shift-by-scalars. 6421 // We can't do this effectively in SDAG because we may not be able to 6422 // determine if the select operands are splats from within a basic block. 
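  // As an illustrative sketch (the value names here are made up and not taken
  // from any particular test), provided the target reports that shifts by a
  // scalar are cheaper (see the checks below), a <4 x i32> shift whose amount
  // is a select of two splat constants:
  //   %amt = select i1 %cond, <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
  //                           <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  //   %r = shl <4 x i32> %x, %amt
  // would be rewritten into:
  //   %shl2 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  //   %shl3 = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  //   %r = select i1 %cond, <4 x i32> %shl2, <4 x i32> %shl3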
6423 Type *Ty = Shift->getType(); 6424 if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) 6425 return false; 6426 Value *Cond, *TVal, *FVal; 6427 if (!match(Shift->getOperand(1), 6428 m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) 6429 return false; 6430 if (!isSplatValue(TVal) || !isSplatValue(FVal)) 6431 return false; 6432 6433 IRBuilder<> Builder(Shift); 6434 BinaryOperator::BinaryOps Opcode = Shift->getOpcode(); 6435 Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal); 6436 Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal); 6437 Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); 6438 Shift->replaceAllUsesWith(NewSel); 6439 Shift->eraseFromParent(); 6440 return true; 6441 } 6442 6443 bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { 6444 Intrinsic::ID Opcode = Fsh->getIntrinsicID(); 6445 assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) && 6446 "Expected a funnel shift"); 6447 6448 // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper 6449 // than general vector shifts, and (3) the shift amount is select-of-splatted 6450 // values, hoist the funnel shifts before the select: 6451 // fsh Op0, Op1, (select Cond, TVal, FVal) --> 6452 // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal) 6453 // 6454 // This is inverting a generic IR transform when we know that the cost of a 6455 // general vector shift is more than the cost of 2 shift-by-scalars. 6456 // We can't do this effectively in SDAG because we may not be able to 6457 // determine if the select operands are splats from within a basic block. 6458 Type *Ty = Fsh->getType(); 6459 if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) 6460 return false; 6461 Value *Cond, *TVal, *FVal; 6462 if (!match(Fsh->getOperand(2), 6463 m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) 6464 return false; 6465 if (!isSplatValue(TVal) || !isSplatValue(FVal)) 6466 return false; 6467 6468 IRBuilder<> Builder(Fsh); 6469 Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1); 6470 Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal }); 6471 Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal }); 6472 Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); 6473 Fsh->replaceAllUsesWith(NewSel); 6474 Fsh->eraseFromParent(); 6475 return true; 6476 } 6477 6478 /// If we have a SelectInst that will likely profit from branch prediction, 6479 /// turn it into a branch. 6480 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { 6481 // If branch conversion isn't desirable, exit early. 6482 if (DisableSelectToBranch || OptSize || 6483 llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())) 6484 return false; 6485 6486 // Find all consecutive select instructions that share the same condition. 6487 SmallVector<SelectInst *, 2> ASI; 6488 ASI.push_back(SI); 6489 for (BasicBlock::iterator It = ++BasicBlock::iterator(SI); 6490 It != SI->getParent()->end(); ++It) { 6491 SelectInst *I = dyn_cast<SelectInst>(&*It); 6492 if (I && SI->getCondition() == I->getCondition()) { 6493 ASI.push_back(I); 6494 } else { 6495 break; 6496 } 6497 } 6498 6499 SelectInst *LastSI = ASI.back(); 6500 // Increment the current iterator to skip all the rest of select instructions 6501 // because they will be either "not lowered" or "all lowered" to branch. 
6502 CurInstIterator = std::next(LastSI->getIterator()); 6503 6504 bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); 6505 6506 // Can we convert the 'select' to CF ? 6507 if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable)) 6508 return false; 6509 6510 TargetLowering::SelectSupportKind SelectKind; 6511 if (VectorCond) 6512 SelectKind = TargetLowering::VectorMaskSelect; 6513 else if (SI->getType()->isVectorTy()) 6514 SelectKind = TargetLowering::ScalarCondVectorVal; 6515 else 6516 SelectKind = TargetLowering::ScalarValSelect; 6517 6518 if (TLI->isSelectSupported(SelectKind) && 6519 !isFormingBranchFromSelectProfitable(TTI, TLI, SI)) 6520 return false; 6521 6522 // The DominatorTree needs to be rebuilt by any consumers after this 6523 // transformation. We simply reset here rather than setting the ModifiedDT 6524 // flag to avoid restarting the function walk in runOnFunction for each 6525 // select optimized. 6526 DT.reset(); 6527 6528 // Transform a sequence like this: 6529 // start: 6530 // %cmp = cmp uge i32 %a, %b 6531 // %sel = select i1 %cmp, i32 %c, i32 %d 6532 // 6533 // Into: 6534 // start: 6535 // %cmp = cmp uge i32 %a, %b 6536 // %cmp.frozen = freeze %cmp 6537 // br i1 %cmp.frozen, label %select.true, label %select.false 6538 // select.true: 6539 // br label %select.end 6540 // select.false: 6541 // br label %select.end 6542 // select.end: 6543 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] 6544 // 6545 // %cmp should be frozen, otherwise it may introduce undefined behavior. 6546 // In addition, we may sink instructions that produce %c or %d from 6547 // the entry block into the destination(s) of the new branch. 6548 // If the true or false blocks do not contain a sunken instruction, that 6549 // block and its branch may be optimized away. In that case, one side of the 6550 // first branch will point directly to select.end, and the corresponding PHI 6551 // predecessor block will be the start block. 6552 6553 // First, we split the block containing the select into 2 blocks. 6554 BasicBlock *StartBlock = SI->getParent(); 6555 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); 6556 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); 6557 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); 6558 6559 // Delete the unconditional branch that was just created by the split. 6560 StartBlock->getTerminator()->eraseFromParent(); 6561 6562 // These are the new basic blocks for the conditional branch. 6563 // At least one will become an actual new basic block. 6564 BasicBlock *TrueBlock = nullptr; 6565 BasicBlock *FalseBlock = nullptr; 6566 BranchInst *TrueBranch = nullptr; 6567 BranchInst *FalseBranch = nullptr; 6568 6569 // Sink expensive instructions into the conditional blocks to avoid executing 6570 // them speculatively. 
6571 for (SelectInst *SI : ASI) {
6572 if (sinkSelectOperand(TTI, SI->getTrueValue())) {
6573 if (TrueBlock == nullptr) {
6574 TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
6575 EndBlock->getParent(), EndBlock);
6576 TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
6577 TrueBranch->setDebugLoc(SI->getDebugLoc());
6578 }
6579 auto *TrueInst = cast<Instruction>(SI->getTrueValue());
6580 TrueInst->moveBefore(TrueBranch);
6581 }
6582 if (sinkSelectOperand(TTI, SI->getFalseValue())) {
6583 if (FalseBlock == nullptr) {
6584 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
6585 EndBlock->getParent(), EndBlock);
6586 FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6587 FalseBranch->setDebugLoc(SI->getDebugLoc());
6588 }
6589 auto *FalseInst = cast<Instruction>(SI->getFalseValue());
6590 FalseInst->moveBefore(FalseBranch);
6591 }
6592 }
6593
6594 // If there was nothing to sink, then arbitrarily choose the 'false' side
6595 // for a new input value to the PHI.
6596 if (TrueBlock == FalseBlock) {
6597 assert(TrueBlock == nullptr &&
6598 "Unexpected basic block transform while optimizing select");
6599
6600 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
6601 EndBlock->getParent(), EndBlock);
6602 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6603 FalseBranch->setDebugLoc(SI->getDebugLoc());
6604 }
6605
6606 // Insert the real conditional branch based on the original condition.
6607 // If we did not create a new block for one of the 'true' or 'false' paths
6608 // of the condition, it means that side of the branch goes to the end block
6609 // directly and the path originates from the start block from the point of
6610 // view of the new PHI.
6611 BasicBlock *TT, *FT;
6612 if (TrueBlock == nullptr) {
6613 TT = EndBlock;
6614 FT = FalseBlock;
6615 TrueBlock = StartBlock;
6616 } else if (FalseBlock == nullptr) {
6617 TT = TrueBlock;
6618 FT = EndBlock;
6619 FalseBlock = StartBlock;
6620 } else {
6621 TT = TrueBlock;
6622 FT = FalseBlock;
6623 }
6624 IRBuilder<> IB(SI);
6625 auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
6626 IB.CreateCondBr(CondFr, TT, FT, SI);
6627
6628 SmallPtrSet<const Instruction *, 2> INS;
6629 INS.insert(ASI.begin(), ASI.end());
6630 // Use a reverse iterator because a later select may use the value of an
6631 // earlier select, and we need to propagate the value through the earlier
6632 // select to get the PHI operand.
6633 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
6634 SelectInst *SI = *It;
6635 // The select itself is replaced with a PHI Node.
6636 PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
6637 PN->takeName(SI);
6638 PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
6639 PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
6640 PN->setDebugLoc(SI->getDebugLoc());
6641
6642 SI->replaceAllUsesWith(PN);
6643 SI->eraseFromParent();
6644 INS.erase(SI);
6645 ++NumSelectsExpanded;
6646 }
6647
6648 // Instruct OptimizeBlock to skip to the next block.
6649 CurInstIterator = StartBlock->end();
6650 return true;
6651 }
6652
6653 /// Some targets only accept certain types for splat inputs. For example, a
6654 /// VDUP in MVE takes a GPR (integer) register, and the instructions that
6655 /// incorporate a VDUP (such as a VADD qd, qm, rm) also require a GPR register.
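/// As an illustrative sketch (assuming the target requests an i32 splat via
/// TLI->shouldConvertSplatType(); the exact scalar type is target-specific),
/// a float splat such as
/// \code
/// %ins = insertelement <4 x float> undef, float %f, i32 0
/// %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer
/// \endcode
/// can be rewritten so that the scalar travels in an integer register:
/// \code
/// %b = bitcast float %f to i32
/// %ins.i = insertelement <4 x i32> undef, i32 %b, i32 0
/// %splat.i = shufflevector <4 x i32> %ins.i, <4 x i32> undef, <4 x i32> zeroinitializer
/// %splat.f = bitcast <4 x i32> %splat.i to <4 x float>
/// \endcode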
6656 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
6657 if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
6658 m_Undef(), m_ZeroMask())))
6659 return false;
6660 Type *NewType = TLI->shouldConvertSplatType(SVI);
6661 if (!NewType)
6662 return false;
6663
6664 auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
6665 assert(!NewType->isVectorTy() && "Expected a scalar type!");
6666 assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
6667 "Expected a type of the same size!");
6668 auto *NewVecType =
6669 FixedVectorType::get(NewType, SVIVecType->getNumElements());
6670
6671 // Create a bitcast (shuffle (insert (bitcast(..))))
6672 IRBuilder<> Builder(SVI->getContext());
6673 Builder.SetInsertPoint(SVI);
6674 Value *BC1 = Builder.CreateBitCast(
6675 cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
6676 Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
6677 (uint64_t)0);
6678 Value *Shuffle = Builder.CreateShuffleVector(
6679 Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
6680 Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
6681
6682 SVI->replaceAllUsesWith(BC2);
6683 RecursivelyDeleteTriviallyDeadInstructions(
6684 SVI, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); });
6685
6686 // Also hoist the bitcast up to its operand if they are not in the same
6687 // block.
6688 if (auto *BCI = dyn_cast<Instruction>(BC1))
6689 if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
6690 if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
6691 !Op->isTerminator() && !Op->isEHPad())
6692 BCI->moveAfter(Op);
6693
6694 return true;
6695 }
6696
6697 bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
6698 // If the operands of I can be folded into a target instruction together with
6699 // I, duplicate and sink them.
6700 SmallVector<Use *, 4> OpsToSink;
6701 if (!TLI->shouldSinkOperands(I, OpsToSink))
6702 return false;
6703
6704 // OpsToSink can contain multiple uses in a use chain (e.g.
6705 // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
6706 // uses must come first, so we process the ops in reverse order so as to not
6707 // create invalid IR.
6708 BasicBlock *TargetBB = I->getParent();
6709 bool Changed = false;
6710 SmallVector<Use *, 4> ToReplace;
6711 for (Use *U : reverse(OpsToSink)) {
6712 auto *UI = cast<Instruction>(U->get());
6713 if (UI->getParent() == TargetBB || isa<PHINode>(UI))
6714 continue;
6715 ToReplace.push_back(U);
6716 }
6717
6718 SetVector<Instruction *> MaybeDead;
6719 DenseMap<Instruction *, Instruction *> NewInstructions;
6720 Instruction *InsertPoint = I;
6721 for (Use *U : ToReplace) {
6722 auto *UI = cast<Instruction>(U->get());
6723 Instruction *NI = UI->clone();
6724 NewInstructions[UI] = NI;
6725 MaybeDead.insert(UI);
6726 LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
6727 NI->insertBefore(InsertPoint);
6728 InsertPoint = NI;
6729 InsertedInsts.insert(NI);
6730
6731 // Update the use for the new instruction, making sure that we update the
6732 // sunk instruction's uses if it is part of a chain that has already been
6733 // sunk.
6734 Instruction *OldI = cast<Instruction>(U->getUser());
6735 if (NewInstructions.count(OldI))
6736 NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
6737 else
6738 U->set(NI);
6739 Changed = true;
6740 }
6741
6742 // Remove instructions that are dead after sinking.
6743 for (auto *I : MaybeDead) {
6744 if (!I->hasNUsesOrMore(1)) {
6745 LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
6746 I->eraseFromParent();
6747 }
6748 }
6749
6750 return Changed;
6751 }
6752
6753 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
6754 Value *Cond = SI->getCondition();
6755 Type *OldType = Cond->getType();
6756 LLVMContext &Context = Cond->getContext();
6757 MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
6758 unsigned RegWidth = RegType.getSizeInBits();
6759
6760 if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
6761 return false;
6762
6763 // If the register width is greater than the type width, expand the condition
6764 // of the switch instruction and each case constant to the width of the
6765 // register. By widening the type of the switch condition, subsequent
6766 // comparisons (for case comparisons) will not need to be extended to the
6767 // preferred register width, so we will potentially eliminate N-1 extends,
6768 // where N is the number of cases in the switch.
6769 auto *NewType = Type::getIntNTy(Context, RegWidth);
6770
6771 // Zero-extend the switch condition and case constants unless the switch
6772 // condition is a function argument that is already being sign-extended.
6773 // In that case, we can avoid an unnecessary mask/extension by sign-extending
6774 // everything instead.
6775 Instruction::CastOps ExtType = Instruction::ZExt;
6776 if (auto *Arg = dyn_cast<Argument>(Cond))
6777 if (Arg->hasSExtAttr())
6778 ExtType = Instruction::SExt;
6779
6780 auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
6781 ExtInst->insertBefore(SI);
6782 ExtInst->setDebugLoc(SI->getDebugLoc());
6783 SI->setCondition(ExtInst);
6784 for (auto Case : SI->cases()) {
6785 APInt NarrowConst = Case.getCaseValue()->getValue();
6786 APInt WideConst = (ExtType == Instruction::ZExt) ?
6787 NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
6788 Case.setValue(ConstantInt::get(Context, WideConst));
6789 }
6790
6791 return true;
6792 }
6793
6794
6795 namespace {
6796
6797 /// Helper class to promote a scalar operation to a vector one.
6798 /// This class is used to move an extractelement transition downward.
6799 /// E.g.,
6800 /// a = vector_op <2 x i32>
6801 /// b = extractelement <2 x i32> a, i32 0
6802 /// c = scalar_op b
6803 /// store c
6804 ///
6805 /// =>
6806 /// a = vector_op <2 x i32>
6807 /// c = vector_op a (equivalent to scalar_op on the related lane)
6808 /// * d = extractelement <2 x i32> c, i32 0
6809 /// * store d
6810 /// Assuming both extractelement and store can be combined, we get rid of the
6811 /// transition.
6812 class VectorPromoteHelper {
6813 /// DataLayout associated with the current module.
6814 const DataLayout &DL;
6815
6816 /// Used to perform some checks on the legality of vector operations.
6817 const TargetLowering &TLI;
6818
6819 /// Used to estimate the cost of the promoted chain.
6820 const TargetTransformInfo &TTI;
6821
6822 /// The transition being moved downwards.
6823 Instruction *Transition;
6824
6825 /// The sequence of instructions to be promoted.
6826 SmallVector<Instruction *, 4> InstsToBePromoted;
6827
6828 /// Cost of combining a store and an extract.
6829 unsigned StoreExtractCombineCost;
6830
6831 /// Instruction that will be combined with the transition.
6832 Instruction *CombineInst = nullptr;
6833
6834 /// The instruction that represents the current end of the transition.
6835 /// Since we are faking the promotion until we reach the end of the chain 6836 /// of computation, we need a way to get the current end of the transition. 6837 Instruction *getEndOfTransition() const { 6838 if (InstsToBePromoted.empty()) 6839 return Transition; 6840 return InstsToBePromoted.back(); 6841 } 6842 6843 /// Return the index of the original value in the transition. 6844 /// E.g., for "extractelement <2 x i32> c, i32 1" the original value, 6845 /// c, is at index 0. 6846 unsigned getTransitionOriginalValueIdx() const { 6847 assert(isa<ExtractElementInst>(Transition) && 6848 "Other kind of transitions are not supported yet"); 6849 return 0; 6850 } 6851 6852 /// Return the index of the index in the transition. 6853 /// E.g., for "extractelement <2 x i32> c, i32 0" the index 6854 /// is at index 1. 6855 unsigned getTransitionIdx() const { 6856 assert(isa<ExtractElementInst>(Transition) && 6857 "Other kind of transitions are not supported yet"); 6858 return 1; 6859 } 6860 6861 /// Get the type of the transition. 6862 /// This is the type of the original value. 6863 /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the 6864 /// transition is <2 x i32>. 6865 Type *getTransitionType() const { 6866 return Transition->getOperand(getTransitionOriginalValueIdx())->getType(); 6867 } 6868 6869 /// Promote \p ToBePromoted by moving \p Def downward through. 6870 /// I.e., we have the following sequence: 6871 /// Def = Transition <ty1> a to <ty2> 6872 /// b = ToBePromoted <ty2> Def, ... 6873 /// => 6874 /// b = ToBePromoted <ty1> a, ... 6875 /// Def = Transition <ty1> ToBePromoted to <ty2> 6876 void promoteImpl(Instruction *ToBePromoted); 6877 6878 /// Check whether or not it is profitable to promote all the 6879 /// instructions enqueued to be promoted. 6880 bool isProfitableToPromote() { 6881 Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx()); 6882 unsigned Index = isa<ConstantInt>(ValIdx) 6883 ? cast<ConstantInt>(ValIdx)->getZExtValue() 6884 : -1; 6885 Type *PromotedType = getTransitionType(); 6886 6887 StoreInst *ST = cast<StoreInst>(CombineInst); 6888 unsigned AS = ST->getPointerAddressSpace(); 6889 unsigned Align = ST->getAlignment(); 6890 // Check if this store is supported. 6891 if (!TLI.allowsMisalignedMemoryAccesses( 6892 TLI.getValueType(DL, ST->getValueOperand()->getType()), AS, 6893 Align)) { 6894 // If this is not supported, there is no way we can combine 6895 // the extract with the store. 6896 return false; 6897 } 6898 6899 // The scalar chain of computation has to pay for the transition 6900 // scalar to vector. 6901 // The vector chain has to account for the combining cost. 6902 uint64_t ScalarCost = 6903 TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); 6904 uint64_t VectorCost = StoreExtractCombineCost; 6905 enum TargetTransformInfo::TargetCostKind CostKind = 6906 TargetTransformInfo::TCK_RecipThroughput; 6907 for (const auto &Inst : InstsToBePromoted) { 6908 // Compute the cost. 6909 // By construction, all instructions being promoted are arithmetic ones. 6910 // Moreover, one argument is a constant that can be viewed as a splat 6911 // constant. 6912 Value *Arg0 = Inst->getOperand(0); 6913 bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) || 6914 isa<ConstantFP>(Arg0); 6915 TargetTransformInfo::OperandValueKind Arg0OVK = 6916 IsArg0Constant ? 
TargetTransformInfo::OK_UniformConstantValue 6917 : TargetTransformInfo::OK_AnyValue; 6918 TargetTransformInfo::OperandValueKind Arg1OVK = 6919 !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue 6920 : TargetTransformInfo::OK_AnyValue; 6921 ScalarCost += TTI.getArithmeticInstrCost( 6922 Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK); 6923 VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, 6924 CostKind, 6925 Arg0OVK, Arg1OVK); 6926 } 6927 LLVM_DEBUG( 6928 dbgs() << "Estimated cost of computation to be promoted:\nScalar: " 6929 << ScalarCost << "\nVector: " << VectorCost << '\n'); 6930 return ScalarCost > VectorCost; 6931 } 6932 6933 /// Generate a constant vector with \p Val with the same 6934 /// number of elements as the transition. 6935 /// \p UseSplat defines whether or not \p Val should be replicated 6936 /// across the whole vector. 6937 /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>, 6938 /// otherwise we generate a vector with as many undef as possible: 6939 /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only 6940 /// used at the index of the extract. 6941 Value *getConstantVector(Constant *Val, bool UseSplat) const { 6942 unsigned ExtractIdx = std::numeric_limits<unsigned>::max(); 6943 if (!UseSplat) { 6944 // If we cannot determine where the constant must be, we have to 6945 // use a splat constant. 6946 Value *ValExtractIdx = Transition->getOperand(getTransitionIdx()); 6947 if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx)) 6948 ExtractIdx = CstVal->getSExtValue(); 6949 else 6950 UseSplat = true; 6951 } 6952 6953 ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount(); 6954 if (UseSplat) 6955 return ConstantVector::getSplat(EC, Val); 6956 6957 if (!EC.Scalable) { 6958 SmallVector<Constant *, 4> ConstVec; 6959 UndefValue *UndefVal = UndefValue::get(Val->getType()); 6960 for (unsigned Idx = 0; Idx != EC.Min; ++Idx) { 6961 if (Idx == ExtractIdx) 6962 ConstVec.push_back(Val); 6963 else 6964 ConstVec.push_back(UndefVal); 6965 } 6966 return ConstantVector::get(ConstVec); 6967 } else 6968 llvm_unreachable( 6969 "Generate scalable vector for non-splat is unimplemented"); 6970 } 6971 6972 /// Check if promoting to a vector type an operand at \p OperandIdx 6973 /// in \p Use can trigger undefined behavior. 6974 static bool canCauseUndefinedBehavior(const Instruction *Use, 6975 unsigned OperandIdx) { 6976 // This is not safe to introduce undef when the operand is on 6977 // the right hand side of a division-like instruction. 6978 if (OperandIdx != 1) 6979 return false; 6980 switch (Use->getOpcode()) { 6981 default: 6982 return false; 6983 case Instruction::SDiv: 6984 case Instruction::UDiv: 6985 case Instruction::SRem: 6986 case Instruction::URem: 6987 return true; 6988 case Instruction::FDiv: 6989 case Instruction::FRem: 6990 return !Use->hasNoNaNs(); 6991 } 6992 llvm_unreachable(nullptr); 6993 } 6994 6995 public: 6996 VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI, 6997 const TargetTransformInfo &TTI, Instruction *Transition, 6998 unsigned CombineCost) 6999 : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition), 7000 StoreExtractCombineCost(CombineCost) { 7001 assert(Transition && "Do not know how to promote null"); 7002 } 7003 7004 /// Check if we can promote \p ToBePromoted to \p Type. 7005 bool canPromote(const Instruction *ToBePromoted) const { 7006 // We could support CastInst too. 
7007 return isa<BinaryOperator>(ToBePromoted); 7008 } 7009 7010 /// Check if it is profitable to promote \p ToBePromoted 7011 /// by moving downward the transition through. 7012 bool shouldPromote(const Instruction *ToBePromoted) const { 7013 // Promote only if all the operands can be statically expanded. 7014 // Indeed, we do not want to introduce any new kind of transitions. 7015 for (const Use &U : ToBePromoted->operands()) { 7016 const Value *Val = U.get(); 7017 if (Val == getEndOfTransition()) { 7018 // If the use is a division and the transition is on the rhs, 7019 // we cannot promote the operation, otherwise we may create a 7020 // division by zero. 7021 if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())) 7022 return false; 7023 continue; 7024 } 7025 if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) && 7026 !isa<ConstantFP>(Val)) 7027 return false; 7028 } 7029 // Check that the resulting operation is legal. 7030 int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode()); 7031 if (!ISDOpcode) 7032 return false; 7033 return StressStoreExtract || 7034 TLI.isOperationLegalOrCustom( 7035 ISDOpcode, TLI.getValueType(DL, getTransitionType(), true)); 7036 } 7037 7038 /// Check whether or not \p Use can be combined 7039 /// with the transition. 7040 /// I.e., is it possible to do Use(Transition) => AnotherUse? 7041 bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); } 7042 7043 /// Record \p ToBePromoted as part of the chain to be promoted. 7044 void enqueueForPromotion(Instruction *ToBePromoted) { 7045 InstsToBePromoted.push_back(ToBePromoted); 7046 } 7047 7048 /// Set the instruction that will be combined with the transition. 7049 void recordCombineInstruction(Instruction *ToBeCombined) { 7050 assert(canCombine(ToBeCombined) && "Unsupported instruction to combine"); 7051 CombineInst = ToBeCombined; 7052 } 7053 7054 /// Promote all the instructions enqueued for promotion if it is 7055 /// is profitable. 7056 /// \return True if the promotion happened, false otherwise. 7057 bool promote() { 7058 // Check if there is something to promote. 7059 // Right now, if we do not have anything to combine with, 7060 // we assume the promotion is not profitable. 7061 if (InstsToBePromoted.empty() || !CombineInst) 7062 return false; 7063 7064 // Check cost. 7065 if (!StressStoreExtract && !isProfitableToPromote()) 7066 return false; 7067 7068 // Promote. 7069 for (auto &ToBePromoted : InstsToBePromoted) 7070 promoteImpl(ToBePromoted); 7071 InstsToBePromoted.clear(); 7072 return true; 7073 } 7074 }; 7075 7076 } // end anonymous namespace 7077 7078 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { 7079 // At this point, we know that all the operands of ToBePromoted but Def 7080 // can be statically promoted. 7081 // For Def, we need to use its parameter in ToBePromoted: 7082 // b = ToBePromoted ty1 a 7083 // Def = Transition ty1 b to ty2 7084 // Move the transition down. 7085 // 1. Replace all uses of the promoted operation by the transition. 7086 // = ... b => = ... Def. 7087 assert(ToBePromoted->getType() == Transition->getType() && 7088 "The type of the result of the transition does not match " 7089 "the final type"); 7090 ToBePromoted->replaceAllUsesWith(Transition); 7091 // 2. Update the type of the uses. 7092 // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def. 7093 Type *TransitionTy = getTransitionType(); 7094 ToBePromoted->mutateType(TransitionTy); 7095 // 3. 
Update all the operands of the promoted operation with promoted 7096 // operands. 7097 // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a. 7098 for (Use &U : ToBePromoted->operands()) { 7099 Value *Val = U.get(); 7100 Value *NewVal = nullptr; 7101 if (Val == Transition) 7102 NewVal = Transition->getOperand(getTransitionOriginalValueIdx()); 7103 else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) || 7104 isa<ConstantFP>(Val)) { 7105 // Use a splat constant if it is not safe to use undef. 7106 NewVal = getConstantVector( 7107 cast<Constant>(Val), 7108 isa<UndefValue>(Val) || 7109 canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); 7110 } else 7111 llvm_unreachable("Did you modified shouldPromote and forgot to update " 7112 "this?"); 7113 ToBePromoted->setOperand(U.getOperandNo(), NewVal); 7114 } 7115 Transition->moveAfter(ToBePromoted); 7116 Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); 7117 } 7118 7119 /// Some targets can do store(extractelement) with one instruction. 7120 /// Try to push the extractelement towards the stores when the target 7121 /// has this feature and this is profitable. 7122 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) { 7123 unsigned CombineCost = std::numeric_limits<unsigned>::max(); 7124 if (DisableStoreExtract || 7125 (!StressStoreExtract && 7126 !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), 7127 Inst->getOperand(1), CombineCost))) 7128 return false; 7129 7130 // At this point we know that Inst is a vector to scalar transition. 7131 // Try to move it down the def-use chain, until: 7132 // - We can combine the transition with its single use 7133 // => we got rid of the transition. 7134 // - We escape the current basic block 7135 // => we would need to check that we are moving it at a cheaper place and 7136 // we do not do that for now. 7137 BasicBlock *Parent = Inst->getParent(); 7138 LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n'); 7139 VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost); 7140 // If the transition has more than one use, assume this is not going to be 7141 // beneficial. 7142 while (Inst->hasOneUse()) { 7143 Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin()); 7144 LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n'); 7145 7146 if (ToBePromoted->getParent() != Parent) { 7147 LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block (" 7148 << ToBePromoted->getParent()->getName() 7149 << ") than the transition (" << Parent->getName() 7150 << ").\n"); 7151 return false; 7152 } 7153 7154 if (VPH.canCombine(ToBePromoted)) { 7155 LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n' 7156 << "will be combined with: " << *ToBePromoted << '\n'); 7157 VPH.recordCombineInstruction(ToBePromoted); 7158 bool Changed = VPH.promote(); 7159 NumStoreExtractExposed += Changed; 7160 return Changed; 7161 } 7162 7163 LLVM_DEBUG(dbgs() << "Try promoting.\n"); 7164 if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted)) 7165 return false; 7166 7167 LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n"); 7168 7169 VPH.enqueueForPromotion(ToBePromoted); 7170 Inst = ToBePromoted; 7171 } 7172 return false; 7173 } 7174 7175 /// For the instruction sequence of store below, F and I values 7176 /// are bundled together as an i64 value before being stored into memory. 
7177 /// Sometimes it is more efficient to generate separate stores for F and I, 7178 /// which can remove the bitwise instructions or sink them to colder places. 7179 /// 7180 /// (store (or (zext (bitcast F to i32) to i64), 7181 /// (shl (zext I to i64), 32)), addr) --> 7182 /// (store F, addr) and (store I, addr+4) 7183 /// 7184 /// Similarly, splitting for other merged store can also be beneficial, like: 7185 /// For pair of {i32, i32}, i64 store --> two i32 stores. 7186 /// For pair of {i32, i16}, i64 store --> two i32 stores. 7187 /// For pair of {i16, i16}, i32 store --> two i16 stores. 7188 /// For pair of {i16, i8}, i32 store --> two i16 stores. 7189 /// For pair of {i8, i8}, i16 store --> two i8 stores. 7190 /// 7191 /// We allow each target to determine specifically which kind of splitting is 7192 /// supported. 7193 /// 7194 /// The store patterns are commonly seen from the simple code snippet below 7195 /// if only std::make_pair(...) is sroa transformed before inlined into hoo. 7196 /// void goo(const std::pair<int, float> &); 7197 /// hoo() { 7198 /// ... 7199 /// goo(std::make_pair(tmp, ftmp)); 7200 /// ... 7201 /// } 7202 /// 7203 /// Although we already have similar splitting in DAG Combine, we duplicate 7204 /// it in CodeGenPrepare to catch the case in which pattern is across 7205 /// multiple BBs. The logic in DAG Combine is kept to catch case generated 7206 /// during code expansion. 7207 static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, 7208 const TargetLowering &TLI) { 7209 // Handle simple but common cases only. 7210 Type *StoreType = SI.getValueOperand()->getType(); 7211 7212 // The code below assumes shifting a value by <number of bits>, 7213 // whereas scalable vectors would have to be shifted by 7214 // <2log(vscale) + number of bits> in order to store the 7215 // low/high parts. Bailing out for now. 7216 if (isa<ScalableVectorType>(StoreType)) 7217 return false; 7218 7219 if (!DL.typeSizeEqualsStoreSize(StoreType) || 7220 DL.getTypeSizeInBits(StoreType) == 0) 7221 return false; 7222 7223 unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2; 7224 Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize); 7225 if (!DL.typeSizeEqualsStoreSize(SplitStoreType)) 7226 return false; 7227 7228 // Don't split the store if it is volatile. 7229 if (SI.isVolatile()) 7230 return false; 7231 7232 // Match the following patterns: 7233 // (store (or (zext LValue to i64), 7234 // (shl (zext HValue to i64), 32)), HalfValBitSize) 7235 // or 7236 // (store (or (shl (zext HValue to i64), 32)), HalfValBitSize) 7237 // (zext LValue to i64), 7238 // Expect both operands of OR and the first operand of SHL have only 7239 // one use. 7240 Value *LValue, *HValue; 7241 if (!match(SI.getValueOperand(), 7242 m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))), 7243 m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))), 7244 m_SpecificInt(HalfValBitSize)))))) 7245 return false; 7246 7247 // Check LValue and HValue are int with size less or equal than 32. 7248 if (!LValue->getType()->isIntegerTy() || 7249 DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize || 7250 !HValue->getType()->isIntegerTy() || 7251 DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize) 7252 return false; 7253 7254 // If LValue/HValue is a bitcast instruction, use the EVT before bitcast 7255 // as the input of target query. 7256 auto *LBC = dyn_cast<BitCastInst>(LValue); 7257 auto *HBC = dyn_cast<BitCastInst>(HValue); 7258 EVT LowTy = LBC ? 
EVT::getEVT(LBC->getOperand(0)->getType()) 7259 : EVT::getEVT(LValue->getType()); 7260 EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType()) 7261 : EVT::getEVT(HValue->getType()); 7262 if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy)) 7263 return false; 7264 7265 // Start to split store. 7266 IRBuilder<> Builder(SI.getContext()); 7267 Builder.SetInsertPoint(&SI); 7268 7269 // If LValue/HValue is a bitcast in another BB, create a new one in current 7270 // BB so it may be merged with the splitted stores by dag combiner. 7271 if (LBC && LBC->getParent() != SI.getParent()) 7272 LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType()); 7273 if (HBC && HBC->getParent() != SI.getParent()) 7274 HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType()); 7275 7276 bool IsLE = SI.getModule()->getDataLayout().isLittleEndian(); 7277 auto CreateSplitStore = [&](Value *V, bool Upper) { 7278 V = Builder.CreateZExtOrBitCast(V, SplitStoreType); 7279 Value *Addr = Builder.CreateBitCast( 7280 SI.getOperand(1), 7281 SplitStoreType->getPointerTo(SI.getPointerAddressSpace())); 7282 Align Alignment = SI.getAlign(); 7283 const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper); 7284 if (IsOffsetStore) { 7285 Addr = Builder.CreateGEP( 7286 SplitStoreType, Addr, 7287 ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1)); 7288 7289 // When splitting the store in half, naturally one half will retain the 7290 // alignment of the original wider store, regardless of whether it was 7291 // over-aligned or not, while the other will require adjustment. 7292 Alignment = commonAlignment(Alignment, HalfValBitSize / 8); 7293 } 7294 Builder.CreateAlignedStore(V, Addr, Alignment); 7295 }; 7296 7297 CreateSplitStore(LValue, false); 7298 CreateSplitStore(HValue, true); 7299 7300 // Delete the old store. 7301 SI.eraseFromParent(); 7302 return true; 7303 } 7304 7305 // Return true if the GEP has two operands, the first operand is of a sequential 7306 // type, and the second operand is a constant. 7307 static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) { 7308 gep_type_iterator I = gep_type_begin(*GEP); 7309 return GEP->getNumOperands() == 2 && 7310 I.isSequential() && 7311 isa<ConstantInt>(GEP->getOperand(1)); 7312 } 7313 7314 // Try unmerging GEPs to reduce liveness interference (register pressure) across 7315 // IndirectBr edges. Since IndirectBr edges tend to touch on many blocks, 7316 // reducing liveness interference across those edges benefits global register 7317 // allocation. Currently handles only certain cases. 7318 // 7319 // For example, unmerge %GEPI and %UGEPI as below. 7320 // 7321 // ---------- BEFORE ---------- 7322 // SrcBlock: 7323 // ... 7324 // %GEPIOp = ... 7325 // ... 7326 // %GEPI = gep %GEPIOp, Idx 7327 // ... 7328 // indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ] 7329 // (* %GEPI is alive on the indirectbr edges due to other uses ahead) 7330 // (* %GEPIOp is alive on the indirectbr edges only because of it's used by 7331 // %UGEPI) 7332 // 7333 // DstB0: ... (there may be a gep similar to %UGEPI to be unmerged) 7334 // DstB1: ... (there may be a gep similar to %UGEPI to be unmerged) 7335 // ... 7336 // 7337 // DstBi: 7338 // ... 7339 // %UGEPI = gep %GEPIOp, UIdx 7340 // ... 7341 // --------------------------- 7342 // 7343 // ---------- AFTER ---------- 7344 // SrcBlock: 7345 // ... 
(same as above)
7346 // (* %GEPI is still alive on the indirectbr edges)
7347 // (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
7348 // unmerging)
7349 // ...
7350 //
7351 // DstBi:
7352 // ...
7353 // %UGEPI = gep %GEPI, (UIdx-Idx)
7354 // ...
7355 // ---------------------------
7356 //
7357 // The register pressure on the IndirectBr edges is reduced because %GEPIOp is
7358 // no longer alive on them.
7359 //
7360 // We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
7361 // of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
7362 // not to disable further simplifications and optimizations as a result of GEP
7363 // merging.
7364 //
7365 // Note this unmerging may increase the length of the data-flow critical path
7366 // (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
7367 // between the register pressure and the length of the data-flow critical
7368 // path. Restricting this to the uncommon IndirectBr case would minimize the
7369 // impact of a potentially longer critical path, if any, and the impact on
7370 // compile time.
7371 static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
7372 const TargetTransformInfo *TTI) {
7373 BasicBlock *SrcBlock = GEPI->getParent();
7374 // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
7375 // (non-IndirectBr) cases exit early here.
7376 if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
7377 return false;
7378 // Check that GEPI is a simple gep with a single constant index.
7379 if (!GEPSequentialConstIndexed(GEPI))
7380 return false;
7381 ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
7382 // Check that GEPI is a cheap one.
7383 if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
7384 TargetTransformInfo::TCK_SizeAndLatency)
7385 > TargetTransformInfo::TCC_Basic)
7386 return false;
7387 Value *GEPIOp = GEPI->getOperand(0);
7388 // Check that GEPIOp is an instruction that's also defined in SrcBlock.
7389 if (!isa<Instruction>(GEPIOp))
7390 return false;
7391 auto *GEPIOpI = cast<Instruction>(GEPIOp);
7392 if (GEPIOpI->getParent() != SrcBlock)
7393 return false;
7394 // Check that GEPI is used outside the block, meaning it's alive on the
7395 // IndirectBr edge(s).
7396 if (find_if(GEPI->users(), [&](User *Usr) {
7397 if (auto *I = dyn_cast<Instruction>(Usr)) {
7398 if (I->getParent() != SrcBlock) {
7399 return true;
7400 }
7401 }
7402 return false;
7403 }) == GEPI->users().end())
7404 return false;
7405 // The second elements of the GEP chains to be unmerged.
7406 std::vector<GetElementPtrInst *> UGEPIs;
7407 // Check each user of GEPIOp to see if unmerging would make GEPIOp not alive
7408 // on IndirectBr edges.
7409 for (User *Usr : GEPIOp->users()) {
7410 if (Usr == GEPI) continue;
7411 // Check if Usr is an Instruction. If not, give up.
7412 if (!isa<Instruction>(Usr))
7413 return false;
7414 auto *UI = cast<Instruction>(Usr);
7415 // If Usr is in the same block as GEPIOp, that is fine; skip it.
7416 if (UI->getParent() == SrcBlock)
7417 continue;
7418 // Check if Usr is a GEP. If not, give up.
7419 if (!isa<GetElementPtrInst>(Usr))
7420 return false;
7421 auto *UGEPI = cast<GetElementPtrInst>(Usr);
7422 // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
7423 // the pointer operand to it. If so, record it in the vector. If not, give
7424 // up.
7425 if (!GEPSequentialConstIndexed(UGEPI))
7426 return false;
7427 if (UGEPI->getOperand(0) != GEPIOp)
7428 return false;
7429 if (GEPIIdx->getType() !=
7430 cast<ConstantInt>(UGEPI->getOperand(1))->getType())
7431 return false;
7432 ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7433 if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
7434 TargetTransformInfo::TCK_SizeAndLatency)
7435 > TargetTransformInfo::TCC_Basic)
7436 return false;
7437 UGEPIs.push_back(UGEPI);
7438 }
7439 if (UGEPIs.size() == 0)
7440 return false;
7441 // Check the materializing cost of (UIdx-Idx).
7442 for (GetElementPtrInst *UGEPI : UGEPIs) {
7443 ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7444 APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
7445 unsigned ImmCost =
7446 TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
7447 TargetTransformInfo::TCK_SizeAndLatency);
7448 if (ImmCost > TargetTransformInfo::TCC_Basic)
7449 return false;
7450 }
7451 // Now unmerge between GEPI and UGEPIs.
7452 for (GetElementPtrInst *UGEPI : UGEPIs) {
7453 UGEPI->setOperand(0, GEPI);
7454 ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7455 Constant *NewUGEPIIdx =
7456 ConstantInt::get(GEPIIdx->getType(),
7457 UGEPIIdx->getValue() - GEPIIdx->getValue());
7458 UGEPI->setOperand(1, NewUGEPIIdx);
7459 // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
7460 // inbounds to avoid UB.
7461 if (!GEPI->isInBounds()) {
7462 UGEPI->setIsInBounds(false);
7463 }
7464 }
7465 // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
7466 // alive on IndirectBr edges).
7467 assert(find_if(GEPIOp->users(), [&](User *Usr) {
7468 return cast<Instruction>(Usr)->getParent() != SrcBlock;
7469 }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
7470 return true;
7471 }
7472
7473 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
7474 // Bail out if we inserted the instruction to prevent optimizations from
7475 // stepping on each other's toes.
7476 if (InsertedInsts.count(I))
7477 return false;
7478
7479 // TODO: Move into the switch on opcode below here.
7480 if (PHINode *P = dyn_cast<PHINode>(I)) {
7481 // It is possible for very late stage optimizations (such as SimplifyCFG)
7482 // to introduce PHI nodes too late to be cleaned up. If we detect such a
7483 // trivial PHI, go ahead and zap it here.
7484 if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
7485 LargeOffsetGEPMap.erase(P);
7486 P->replaceAllUsesWith(V);
7487 P->eraseFromParent();
7488 ++NumPHIsElim;
7489 return true;
7490 }
7491 return false;
7492 }
7493
7494 if (CastInst *CI = dyn_cast<CastInst>(I)) {
7495 // If the source of the cast is a constant, then this should have
7496 // already been constant folded. The only reason NOT to constant fold
7497 // it is if something (e.g. LSR) was careful to place the constant
7498 // evaluation in a block other than the one that uses it (e.g. to hoist
7499 // the address of globals out of a loop). If this is the case, we don't
7500 // want to forward-subst the cast.
    if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
      LargeOffsetGEPMap.erase(P);
      P->replaceAllUsesWith(V);
      P->eraseFromParent();
      ++NumPHIsElim;
      return true;
    }
    return false;
  }

  if (CastInst *CI = dyn_cast<CastInst>(I)) {
    // If the source of the cast is a constant, then this should have
    // already been constant folded. The only reason NOT to constant fold
    // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
    // the address of globals out of a loop). If this is the case, we don't
    // want to forward-subst the cast.
    if (isa<Constant>(CI->getOperand(0)))
      return false;

    if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
      return true;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
      // Sink a zext or sext into its user blocks if the target type doesn't
      // fit in one register.
      if (TLI->getTypeAction(CI->getContext(),
                             TLI->getValueType(*DL, CI->getType())) ==
          TargetLowering::TypeExpandInteger) {
        return SinkCast(CI);
      } else {
        bool MadeChange = optimizeExt(I);
        return MadeChange | optimizeExtUses(I);
      }
    }
    return false;
  }

  if (auto *Cmp = dyn_cast<CmpInst>(I))
    if (optimizeCmp(Cmp, ModifiedDT))
      return true;

  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    bool Modified = optimizeLoadExt(LI);
    unsigned AS = LI->getPointerAddressSpace();
    Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
    return Modified;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
    if (splitMergedValStore(*SI, *DL, *TLI))
      return true;
    SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    unsigned AS = SI->getPointerAddressSpace();
    return optimizeMemoryInst(I, SI->getOperand(1),
                              SI->getOperand(0)->getType(), AS);
  }

  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
    unsigned AS = RMW->getPointerAddressSpace();
    return optimizeMemoryInst(I, RMW->getPointerOperand(), RMW->getType(), AS);
  }

  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
    unsigned AS = CmpX->getPointerAddressSpace();
    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
                              CmpX->getCompareOperand()->getType(), AS);
  }

  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

  if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);

  // TODO: Move this into the switch on opcode - it handles shifts already.
  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                BinOp->getOpcode() == Instruction::LShr)) {
    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (CI && TLI->hasExtractBitsInsn())
      if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
        return true;
  }

  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
    if (GEPI->hasAllZeroIndices()) {
      // The GEP operand must be a pointer, so must its result -> BitCast.
      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
                                        GEPI->getName(), GEPI);
      NC->setDebugLoc(GEPI->getDebugLoc());
      GEPI->replaceAllUsesWith(NC);
      GEPI->eraseFromParent();
      ++NumGEPsElim;
      optimizeInst(NC, ModifiedDT);
      return true;
    }
    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
      return true;
    }
    return false;
  }

  if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
    // freeze(icmp a, const) -> icmp (freeze a), const
    // This helps generate efficient conditional jumps.
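    // For example (illustrative IR, names are not from this file):
    //   %c = icmp eq i32 %a, 0
    //   %f = freeze i1 %c
    //   br i1 %f, ...
    // becomes
    //   %fr = freeze i32 %a
    //   %c = icmp eq i32 %fr, 0
    //   br i1 %c, ...
    // so the branch condition is the compare itself rather than a frozen
    // boolean.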
    Instruction *CmpI = nullptr;
    if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
      CmpI = II;
    else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
      CmpI = F->getFastMathFlags().none() ? F : nullptr;

    if (CmpI && CmpI->hasOneUse()) {
      auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
      bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
                    isa<ConstantPointerNull>(Op0);
      bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
                    isa<ConstantPointerNull>(Op1);
      if (Const0 || Const1) {
        if (!Const0 || !Const1) {
          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
          F->takeName(FI);
          CmpI->setOperand(Const0 ? 1 : 0, F);
        }
        FI->replaceAllUsesWith(CmpI);
        FI->eraseFromParent();
        return true;
      }
    }
    return false;
  }

  if (tryToSinkFreeOperands(I))
    return true;

  switch (I->getOpcode()) {
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return optimizeShiftInst(cast<BinaryOperator>(I));
  case Instruction::Call:
    return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
  case Instruction::Select:
    return optimizeSelectInst(cast<SelectInst>(I));
  case Instruction::ShuffleVector:
    return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
  case Instruction::Switch:
    return optimizeSwitchInst(cast<SwitchInst>(I));
  case Instruction::ExtractElement:
    return optimizeExtractElementInst(cast<ExtractElementInst>(I));
  }

  return false;
}

/// Given an OR instruction, check to see if this is a bitreverse idiom. If so,
/// insert the new intrinsic and return true.
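/// For example (illustrative, not from the original source), the classic
/// byte-wide reversal sequence
///   x = ((x & 0xF0) >> 4) | ((x & 0x0F) << 4);
///   x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
///   x = ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
/// is the kind of shift-and-mask idiom that recognizeBSwapOrBitReverseIdiom
/// can collapse into a single llvm.bitreverse.i8 call.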
bool CodeGenPrepare::makeBitReverse(Instruction &I) {
  if (!I.getType()->isIntegerTy() ||
      !TLI->isOperationLegalOrCustom(ISD::BITREVERSE,
                                     TLI->getValueType(*DL, I.getType(), true)))
    return false;

  SmallVector<Instruction *, 4> Insts;
  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
    return false;
  Instruction *LastInst = Insts.back();
  I.replaceAllUsesWith(LastInst);
  RecursivelyDeleteTriviallyDeadInstructions(
      &I, TLInfo, nullptr,
      [&](Value *V) { removeAllAssertingVHReferences(V); });
  return true;
}

// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
  SunkAddrs.clear();
  bool MadeChange = false;

  CurInstIterator = BB.begin();
  while (CurInstIterator != BB.end()) {
    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
    if (ModifiedDT)
      return true;
  }

  bool MadeBitReverse = true;
  while (MadeBitReverse) {
    MadeBitReverse = false;
    for (auto &I : reverse(BB)) {
      if (makeBitReverse(I)) {
        MadeBitReverse = MadeChange = true;
        break;
      }
    }
  }
  MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);

  return MadeChange;
}

// Some CGP optimizations may move or alter what's computed in a block. Check
// whether a dbg.value intrinsic could be pointed at a more appropriate operand.
bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
  assert(isa<DbgValueInst>(I));
  DbgValueInst &DVI = *cast<DbgValueInst>(I);

  // Does this dbg.value refer to a sunk address calculation?
  Value *Location = DVI.getVariableLocation();
  WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
  Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
  if (SunkAddr) {
    // Point dbg.value at locally computed address, which should give the best
    // opportunity to be accurately lowered. This update may change the type of
    // pointer being referred to; however this makes no difference to debugging
    // information, and we can't generate bitcasts that may affect codegen.
    DVI.setOperand(0, MetadataAsValue::get(DVI.getContext(),
                                           ValueAsMetadata::get(SunkAddr)));
    return true;
  }
  return false;
}

// An llvm.dbg.value may be using a value before its definition, due to
// optimizations in this pass and others. Scan for such dbg.values, and rescue
// them by moving the dbg.value to immediately after the value definition.
// FIXME: Ideally this should never be necessary, and this has the potential
// to re-order dbg.value intrinsics.
bool CodeGenPrepare::placeDbgValues(Function &F) {
  bool MadeChange = false;
  DominatorTree DT(F);

  for (BasicBlock &BB : F) {
    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
      Instruction *Insn = &*BI++;
      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
      if (!DVI)
        continue;

      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());

      if (!VI || VI->isTerminator())
        continue;

      // If VI is a phi in a block with an EHPad terminator, we can't insert
      // after it.
      if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
        continue;

      // If the defining instruction dominates the dbg.value, we do not need
      // to move the dbg.value.
      if (DT.dominates(VI, DVI))
        continue;

      LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
                        << *DVI << ' ' << *VI);
      DVI->removeFromParent();
      if (isa<PHINode>(VI))
        DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
      else
        DVI->insertAfter(VI);
      MadeChange = true;
      ++NumDbgValueMoved;
    }
  }
  return MadeChange;
}

/// Scale down both weights to fit into uint32_t.
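/// For example (illustrative numbers, not from the original source): with
/// NewTrue = 6,000,000,000 and NewFalse = 2,000,000,000, NewMax / UINT32_MAX
/// is 1, so Scale is 2 and the weights become 3,000,000,000 and 1,000,000,000,
/// both of which fit in uint32_t while preserving the 3:1 ratio.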
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
  NewTrue = NewTrue / Scale;
  NewFalse = NewFalse / Scale;
}

/// Some targets prefer to split a conditional branch like:
/// \code
///   %0 = icmp ne i32 %a, 0
///   %1 = icmp ne i32 %b, 0
///   %or.cond = or i1 %0, %1
///   br i1 %or.cond, label %TrueBB, label %FalseBB
/// \endcode
/// into multiple branch instructions like:
/// \code
///   bb1:
///     %0 = icmp ne i32 %a, 0
///     br i1 %0, label %TrueBB, label %bb2
///   bb2:
///     %1 = icmp ne i32 %b, 0
///     br i1 %1, label %TrueBB, label %FalseBB
/// \endcode
/// This usually allows instruction selection to do even further optimizations
/// and combine the compare with the branch instruction. Currently this is
/// applied for targets which have "cheap" jump instructions.
///
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
  if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
    return false;

  bool MadeChange = false;
  for (auto &BB : F) {
    // Does this BB end with the following?
    //   %cond1 = icmp|fcmp|binary instruction ...
    //   %cond2 = icmp|fcmp|binary instruction ...
    //   %cond.or = or|and i1 %cond1, %cond2
    //   br i1 %cond.or, label %dest1, label %dest2
    BinaryOperator *LogicOp;
    BasicBlock *TBB, *FBB;
    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
      continue;

    auto *Br1 = cast<BranchInst>(BB.getTerminator());
    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
      continue;

    // The merging of mostly empty BBs can cause a degenerate branch.
    if (TBB == FBB)
      continue;

    unsigned Opc;
    Value *Cond1, *Cond2;
    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
                             m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::And;
    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
                                 m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::Or;
    else
      continue;

    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
      continue;

    LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

    // Create a new BB.
    auto *TmpBB =
        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
                           BB.getParent(), BB.getNextNode());

    // Update the original basic block by using the first condition directly in
    // the branch instruction and removing the no longer needed and/or
    // instruction.
    Br1->setCondition(Cond1);
    LogicOp->eraseFromParent();

    // Depending on the condition we have to either replace the true or the
    // false successor of the original branch instruction.
    if (Opc == Instruction::And)
      Br1->setSuccessor(0, TmpBB);
    else
      Br1->setSuccessor(1, TmpBB);

    // Fill in the new basic block.
    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
    if (auto *I = dyn_cast<Instruction>(Cond2)) {
      I->removeFromParent();
      I->insertBefore(Br2);
    }

    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch now comes from
    // the newly created BB (TmpBB). In the other successor we need to add one
    // incoming edge to the PHI nodes, because both branch instructions now
    // target the same successor. Depending on the original branch condition
    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
    // we perform the correct update for the PHI nodes.
    // This doesn't change the successor order of the just created branch
    // instruction (or any other instruction).
    if (Opc == Instruction::Or)
      std::swap(TBB, FBB);

    // Replace the old BB with the new BB.
    TBB->replacePhiUsesWith(&BB, TmpBB);

    // Add another incoming edge from the new BB.
    for (PHINode &PN : FBB->phis()) {
      auto *Val = PN.getIncomingValueForBlock(&BB);
      PN.addIncoming(Val, TmpBB);
    }

    // Update the branch weights (from SelectionDAGBuilder::
    // FindMergedConditions).
    if (Opc == Instruction::Or) {
      // Codegen X | Y as:
      // BB1:
      //   jmp_if_X TBB
      //   jmp TmpBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
      //     = TrueProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This
      // choice assumes that
      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
      // Another choice is to assume TrueProb for BB1 equals TrueProb for
      // TmpBB, but the math is more complicated.
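      // Illustrative example (not from the original source): if the original
      // weights are A = 1 and B = 3, the original TrueProb is 1/4. BB1 gets
      // weights (1, 1 + 2*3) = (1, 7), i.e. TrueProb 1/8, and TmpBB gets
      // weights (1, 2*3) = (1, 6), i.e. TrueProb 1/7. Combined:
      //   1/8 + (7/8) * (1/7) = 1/4,
      // which matches the original TrueProb.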
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = TrueWeight;
        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        NewTrueWeight = TrueWeight;
        NewFalseWeight = 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    } else {
      // Codegen X & Y as:
      // BB1:
      //   jmp_if_X TmpBB
      //   jmp FBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //
      // This requires creation of TmpBB after CurBB.

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
      //     = FalseProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This
      // choice assumes that
      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
        uint64_t NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        NewTrueWeight = 2 * TrueWeight;
        NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    }

    ModifiedDT = true;
    MadeChange = true;

    LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
               TmpBB->dump());
  }
  return MadeChange;
}