//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach. It should eventually be removed.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "codegenprepare"

STATISTIC(NumBlocksElim, "Number of blocks eliminated");
STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
                      "sunken Cmps");
STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
                       "of sunken Casts");
STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
                          "computations were sunk");
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumAndsAdded,
          "Number of and mask instructions added to form ext loads");
STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");

STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
STATISTIC(NumMemCmpGreaterThanMax,
          "Number of memcmp calls with size greater than max size");
STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");

static cl::opt<bool> DisableBranchOpts(
    "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
    cl::desc("Disable branch optimizations in CodeGenPrepare"));

static cl::opt<bool>
    DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
                  cl::desc("Disable GC optimizations in CodeGenPrepare"));

static cl::opt<bool> DisableSelectToBranch(
    "disable-cgp-select2branch", cl::Hidden, cl::init(false),
    cl::desc("Disable select to branch conversion."));

static cl::opt<bool> AddrSinkUsingGEPs(
    "addr-sink-using-gep", cl::Hidden, cl::init(true),
    cl::desc("Address sinking in CGP using GEPs."));

static cl::opt<bool> EnableAndCmpSinking(
    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
    cl::desc("Enable sinking and/cmp into branches."));

static cl::opt<bool> DisableStoreExtract(
    "disable-cgp-store-extract", cl::Hidden, cl::init(false),
    cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));

static cl::opt<bool> StressStoreExtract(
    "stress-cgp-store-extract", cl::Hidden, cl::init(false),
    cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));

static cl::opt<bool> DisableExtLdPromotion(
    "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
    cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
             "CodeGenPrepare"));

static cl::opt<bool> StressExtLdPromotion(
    "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
             "optimization in CodeGenPrepare"));
static cl::opt<bool> DisablePreheaderProtect(
    "disable-preheader-prot", cl::Hidden, cl::init(false),
    cl::desc("Disable protection against removing loop preheaders"));

static cl::opt<bool> ProfileGuidedSectionPrefix(
    "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
    cl::desc("Use profile info to add section prefix for hot/cold functions"));

static cl::opt<unsigned> FreqRatioToSkipMerge(
    "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
    cl::desc("Skip merging empty blocks if (frequency of empty block) / "
             "(frequency of destination block) is greater than this ratio"));

static cl::opt<bool> ForceSplitStore(
    "force-split-store", cl::Hidden, cl::init(false),
    cl::desc("Force store splitting no matter what the target query says."));

static cl::opt<bool> EnableTypePromotionMerge(
    "cgp-type-promotion-merge", cl::Hidden,
    cl::desc("Enable merging of redundant sexts when one is dominating"
             " the other."),
    cl::init(true));

static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
    cl::desc("The number of loads per basic block for inline expansion of "
             "memcmp that is only being compared against zero."));

namespace {

using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
using TypeIsSExt = PointerIntPair<Type *, 1, bool>;
using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
using SExts = SmallVector<Instruction *, 16>;
using ValueToSExts = DenseMap<Value *, SExts>;

class TypePromotionTransaction;

class CodeGenPrepare : public FunctionPass {
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *SubtargetInfo;
  const TargetLowering *TLI = nullptr;
  const TargetRegisterInfo *TRI;
  const TargetTransformInfo *TTI = nullptr;
  const TargetLibraryInfo *TLInfo;
  const LoopInfo *LI;
  std::unique_ptr<BlockFrequencyInfo> BFI;
  std::unique_ptr<BranchProbabilityInfo> BPI;

  /// As we scan instructions optimizing them, this is the next instruction
  /// to optimize. Transforms that can invalidate this should update it.
  BasicBlock::iterator CurInstIterator;

  /// Keeps track of non-local addresses that have been sunk into a block.
  /// This allows us to avoid inserting duplicate code for blocks with
  /// multiple load/stores of the same address.
  ValueMap<Value*, Value*> SunkAddrs;

  /// Keeps track of all instructions inserted for the current function.
  SetOfInstrs InsertedInsts;

  /// Keeps track of the types of the related instructions before their
  /// promotion for the current function.
  InstrToOrigTy PromotedInsts;

  /// Keep track of instructions removed during promotion.
  SetOfInstrs RemovedInsts;

  /// Keep track of sext chains based on their initial value.
  DenseMap<Value *, Instruction *> SeenChainsForSExt;

  /// Keep track of promoted SExts.
  ValueToSExts ValToSExtendedUses;

  /// True if the CFG is modified in any way.
  bool ModifiedDT;

  /// True if optimizing for size.
  bool OptSize;

  /// DataLayout for the Function being processed.
  const DataLayout *DL = nullptr;

public:
  static char ID; // Pass identification, replacement for typeid

  CodeGenPrepare() : FunctionPass(ID) {
    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "CodeGen Prepare"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // FIXME: When we can selectively preserve passes, preserve the domtree.
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

private:
  bool eliminateFallThrough(Function &F);
  bool eliminateMostlyEmptyBlocks(Function &F);
  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
  void eliminateMostlyEmptyBlock(BasicBlock *BB);
  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                     bool isPreheader);
  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
  bool optimizeInst(Instruction *I, bool &ModifiedDT);
  bool optimizeMemoryInst(Instruction *I, Value *Addr,
                          Type *AccessTy, unsigned AS);
  bool optimizeInlineAsmInst(CallInst *CS);
  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
  bool optimizeExt(Instruction *&I);
  bool optimizeExtUses(Instruction *I);
  bool optimizeLoadExt(LoadInst *I);
  bool optimizeSelectInst(SelectInst *SI);
  bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
  bool optimizeSwitchInst(SwitchInst *CI);
  bool optimizeExtractElementInst(Instruction *Inst);
  bool dupRetToEnableTailCallOpts(BasicBlock *BB);
  bool placeDbgValues(Function &F);
  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
  bool tryToPromoteExts(TypePromotionTransaction &TPT,
                        const SmallVectorImpl<Instruction *> &Exts,
                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
                        unsigned CreatedInstsCost = 0);
  bool mergeSExts(Function &F);
  bool performAddressTypePromotion(
      Instruction *&Inst,
      bool AllowPromotionWithoutCommonHeader,
      bool HasPromoted, TypePromotionTransaction &TPT,
      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
  bool splitBranchCondition(Function &F);
  bool simplifyOffsetableRelocate(Instruction &I);
  bool splitIndirectCriticalEdges(Function &F);
};

} // end anonymous namespace

char CodeGenPrepare::ID = 0;

INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
                      "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
                    "Optimize for code generation", false, false)

FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
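// Illustrative context only: this pass is normally scheduled by the target's
// TargetPassConfig when the codegen pipeline is built at -O1 and above,
// roughly along the lines of
//   if (getOptLevel() != CodeGenOpt::None)
//     addPass(createCodeGenPreparePass());
// The exact wiring lives in the TargetPassConfig code, not here.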
bool CodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  DL = &F.getParent()->getDataLayout();

  bool EverMadeChange = false;
  // Clear per function information.
  InsertedInsts.clear();
  PromotedInsts.clear();
  BFI.reset();
  BPI.reset();

  ModifiedDT = false;
  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    TM = &TPC->getTM<TargetMachine>();
    SubtargetInfo = TM->getSubtargetImpl(F);
    TLI = SubtargetInfo->getTargetLowering();
    TRI = SubtargetInfo->getRegisterInfo();
  }
  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  OptSize = F.optForSize();

  if (ProfileGuidedSectionPrefix) {
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    if (PSI->isFunctionHotInCallGraph(&F))
      F.setSectionPrefix(".hot");
    else if (PSI->isFunctionColdInCallGraph(&F))
      F.setSectionPrefix(".unlikely");
  }

  /// This optimization identifies DIV instructions that can be
  /// profitably bypassed and carried out with a shorter, faster divide.
  if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
    const DenseMap<unsigned int, unsigned int> &BypassWidths =
        TLI->getBypassSlowDivWidths();
    BasicBlock *BB = &*F.begin();
    while (BB != nullptr) {
      // bypassSlowDivision may create new BBs, but we don't want to reapply
      // the optimization to those blocks.
      BasicBlock *Next = BB->getNextNode();
      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
      BB = Next;
    }
  }

  // Eliminate blocks that contain only PHI nodes and an
  // unconditional branch.
  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

  // If llvm.dbg.value is far away from the value, then ISel may not be able
  // to handle it properly. ISel will drop llvm.dbg.value if it cannot find a
  // node corresponding to the value.
  EverMadeChange |= placeDbgValues(F);

  if (!DisableBranchOpts)
    EverMadeChange |= splitBranchCondition(F);

  // Split some critical edges where one of the sources is an indirect branch,
  // to help generate sane code for PHIs involving such edges.
  EverMadeChange |= splitIndirectCriticalEdges(F);

  bool MadeChange = true;
  while (MadeChange) {
    MadeChange = false;
    SeenChainsForSExt.clear();
    ValToSExtendedUses.clear();
    RemovedInsts.clear();
    for (Function::iterator I = F.begin(); I != F.end(); ) {
      BasicBlock *BB = &*I++;
      bool ModifiedDTOnIteration = false;
      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);

      // Restart BB iteration if the dominator tree of the Function was changed.
      if (ModifiedDTOnIteration)
        break;
    }
    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
      MadeChange |= mergeSExts(F);

    // Really free removed instructions during promotion.
    for (Instruction *I : RemovedInsts)
      I->deleteValue();

    EverMadeChange |= MadeChange;
  }

  SunkAddrs.clear();

  if (!DisableBranchOpts) {
    MadeChange = false;
    SmallPtrSet<BasicBlock*, 8> WorkList;
    for (BasicBlock &BB : F) {
      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
      MadeChange |= ConstantFoldTerminator(&BB, true);
      if (!MadeChange) continue;

      for (SmallVectorImpl<BasicBlock*>::iterator
             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
        if (pred_begin(*II) == pred_end(*II))
          WorkList.insert(*II);
    }

    // Delete the dead blocks and any of their dead successors.
    MadeChange |= !WorkList.empty();
    while (!WorkList.empty()) {
      BasicBlock *BB = *WorkList.begin();
      WorkList.erase(BB);
      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));

      DeleteDeadBlock(BB);

      for (SmallVectorImpl<BasicBlock*>::iterator
             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
        if (pred_begin(*II) == pred_end(*II))
          WorkList.insert(*II);
    }

    // Merge pairs of basic blocks with unconditional branches, connected by
    // a single edge.
    if (EverMadeChange || MadeChange)
      MadeChange |= eliminateFallThrough(F);

    EverMadeChange |= MadeChange;
  }

  if (!DisableGCOpts) {
    SmallVector<Instruction *, 2> Statepoints;
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (isStatepoint(I))
          Statepoints.push_back(&I);
    for (auto &I : Statepoints)
      EverMadeChange |= simplifyOffsetableRelocate(*I);
  }

  return EverMadeChange;
}

/// Merge basic blocks which are connected by a single edge, where one of the
/// basic blocks has a single successor pointing to the other basic block,
/// which has a single predecessor.
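///
/// For illustration (hypothetical labels), a CFG fragment such as
///
///   a:
///     ...
///     br label %b
///   b:                                       ; preds = %a
///     ...
///
/// is collapsed into a single block, since %a unconditionally branches to %b
/// and %b has no other predecessors.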
bool CodeGenPrepare::eliminateFallThrough(Function &F) {
  bool Changed = false;
  // Scan all of the blocks in the function, except for the entry block.
  for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
    BasicBlock *BB = &*I++;
    // If the destination block has a single pred, then this is a trivial
    // edge, just collapse it.
    BasicBlock *SinglePred = BB->getSinglePredecessor();

    // Don't merge if BB's address is taken.
    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;

    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
    if (Term && !Term->isConditional()) {
      Changed = true;
      DEBUG(dbgs() << "To merge:\n" << *SinglePred << "\n\n\n");
      // Remember if SinglePred was the entry block of the function.
      // If so, we will need to move BB back to the entry position.
      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
      MergeBasicBlockIntoOnlyPred(BB, nullptr);

      if (isEntry && BB != &BB->getParent()->getEntryBlock())
        BB->moveBefore(&BB->getParent()->getEntryBlock());

      // We have erased a block. Update the iterator.
      I = BB->getIterator();
    }
  }
  return Changed;
}

/// Find a destination block from BB if BB is a mergeable empty block.
BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
  // If this block doesn't end with an uncond branch, ignore it.
  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
  if (!BI || !BI->isUnconditional())
    return nullptr;

  // If the instruction before the branch (skipping debug info) isn't a phi
  // node, then other stuff is happening here.
  BasicBlock::iterator BBI = BI->getIterator();
  if (BBI != BB->begin()) {
    --BBI;
    while (isa<DbgInfoIntrinsic>(BBI)) {
      if (BBI == BB->begin())
        break;
      --BBI;
    }
    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
      return nullptr;
  }

  // Do not break infinite loops.
  BasicBlock *DestBB = BI->getSuccessor(0);
  if (DestBB == BB)
    return nullptr;

  if (!canMergeBlocks(BB, DestBB))
    DestBB = nullptr;

  return DestBB;
}

// Return the unique indirectbr predecessor of a block. This may return null
// even if such a predecessor exists, if it's not useful for splitting.
// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
// predecessors of BB.
static BasicBlock *
findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
  // If the block doesn't have any PHIs, we don't care about it, since there's
  // no point in splitting it.
  PHINode *PN = dyn_cast<PHINode>(BB->begin());
  if (!PN)
    return nullptr;

  // Verify we have exactly one IBR predecessor.
  // Conservatively bail out if one of the other predecessors is not a "regular"
  // terminator (that is, not a switch or a br).
  BasicBlock *IBB = nullptr;
  for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
    BasicBlock *PredBB = PN->getIncomingBlock(Pred);
    TerminatorInst *PredTerm = PredBB->getTerminator();
    switch (PredTerm->getOpcode()) {
    case Instruction::IndirectBr:
      if (IBB)
        return nullptr;
      IBB = PredBB;
      break;
    case Instruction::Br:
    case Instruction::Switch:
      OtherPreds.push_back(PredBB);
      continue;
    default:
      return nullptr;
    }
  }

  return IBB;
}

// Split critical edges where the source of the edge is an indirectbr
// instruction. This isn't always possible, but we can handle some easy cases.
// This is useful because MI is unable to split such critical edges,
// which means it will not be able to sink instructions along those edges.
// This is especially painful for indirect branches with many successors, where
// we end up having to prepare all outgoing values in the origin block.
//
// Our normal algorithm for splitting critical edges requires us to update
// the outgoing edges of the edge origin block, but for an indirectbr this
// is hard, since it would require finding and updating the block addresses
// the indirect branch uses. But if a block only has a single indirectbr
// predecessor, with the others being regular branches, we can do it in a
// different way.
// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
// We can split D into D0 and D1, where D0 contains only the PHIs from D,
// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
// create the following structure:
// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
  // Check whether the function has any indirectbrs, and collect which blocks
  // they may jump to. Since most functions don't have indirect branches,
  // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
  SmallSetVector<BasicBlock *, 16> Targets;
  for (auto &BB : F) {
    auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
    if (!IBI)
      continue;

    for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
      Targets.insert(IBI->getSuccessor(Succ));
  }

  if (Targets.empty())
    return false;

  bool Changed = false;
  for (BasicBlock *Target : Targets) {
    SmallVector<BasicBlock *, 16> OtherPreds;
    BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
    // If we did not find an indirectbr, or the indirectbr is the only
    // incoming edge, this isn't the kind of edge we're looking for.
    if (!IBRPred || OtherPreds.empty())
      continue;

    // Don't even think about ehpads/landingpads.
    Instruction *FirstNonPHI = Target->getFirstNonPHI();
    if (FirstNonPHI->isEHPad() || Target->isLandingPad())
      continue;

    BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
    // It's possible Target was its own successor through an indirectbr.
    // In this case, the indirectbr now comes from BodyBlock.
    if (IBRPred == Target)
      IBRPred = BodyBlock;

    // At this point Target only has PHIs, and BodyBlock has the rest of the
    // block's body. Create a copy of Target that will be used by the "direct"
    // preds.
    ValueToValueMapTy VMap;
    BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);

    for (BasicBlock *Pred : OtherPreds) {
      // If the target is a loop to itself, then the terminator of the split
      // block needs to be updated.
      if (Pred == Target)
        BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
      else
        Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
    }

    // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
    // they are clones, so the number of PHIs is the same.
    // (a) Remove the edge coming from IBRPred from the "Direct" PHI
    // (b) Leave that as the only edge in the "Indirect" PHI.
    // (c) Merge the two in the body block.
    BasicBlock::iterator Indirect = Target->begin(),
                         End = Target->getFirstNonPHI()->getIterator();
    BasicBlock::iterator Direct = DirectSucc->begin();
    BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();

    assert(&*End == Target->getTerminator() &&
           "Block was expected to only contain PHIs");

    while (Indirect != End) {
      PHINode *DirPHI = cast<PHINode>(Direct);
      PHINode *IndPHI = cast<PHINode>(Indirect);

      // Now, clean up - the direct block shouldn't get the indirect value,
      // and vice versa.
      DirPHI->removeIncomingValue(IBRPred);
      Direct++;

      // Advance the pointer here, to avoid invalidation issues when the old
      // PHI is erased.
      Indirect++;

      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
      NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
                             IBRPred);

      // Create a PHI in the body block, to merge the direct and indirect
      // predecessors.
      PHINode *MergePHI =
          PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
      MergePHI->addIncoming(NewIndPHI, Target);
      MergePHI->addIncoming(DirPHI, DirectSucc);

      IndPHI->replaceAllUsesWith(MergePHI);
      IndPHI->eraseFromParent();
    }

    Changed = true;
  }

  return Changed;
}

/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
/// edges in ways that are non-optimal for isel. Start by eliminating these
/// blocks so we can split them the way we want them.
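///
/// For illustration (hypothetical IR), a block of the form
///
///   empty:                                   ; preds = %a, %b
///     %v = phi i32 [ 0, %a ], [ 1, %b ]
///     br label %dest
///
/// is a candidate: it carries only PHIs plus an unconditional branch, so it
/// can usually be folded into %dest.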
bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
  SmallPtrSet<BasicBlock *, 16> Preheaders;
  SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
  while (!LoopList.empty()) {
    Loop *L = LoopList.pop_back_val();
    LoopList.insert(LoopList.end(), L->begin(), L->end());
    if (BasicBlock *Preheader = L->getLoopPreheader())
      Preheaders.insert(Preheader);
  }

  bool MadeChange = false;
  // Note that this intentionally skips the entry block.
  for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
    BasicBlock *BB = &*I++;
    BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
    if (!DestBB ||
        !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
      continue;

    eliminateMostlyEmptyBlock(BB);
    MadeChange = true;
  }
  return MadeChange;
}

bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
                                                   BasicBlock *DestBB,
                                                   bool isPreheader) {
  // Do not delete loop preheaders if doing so would create a critical edge.
  // Loop preheaders can be good locations to spill registers. If the
  // preheader is deleted and we create a critical edge, registers may be
  // spilled in the loop body instead.
  if (!DisablePreheaderProtect && isPreheader &&
      !(BB->getSinglePredecessor() &&
        BB->getSinglePredecessor()->getSingleSuccessor()))
    return false;

  // Try to skip merging if the unique predecessor of BB is terminated by a
  // switch or indirect branch instruction, and BB is used as an incoming block
  // of PHIs in DestBB. In such a case, merging BB and DestBB would cause ISel
  // to add COPY instructions in the predecessor of BB instead of BB (if it is
  // not merged). Note that the critical edge created by merging such blocks
  // won't be split in MachineSink because the jump table is not analyzable. By
  // keeping such an empty block (BB), ISel will place COPY instructions in BB,
  // not in the predecessor of BB.
  BasicBlock *Pred = BB->getUniquePredecessor();
  if (!Pred ||
      !(isa<SwitchInst>(Pred->getTerminator()) ||
        isa<IndirectBrInst>(Pred->getTerminator())))
    return true;

  if (BB->getTerminator() != BB->getFirstNonPHI())
    return true;

  // We use a simple cost heuristic which determines that skipping the merge is
  // profitable if the cost of skipping merging is less than the cost of
  // merging: Cost(skipping merging) < Cost(merging BB), where
  // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
  // Cost(merging BB) is Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch), we can simplify this to:
  //   Freq(Pred) / Freq(BB) > 2.
  // Note that if there are multiple empty blocks sharing the same incoming
  // value for the PHIs in DestBB, we consider them together. In that case,
  // Cost(merging BB) will be the sum of their frequencies.
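  //
  // Worked example with made-up numbers and the default ratio of 2: if
  // Freq(Pred) = 300 and Freq(BB) = 100, then 300 > 2 * 100 and we skip the
  // merge; if Freq(BB) were 200 instead, 300 <= 2 * 200 and we would merge.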
  if (!isa<PHINode>(DestBB->begin()))
    return true;

  SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;

  // Find all other incoming blocks from which incoming values of all PHIs in
  // DestBB are the same as the ones from BB.
  for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
       ++PI) {
    BasicBlock *DestBBPred = *PI;
    if (DestBBPred == BB)
      continue;

    bool HasAllSameValue = true;
    BasicBlock::const_iterator DestBBI = DestBB->begin();
    while (const PHINode *DestPN = dyn_cast<PHINode>(DestBBI++)) {
      if (DestPN->getIncomingValueForBlock(BB) !=
          DestPN->getIncomingValueForBlock(DestBBPred)) {
        HasAllSameValue = false;
        break;
      }
    }
    if (HasAllSameValue)
      SameIncomingValueBBs.insert(DestBBPred);
  }

  // See if all of BB's incoming values are the same as the value from Pred.
  // In this case, there is no reason to skip merging because COPYs are
  // expected to be placed in Pred already.
  if (SameIncomingValueBBs.count(Pred))
    return true;

  if (!BFI) {
    Function &F = *BB->getParent();
    LoopInfo LI{DominatorTree(F)};
    BPI.reset(new BranchProbabilityInfo(F, LI));
    BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
  }

  BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
  BlockFrequency BBFreq = BFI->getBlockFreq(BB);

  for (auto SameValueBB : SameIncomingValueBBs)
    if (SameValueBB->getUniquePredecessor() == Pred &&
        DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
      BBFreq += BFI->getBlockFreq(SameValueBB);

  return PredFreq.getFrequency() <=
         BBFreq.getFrequency() * FreqRatioToSkipMerge;
}

/// Return true if we can merge BB into DestBB if there is a single
/// unconditional branch between them, and BB contains no other non-phi
/// instructions.
bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
                                    const BasicBlock *DestBB) const {
  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
  // the successor. If there are more complex conditions (e.g. preheaders),
  // don't mess around with them.
  BasicBlock::const_iterator BBI = BB->begin();
  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
    for (const User *U : PN->users()) {
      const Instruction *UI = cast<Instruction>(U);
      if (UI->getParent() != DestBB || !isa<PHINode>(UI))
        return false;
      // If the user is inside DestBB and it is a PHINode, then check its
      // incoming values. If an incoming value is not from BB, then this is a
      // complex condition (e.g. preheaders) we want to avoid here.
      if (UI->getParent() == DestBB) {
        if (const PHINode *UPN = dyn_cast<PHINode>(UI))
          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
            if (Insn && Insn->getParent() == BB &&
                Insn->getParent() != UPN->getIncomingBlock(I))
              return false;
          }
      }
    }
  }

  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
  // and DestBB may have conflicting incoming values for the block. If so, we
  // can't merge the block.
  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
  if (!DestBBPN) return true;  // no conflict.

  // Collect the preds of BB.
  SmallPtrSet<const BasicBlock*, 16> BBPreds;
  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
    // It is faster to get preds from a PHI than with pred_iterator.
    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
      BBPreds.insert(BBPN->getIncomingBlock(i));
  } else {
    BBPreds.insert(pred_begin(BB), pred_end(BB));
  }

  // Walk the preds of DestBB.
  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
    if (BBPreds.count(Pred)) {   // Common predecessor?
      BBI = DestBB->begin();
      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
        const Value *V1 = PN->getIncomingValueForBlock(Pred);
        const Value *V2 = PN->getIncomingValueForBlock(BB);

        // If V2 is a phi node in BB, look up what the mapped value will be.
        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
          if (V2PN->getParent() == BB)
            V2 = V2PN->getIncomingValueForBlock(Pred);

        // If there is a conflict, bail out.
        if (V1 != V2) return false;
      }
    }
  }

  return true;
}

/// Eliminate a basic block that has only PHIs and an unconditional branch in
/// it.
void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
  BasicBlock *DestBB = BI->getSuccessor(0);

  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);

  // If the destination block has a single pred, then this is a trivial edge,
  // just collapse it.
  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
    if (SinglePred != DestBB) {
      // Remember if SinglePred was the entry block of the function. If so, we
      // will need to move BB back to the entry position.
      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
      MergeBasicBlockIntoOnlyPred(DestBB, nullptr);

      if (isEntry && BB != &BB->getParent()->getEntryBlock())
        BB->moveBefore(&BB->getParent()->getEntryBlock());

      DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
      return;
    }
  }

  // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
  // to handle the new incoming edges it is about to have.
  PHINode *PN;
  for (BasicBlock::iterator BBI = DestBB->begin();
       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
    // Remove the incoming value for BB, and remember it.
    Value *InVal = PN->removeIncomingValue(BB, false);

    // Two options: either the InVal is a phi node defined in BB or it is some
    // value that dominates BB.
    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
    if (InValPhi && InValPhi->getParent() == BB) {
      // Add all of the input values of the input PHI as inputs of this phi.
      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
        PN->addIncoming(InValPhi->getIncomingValue(i),
                        InValPhi->getIncomingBlock(i));
    } else {
      // Otherwise, add one instance of the dominating value for each edge that
      // we will be adding.
      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
      } else {
        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
          PN->addIncoming(InVal, *PI);
      }
    }
  }

  // The PHIs are now updated, change everything that refers to BB to use
  // DestBB and remove BB.
  BB->replaceAllUsesWith(DestBB);
  BB->eraseFromParent();
  ++NumBlocksElim;

  DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
}

// Computes a map of base pointer relocation instructions to corresponding
// derived pointer relocation instructions given a vector of all relocate calls
static void computeBaseDerivedRelocateMap(
    const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
    DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
        &RelocateInstMap) {
  // Collect information in two maps: one primarily for locating the base object
  // while filling the second map; the second map is the final structure holding
  // a mapping between Base and corresponding Derived relocate calls
  DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
  for (auto *ThisRelocate : AllRelocateCalls) {
    auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
                            ThisRelocate->getDerivedPtrIndex());
    RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
  }
  for (auto &Item : RelocateIdxMap) {
    std::pair<unsigned, unsigned> Key = Item.first;
    if (Key.first == Key.second)
      // Base relocation: nothing to insert
      continue;

    GCRelocateInst *I = Item.second;
    auto BaseKey = std::make_pair(Key.first, Key.first);

    // We're iterating over RelocateIdxMap so we cannot modify it.
    auto MaybeBase = RelocateIdxMap.find(BaseKey);
    if (MaybeBase == RelocateIdxMap.end())
      // TODO: We might want to insert a new base object relocate and gep off
      // that, if there are enough derived object relocates.
      continue;

    RelocateInstMap[MaybeBase->second].push_back(I);
  }
}

// Accepts a GEP and extracts the operands into a vector provided they're all
// small integer constants.
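// For example (illustrative IR), "getelementptr [4 x i32], [4 x i32]* %p,
// i64 0, i64 2" is accepted since every index is a ConstantInt no larger than
// 20, whereas a non-constant index or a constant such as 100 makes the helper
// bail out.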
static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
                                          SmallVectorImpl<Value *> &OffsetV) {
  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
    // Only accept small constant integer operands
    auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!Op || Op->getZExtValue() > 20)
      return false;
  }

  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
    OffsetV.push_back(GEP->getOperand(i));
  return true;
}

// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and applies it.
static bool
simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
  bool MadeChange = false;
  // We must ensure that the relocation of the derived pointer is defined after
  // the relocation of the base pointer. If we find a relocation corresponding
  // to this base that is defined earlier than the relocation of the base, we
  // move the relocation of the base right before the found relocation. We
  // consider only relocations in the same basic block as the relocation of the
  // base; relocations from other basic blocks are skipped by this optimization
  // and we do not care about them.
  for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
       &*R != RelocatedBase; ++R)
    if (auto RI = dyn_cast<GCRelocateInst>(R))
      if (RI->getStatepoint() == RelocatedBase->getStatepoint())
        if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
          RelocatedBase->moveBefore(RI);
          break;
        }

  for (GCRelocateInst *ToReplace : Targets) {
    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
           "Not relocating a derived object of the original base object");
    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
      // A duplicate relocate call. TODO: coalesce duplicates.
      continue;
    }

    if (RelocatedBase->getParent() != ToReplace->getParent()) {
      // Base and derived relocates are in different basic blocks.
      // In this case transform is only valid when base dominates derived
      // relocate. However it would be too expensive to check dominance
      // for each such relocate, so we skip the whole transformation.
      continue;
    }

    Value *Base = ToReplace->getBasePtr();
    auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
    if (!Derived || Derived->getPointerOperand() != Base)
      continue;

    SmallVector<Value *, 2> OffsetV;
    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
      continue;

    // Create a Builder and replace the target callsite with a gep
    assert(RelocatedBase->getNextNode() &&
           "Should always have one since it's not a terminator");

    // Insert after RelocatedBase
    IRBuilder<> Builder(RelocatedBase->getNextNode());
    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

    // If gc_relocate does not match the actual type, cast it to the right type.
    // In theory, there must be a bitcast after gc_relocate if the type does not
    // match, and we should reuse it to get the derived pointer. But there could
    // be cases like this:
    //
    // bb1:
    //  ...
    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // bb2:
    //  ...
    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // merge:
    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
    //
    // In this case, we cannot find the bitcast anymore. So we insert a new
    // bitcast whether or not there is already one. In this way, we can handle
    // all cases, and the extra bitcast should be optimized away in later
    // passes.
    Value *ActualRelocatedBase = RelocatedBase;
    if (RelocatedBase->getType() != Base->getType()) {
      ActualRelocatedBase =
          Builder.CreateBitCast(RelocatedBase, Base->getType());
    }
    Value *Replacement = Builder.CreateGEP(
        Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
    Replacement->takeName(ToReplace);
    // If the newly generated derived pointer's type does not match the original
    // derived pointer's type, cast the new derived pointer to match it. Same
    // reasoning as above.
    Value *ActualReplacement = Replacement;
    if (Replacement->getType() != ToReplace->getType()) {
      ActualReplacement =
          Builder.CreateBitCast(Replacement, ToReplace->getType());
    }
    ToReplace->replaceAllUsesWith(ActualReplacement);
    ToReplace->eraseFromParent();

    MadeChange = true;
  }
  return MadeChange;
}

// Turns this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = relocate(%tok, i32 4, i32 4)
// %ptr' = relocate(%tok, i32 4, i32 5)
// %val = load %ptr'
//
// into this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = gc.relocate(%tok, i32 4, i32 4)
// %ptr' = gep %base' + 15
// %val = load %ptr'
bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
  bool MadeChange = false;
  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;

  for (auto *U : I.users())
    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
      // Collect all the relocate calls associated with a statepoint
      AllRelocateCalls.push_back(Relocate);

  // We need at least one base pointer relocation + one derived pointer
  // relocation to mangle
  if (AllRelocateCalls.size() < 2)
    return false;

  // RelocateInstMap is a mapping from the base relocate instruction to the
  // corresponding derived relocate instructions
  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
  if (RelocateInstMap.empty())
    return false;

  for (auto &Item : RelocateInstMap)
    // Item.first is the RelocatedBase to offset against
    // Item.second is the vector of Targets to replace
    MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
  return MadeChange;
}

/// SinkCast - Sink the specified cast instruction into its user blocks.
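///
/// For illustration (hypothetical IR), given
///
///   bb0:
///     %c = bitcast i8* %p to i32*
///     br label %bb1
///   bb1:
///     %v = load i32, i32* %c
///
/// a copy of the bitcast is placed in %bb1 next to the load, and the original
/// cast is erased once it has no remaining uses.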
static bool SinkCast(CastInst *CI) {
  BasicBlock *DefBB = CI->getParent();

  /// InsertedCasts - Only insert a cast in each block once.
  DenseMap<BasicBlock*, CastInst*> InsertedCasts;

  bool MadeChange = false;
  for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);

    // Figure out which BB this cast is used in. For PHI's this is the
    // appropriate predecessor block.
    BasicBlock *UserBB = User->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(User)) {
      UserBB = PN->getIncomingBlock(TheUse);
    }

    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    // The first insertion point of a block containing an EH pad is after the
    // pad. If the pad is the user, we cannot sink the cast past the pad.
    if (User->isEHPad())
      continue;

    // If the block selected to receive the cast is an EH pad that does not
    // allow non-PHI instructions before the terminator, we can't sink the
    // cast.
    if (UserBB->getTerminator()->isEHPad())
      continue;

    // If this user is in the same block as the cast, don't change the cast.
    if (UserBB == DefBB) continue;

    // If we have already inserted a cast into this block, use it.
    CastInst *&InsertedCast = InsertedCasts[UserBB];

    if (!InsertedCast) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
                                      CI->getType(), "", &*InsertPt);
    }

    // Replace a use of the cast with a use of the new cast.
    TheUse = InsertedCast;
    MadeChange = true;
    ++NumCastUses;
  }

  // If we removed all uses, nuke the cast.
  if (CI->use_empty()) {
    CI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}

/// If the specified cast instruction is a noop copy (e.g. it's casting from
/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
/// reduce the number of virtual registers that must be created and coalesced.
///
/// Return true if any changes are made.
static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
                                       const DataLayout &DL) {
  // Sink only "cheap" (or nop) address-space casts. This is a weaker condition
  // than sinking only nop casts, but is helpful on some platforms.
  if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
    if (!TLI.isCheapAddrSpaceCast(ASC->getSrcAddressSpace(),
                                  ASC->getDestAddressSpace()))
      return false;
  }

  // If this is a noop copy,
  EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
  EVT DstVT = TLI.getValueType(DL, CI->getType());

  // Is this an fp<->int conversion?
  if (SrcVT.isInteger() != DstVT.isInteger())
    return false;

  // If this is an extension, it will be a zero or sign extension, which
  // isn't a noop.
  if (SrcVT.bitsLT(DstVT)) return false;

  // If these values will be promoted, find out what they will be promoted
  // to. This helps us consider truncates on PPC as noop copies when they
  // are.
  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
      TargetLowering::TypePromoteInteger)
    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
      TargetLowering::TypePromoteInteger)
    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);

  // If, after promotion, these are the same types, this is a noop copy.
  if (SrcVT != DstVT)
    return false;

  return SinkCast(CI);
}

/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
/// possible.
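///
/// For illustration (hypothetical IR), a pattern such as
///
///   %add = add i32 %a, %b
///   %cmp = icmp ult i32 %add, %a        ; unsigned overflow check
///
/// is rewritten into
///
///   %uadd.overflow = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
///   %uadd = extractvalue { i32, i1 } %uadd.overflow, 0
///   %overflow = extractvalue { i32, i1 } %uadd.overflow, 1
///
/// with the compare replaced by %overflow and the add by %uadd.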
///
/// Return true if any changes were made.
static bool CombineUAddWithOverflow(CmpInst *CI) {
  Value *A, *B;
  Instruction *AddI;
  if (!match(CI,
             m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
    return false;

  Type *Ty = AddI->getType();
  if (!isa<IntegerType>(Ty))
    return false;

  // We don't want to move around uses of condition values this late, so we
  // check if it is legal to create the call to the intrinsic in the basic
  // block containing the icmp:

  if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
    return false;

#ifndef NDEBUG
  // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
  // for now:
  if (AddI->hasOneUse())
    assert(*AddI->user_begin() == CI && "expected!");
#endif

  Module *M = CI->getModule();
  Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);

  auto *InsertPt = AddI->hasOneUse() ? CI : AddI;

  auto *UAddWithOverflow =
      CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
  auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
  auto *Overflow =
      ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);

  CI->replaceAllUsesWith(Overflow);
  AddI->replaceAllUsesWith(UAdd);
  CI->eraseFromParent();
  AddI->eraseFromParent();
  return true;
}

/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
/// lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
static bool SinkCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
  BasicBlock *DefBB = CI->getParent();

  // Avoid sinking soft-FP comparisons, since this can move them into a loop.
  if (TLI && TLI->useSoftFloat() && isa<FCmpInst>(CI))
    return false;

  // Only insert a cmp in each block once.
  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;

  bool MadeChange = false;
  for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);

    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    // Don't bother for PHI nodes.
    if (isa<PHINode>(User))
      continue;

    // Figure out which BB this cmp is used in.
    BasicBlock *UserBB = User->getParent();

    // If this user is in the same block as the cmp, don't change the cmp.
    if (UserBB == DefBB) continue;

    // If we have already inserted a cmp into this block, use it.
    CmpInst *&InsertedCmp = InsertedCmps[UserBB];

    if (!InsertedCmp) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedCmp =
          CmpInst::Create(CI->getOpcode(), CI->getPredicate(),
                          CI->getOperand(0), CI->getOperand(1), "", &*InsertPt);
      // Propagate the debug info.
      InsertedCmp->setDebugLoc(CI->getDebugLoc());
    }

    // Replace a use of the cmp with a use of the new cmp.
    TheUse = InsertedCmp;
    MadeChange = true;
    ++NumCmpUses;
  }

  // If we removed all uses, nuke the cmp.
  if (CI->use_empty()) {
    CI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}

static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
  if (SinkCmpExpression(CI, TLI))
    return true;

  if (CombineUAddWithOverflow(CI))
    return true;

  return false;
}

/// Duplicate and sink the given 'and' instruction into user blocks where it is
/// used in a compare to allow isel to generate better code for targets where
/// this operation can be combined.
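///
/// For illustration (hypothetical IR), an 'and' whose only uses are
/// zero-compares in other blocks, e.g.
///
///   bb0:
///     %m = and i64 %x, 255
///     br label %bb1
///   bb1:
///     %c = icmp eq i64 %m, 0
///
/// gets a duplicate of the 'and' placed next to each such compare so the
/// backend can fold the pair (for example into a test-like instruction).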
///
/// Return true if any changes are made.
static bool sinkAndCmp0Expression(Instruction *AndI,
                                  const TargetLowering &TLI,
                                  SetOfInstrs &InsertedInsts) {
  // Double-check that we're not trying to optimize an instruction that was
  // already optimized by some other part of this pass.
  assert(!InsertedInsts.count(AndI) &&
         "Attempting to optimize already optimized and instruction");
  (void) InsertedInsts;

  // Nothing to do for single use in same basic block.
  if (AndI->hasOneUse() &&
      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
    return false;

  // Try to avoid cases where sinking/duplicating is likely to increase register
  // pressure.
  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
      !isa<ConstantInt>(AndI->getOperand(1)) &&
      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
    return false;

  for (auto *U : AndI->users()) {
    Instruction *User = cast<Instruction>(U);

    // Only sink for and mask feeding icmp with 0.
    if (!isa<ICmpInst>(User))
      return false;

    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
    if (!CmpC || !CmpC->isZero())
      return false;
  }

  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
    return false;

  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
  DEBUG(AndI->getParent()->dump());

  // Push the 'and' into the same block as the icmp 0. There should only be
  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
  // others, so we don't need to keep track of which BBs we insert into.
  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);

    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");

    // Keep the 'and' in the same place if the use is already in the same block.
    Instruction *InsertPt =
        User->getParent() == AndI->getParent() ? AndI : User;
    Instruction *InsertedAnd =
        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
                               AndI->getOperand(1), "", InsertPt);
    // Propagate the debug info.
    InsertedAnd->setDebugLoc(AndI->getDebugLoc());

    // Replace a use of the 'and' with a use of the new 'and'.
    TheUse = InsertedAnd;
    ++NumAndUses;
    DEBUG(User->getParent()->dump());
  }

  // We removed all uses, nuke the and.
  AndI->eraseFromParent();
  return true;
}

/// Check if the candidates could be combined with a shift instruction, which
/// includes:
/// 1. Truncate instruction
/// 2. And instruction and the imm is a mask of the low bits:
///    imm & (imm+1) == 0
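///
/// For example (illustrative constants), 0x00ff is accepted as a mask since
/// 0x00ff & 0x0100 == 0, whereas 0x00fe is rejected because
/// 0x00fe & 0x00ff != 0.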
static bool isExtractBitsCandidateUse(Instruction *User) {
  if (!isa<TruncInst>(User)) {
    if (User->getOpcode() != Instruction::And ||
        !isa<ConstantInt>(User->getOperand(1)))
      return false;

    const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();

    if ((Cimm & (Cimm + 1)).getBoolValue())
      return false;
  }
  return true;
}

/// Sink both the shift and truncate instructions to the block of the
/// truncate's user.
static bool
SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
                     DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
                     const TargetLowering &TLI, const DataLayout &DL) {
  BasicBlock *UserBB = User->getParent();
  DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
  TruncInst *TruncI = dyn_cast<TruncInst>(User);
  bool MadeChange = false;

  for (Value::user_iterator TruncUI = TruncI->user_begin(),
                            TruncE = TruncI->user_end();
       TruncUI != TruncE;) {

    Use &TruncTheUse = TruncUI.getUse();
    Instruction *TruncUser = cast<Instruction>(*TruncUI);
    // Preincrement use iterator so we don't invalidate it.
    ++TruncUI;

    int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
    if (!ISDOpcode)
      continue;

    // If the use is actually a legal node, there will not be an
    // implicit truncate.
    // FIXME: always querying the result type is just an
    // approximation; some nodes' legality is determined by the
    // operand or other means. There's no good way to find out though.
    if (TLI.isOperationLegalOrCustom(
            ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
      continue;

    // Don't bother for PHI nodes.
    if (isa<PHINode>(TruncUser))
      continue;

    BasicBlock *TruncUserBB = TruncUser->getParent();

    if (UserBB == TruncUserBB)
      continue;

    BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
    CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];

    if (!InsertedShift && !InsertedTrunc) {
      BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
      assert(InsertPt != TruncUserBB->end());
      // Sink the shift
      if (ShiftI->getOpcode() == Instruction::AShr)
        InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);
      else
        InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);

      // Sink the trunc
      BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
      TruncInsertPt++;
      assert(TruncInsertPt != TruncUserBB->end());

      InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
                                       TruncI->getType(), "", &*TruncInsertPt);

      MadeChange = true;

      TruncTheUse = InsertedTrunc;
    }
  }
  return MadeChange;
}

/// Sink the shift *right* instruction into user blocks if the uses could
/// potentially be combined with this shift instruction to generate a
/// BitExtract instruction. It will only be applied if the architecture
/// supports the BitExtract instruction. Here is an example:
Here is an example:
1528 /// BB1:
1529 /// %x.extract.shift = lshr i64 %arg1, 32
1530 /// BB2:
1531 /// %x.extract.trunc = trunc i64 %x.extract.shift to i16
1532 /// ==>
1533 ///
1534 /// BB2:
1535 /// %x.extract.shift.1 = lshr i64 %arg1, 32
1536 /// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1537 ///
1538 /// CodeGen will recognize the pattern in BB2 and generate a BitExtract
1539 /// instruction.
1540 /// Return true if any changes are made.
1541 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1542 const TargetLowering &TLI,
1543 const DataLayout &DL) {
1544 BasicBlock *DefBB = ShiftI->getParent();
1545
1546 /// Only insert instructions in each block once.
1547 DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1548
1549 bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1550
1551 bool MadeChange = false;
1552 for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1553 UI != E;) {
1554 Use &TheUse = UI.getUse();
1555 Instruction *User = cast<Instruction>(*UI);
1556 // Preincrement use iterator so we don't invalidate it.
1557 ++UI;
1558
1559 // Don't bother for PHI nodes.
1560 if (isa<PHINode>(User))
1561 continue;
1562
1563 if (!isExtractBitsCandidateUse(User))
1564 continue;
1565
1566 BasicBlock *UserBB = User->getParent();
1567
1568 if (UserBB == DefBB) {
1569 // If the shift and truncate instructions are in the same BB, the use of
1570 // the truncate (TruncUse) may still introduce another truncate if not
1571 // legal. In this case, we would like to sink both the shift and the
1572 // truncate instruction to the BB of TruncUse.
1573 // For example:
1574 // BB1:
1575 // i64 shift.result = lshr i64 opnd, imm
1576 // trunc.result = trunc shift.result to i16
1577 //
1578 // BB2:
1579 // ----> We will have an implicit truncate here if the architecture does
1580 // not have i16 compare.
1581 // cmp i16 trunc.result, opnd2
1582 //
1583 if (isa<TruncInst>(User) && shiftIsLegal
1584 // If the type of the truncate is legal, no truncate will be
1585 // introduced in other basic blocks.
1586 &&
1587 (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
1588 MadeChange =
1589 SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1590
1591 continue;
1592 }
1593 // If we have already inserted a shift into this block, use it.
1594 BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1595
1596 if (!InsertedShift) {
1597 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1598 assert(InsertPt != UserBB->end());
1599
1600 if (ShiftI->getOpcode() == Instruction::AShr)
1601 InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1602 "", &*InsertPt);
1603 else
1604 InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1605 "", &*InsertPt);
1606
1607 MadeChange = true;
1608 }
1609
1610 // Replace a use of the shift with a use of the new shift.
1611 TheUse = InsertedShift;
1612 }
1613
1614 // If we removed all uses, nuke the shift.
1615 if (ShiftI->use_empty())
1616 ShiftI->eraseFromParent();
1617
1618 return MadeChange;
1619 }
1620
1621 /// If counting leading or trailing zeros is an expensive operation and a zero
1622 /// input is defined, add a check for zero to avoid calling the intrinsic.
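/// Whether the count is cheap to speculate is reported by the target through
/// TargetLowering::isCheapToSpeculateCttz() / isCheapToSpeculateCtlz(), which
/// is queried below.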
1623 /// 1624 /// We want to transform: 1625 /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) 1626 /// 1627 /// into: 1628 /// entry: 1629 /// %cmpz = icmp eq i64 %A, 0 1630 /// br i1 %cmpz, label %cond.end, label %cond.false 1631 /// cond.false: 1632 /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) 1633 /// br label %cond.end 1634 /// cond.end: 1635 /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] 1636 /// 1637 /// If the transform is performed, return true and set ModifiedDT to true. 1638 static bool despeculateCountZeros(IntrinsicInst *CountZeros, 1639 const TargetLowering *TLI, 1640 const DataLayout *DL, 1641 bool &ModifiedDT) { 1642 if (!TLI || !DL) 1643 return false; 1644 1645 // If a zero input is undefined, it doesn't make sense to despeculate that. 1646 if (match(CountZeros->getOperand(1), m_One())) 1647 return false; 1648 1649 // If it's cheap to speculate, there's nothing to do. 1650 auto IntrinsicID = CountZeros->getIntrinsicID(); 1651 if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || 1652 (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) 1653 return false; 1654 1655 // Only handle legal scalar cases. Anything else requires too much work. 1656 Type *Ty = CountZeros->getType(); 1657 unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); 1658 if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits()) 1659 return false; 1660 1661 // The intrinsic will be sunk behind a compare against zero and branch. 1662 BasicBlock *StartBlock = CountZeros->getParent(); 1663 BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); 1664 1665 // Create another block after the count zero intrinsic. A PHI will be added 1666 // in this block to select the result of the intrinsic or the bit-width 1667 // constant if the input to the intrinsic is zero. 1668 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); 1669 BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); 1670 1671 // Set up a builder to create a compare, conditional branch, and PHI. 1672 IRBuilder<> Builder(CountZeros->getContext()); 1673 Builder.SetInsertPoint(StartBlock->getTerminator()); 1674 Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); 1675 1676 // Replace the unconditional branch that was created by the first split with 1677 // a compare against zero and a conditional branch. 1678 Value *Zero = Constant::getNullValue(Ty); 1679 Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); 1680 Builder.CreateCondBr(Cmp, EndBlock, CallBlock); 1681 StartBlock->getTerminator()->eraseFromParent(); 1682 1683 // Create a PHI in the end block to select either the output of the intrinsic 1684 // or the bit width of the operand. 1685 Builder.SetInsertPoint(&EndBlock->front()); 1686 PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); 1687 CountZeros->replaceAllUsesWith(PN); 1688 Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); 1689 PN->addIncoming(BitWidth, StartBlock); 1690 PN->addIncoming(CountZeros, CallBlock); 1691 1692 // We are explicitly handling the zero case, so we can set the intrinsic's 1693 // undefined zero argument to 'true'. This will also prevent reprocessing the 1694 // intrinsic; we only despeculate when a zero input is defined. 1695 CountZeros->setArgOperand(1, Builder.getTrue()); 1696 ModifiedDT = true; 1697 return true; 1698 } 1699 1700 namespace { 1701 1702 // This class provides helper functions to expand a memcmp library call into an 1703 // inline expansion. 
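// Typical driver usage, as a sketch (the real driver is expandMemCmp() below;
// local variable names here are illustrative):
//   MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL);
//   Value *Res = Expansion.getMemCmpExpansion(SizeVal);
//   CI->replaceAllUsesWith(Res);
//   CI->eraseFromParent();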
1704 class MemCmpExpansion { 1705 struct ResultBlock { 1706 BasicBlock *BB = nullptr; 1707 PHINode *PhiSrc1 = nullptr; 1708 PHINode *PhiSrc2 = nullptr; 1709 1710 ResultBlock() = default; 1711 }; 1712 1713 CallInst *CI; 1714 ResultBlock ResBlock; 1715 unsigned MaxLoadSize; 1716 unsigned NumBlocks; 1717 unsigned NumBlocksNonOneByte; 1718 unsigned NumLoadsPerBlock; 1719 std::vector<BasicBlock *> LoadCmpBlocks; 1720 BasicBlock *EndBlock; 1721 PHINode *PhiRes; 1722 bool IsUsedForZeroCmp; 1723 const DataLayout &DL; 1724 IRBuilder<> Builder; 1725 1726 unsigned calculateNumBlocks(unsigned Size); 1727 void createLoadCmpBlocks(); 1728 void createResultBlock(); 1729 void setupResultBlockPHINodes(); 1730 void setupEndBlockPHINodes(); 1731 void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, 1732 unsigned GEPIndex); 1733 Value *getCompareLoadPairs(unsigned Index, unsigned Size, 1734 unsigned &NumBytesProcessed); 1735 void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, 1736 unsigned &NumBytesProcessed); 1737 void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); 1738 void emitMemCmpResultBlock(); 1739 Value *getMemCmpExpansionZeroCase(unsigned Size); 1740 Value *getMemCmpEqZeroOneBlock(unsigned Size); 1741 Value *getMemCmpOneBlock(unsigned Size); 1742 unsigned getLoadSize(unsigned Size); 1743 unsigned getNumLoads(unsigned Size); 1744 1745 public: 1746 MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, 1747 unsigned NumLoadsPerBlock, const DataLayout &DL); 1748 1749 Value *getMemCmpExpansion(uint64_t Size); 1750 }; 1751 1752 } // end anonymous namespace 1753 1754 // Initialize the basic block structure required for expansion of memcmp call 1755 // with given maximum load size and memcmp size parameter. 1756 // This structure includes: 1757 // 1. A list of load compare blocks - LoadCmpBlocks. 1758 // 2. An EndBlock, split from original instruction point, which is the block to 1759 // return from. 1760 // 3. ResultBlock, block to branch to for early exit when a 1761 // LoadCmpBlock finds a difference. 1762 MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, 1763 unsigned MaxLoadSize, unsigned LoadsPerBlock, 1764 const DataLayout &TheDataLayout) 1765 : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock), 1766 DL(TheDataLayout), Builder(CI) { 1767 // A memcmp with zero-comparison with only one block of load and compare does 1768 // not need to set up any extra blocks. This case could be handled in the DAG, 1769 // but since we have all of the machinery to flexibly expand any memcpy here, 1770 // we choose to handle this case too to avoid fragmented lowering. 1771 IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); 1772 NumBlocks = calculateNumBlocks(Size); 1773 if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) { 1774 BasicBlock *StartBlock = CI->getParent(); 1775 EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); 1776 setupEndBlockPHINodes(); 1777 createResultBlock(); 1778 1779 // If return value of memcmp is not used in a zero equality, we need to 1780 // calculate which source was larger. The calculation requires the 1781 // two loaded source values of each load compare block. 1782 // These will be saved in the phi nodes created by setupResultBlockPHINodes. 1783 if (!IsUsedForZeroCmp) 1784 setupResultBlockPHINodes(); 1785 1786 // Create the number of required load compare basic blocks. 
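// For example, assuming MaxLoadSize == 8, a 16-byte memcmp whose result is
// compared for ordering needs two 8-byte load/compare blocks here, in addition
// to the res_block and endblock created above.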
1787 createLoadCmpBlocks(); 1788 1789 // Update the terminator added by splitBasicBlock to branch to the first 1790 // LoadCmpBlock. 1791 StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); 1792 } 1793 1794 Builder.SetCurrentDebugLocation(CI->getDebugLoc()); 1795 } 1796 1797 void MemCmpExpansion::createLoadCmpBlocks() { 1798 for (unsigned i = 0; i < NumBlocks; i++) { 1799 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", 1800 EndBlock->getParent(), EndBlock); 1801 LoadCmpBlocks.push_back(BB); 1802 } 1803 } 1804 1805 void MemCmpExpansion::createResultBlock() { 1806 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", 1807 EndBlock->getParent(), EndBlock); 1808 } 1809 1810 // This function creates the IR instructions for loading and comparing 1 byte. 1811 // It loads 1 byte from each source of the memcmp parameters with the given 1812 // GEPIndex. It then subtracts the two loaded values and adds this result to the 1813 // final phi node for selecting the memcmp result. 1814 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, 1815 unsigned GEPIndex) { 1816 Value *Source1 = CI->getArgOperand(0); 1817 Value *Source2 = CI->getArgOperand(1); 1818 1819 Builder.SetInsertPoint(LoadCmpBlocks[Index]); 1820 Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); 1821 // Cast source to LoadSizeType*. 1822 if (Source1->getType() != LoadSizeType) 1823 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); 1824 if (Source2->getType() != LoadSizeType) 1825 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); 1826 1827 // Get the base address using the GEPIndex. 1828 if (GEPIndex != 0) { 1829 Source1 = Builder.CreateGEP(LoadSizeType, Source1, 1830 ConstantInt::get(LoadSizeType, GEPIndex)); 1831 Source2 = Builder.CreateGEP(LoadSizeType, Source2, 1832 ConstantInt::get(LoadSizeType, GEPIndex)); 1833 } 1834 1835 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); 1836 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); 1837 1838 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); 1839 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); 1840 Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); 1841 1842 PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); 1843 1844 if (Index < (LoadCmpBlocks.size() - 1)) { 1845 // Early exit branch if difference found to EndBlock. Otherwise, continue to 1846 // next LoadCmpBlock, 1847 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, 1848 ConstantInt::get(Diff->getType(), 0)); 1849 BranchInst *CmpBr = 1850 BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); 1851 Builder.Insert(CmpBr); 1852 } else { 1853 // The last block has an unconditional branch to EndBlock. 1854 BranchInst *CmpBr = BranchInst::Create(EndBlock); 1855 Builder.Insert(CmpBr); 1856 } 1857 } 1858 1859 unsigned MemCmpExpansion::getNumLoads(unsigned Size) { 1860 return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); 1861 } 1862 1863 unsigned MemCmpExpansion::getLoadSize(unsigned Size) { 1864 return MinAlign(PowerOf2Floor(Size), MaxLoadSize); 1865 } 1866 1867 /// Generate an equality comparison for one or more pairs of loaded values. 1868 /// This is used in the case where the memcmp() call is compared equal or not 1869 /// equal to zero. 
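/// For a block with two 8-byte load pairs, the emitted IR is roughly (value
/// names are illustrative):
///   %x0  = xor i64 %lhs0, %rhs0
///   %x1  = xor i64 %lhs1, %rhs1
///   %or  = or i64 %x0, %x1
///   %cmp = icmp ne i64 %or, 0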
1870 Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, 1871 unsigned &NumBytesProcessed) { 1872 std::vector<Value *> XorList, OrList; 1873 Value *Diff; 1874 1875 unsigned RemainingBytes = Size - NumBytesProcessed; 1876 unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); 1877 unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); 1878 1879 // For a single-block expansion, start inserting before the memcmp call. 1880 if (LoadCmpBlocks.empty()) 1881 Builder.SetInsertPoint(CI); 1882 else 1883 Builder.SetInsertPoint(LoadCmpBlocks[Index]); 1884 1885 Value *Cmp = nullptr; 1886 for (unsigned i = 0; i < NumLoads; ++i) { 1887 unsigned LoadSize = getLoadSize(RemainingBytes); 1888 unsigned GEPIndex = NumBytesProcessed / LoadSize; 1889 NumBytesProcessed += LoadSize; 1890 RemainingBytes -= LoadSize; 1891 1892 Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); 1893 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); 1894 assert(LoadSize <= MaxLoadSize && "Unexpected load type"); 1895 1896 Value *Source1 = CI->getArgOperand(0); 1897 Value *Source2 = CI->getArgOperand(1); 1898 1899 // Cast source to LoadSizeType*. 1900 if (Source1->getType() != LoadSizeType) 1901 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); 1902 if (Source2->getType() != LoadSizeType) 1903 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); 1904 1905 // Get the base address using the GEPIndex. 1906 if (GEPIndex != 0) { 1907 Source1 = Builder.CreateGEP(LoadSizeType, Source1, 1908 ConstantInt::get(LoadSizeType, GEPIndex)); 1909 Source2 = Builder.CreateGEP(LoadSizeType, Source2, 1910 ConstantInt::get(LoadSizeType, GEPIndex)); 1911 } 1912 1913 // Get a constant or load a value for each source address. 1914 Value *LoadSrc1 = nullptr; 1915 if (auto *Source1C = dyn_cast<Constant>(Source1)) 1916 LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); 1917 if (!LoadSrc1) 1918 LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); 1919 1920 Value *LoadSrc2 = nullptr; 1921 if (auto *Source2C = dyn_cast<Constant>(Source2)) 1922 LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); 1923 if (!LoadSrc2) 1924 LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); 1925 1926 if (NumLoads != 1) { 1927 if (LoadSizeType != MaxLoadType) { 1928 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); 1929 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); 1930 } 1931 // If we have multiple loads per block, we need to generate a composite 1932 // comparison using xor+or. 1933 Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); 1934 Diff = Builder.CreateZExt(Diff, MaxLoadType); 1935 XorList.push_back(Diff); 1936 } else { 1937 // If there's only one load per block, we just compare the loaded values. 1938 Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); 1939 } 1940 } 1941 1942 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { 1943 std::vector<Value *> OutList; 1944 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { 1945 Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); 1946 OutList.push_back(Or); 1947 } 1948 if (InList.size() % 2 != 0) 1949 OutList.push_back(InList.back()); 1950 return OutList; 1951 }; 1952 1953 if (!Cmp) { 1954 // Pairwise OR the XOR results. 1955 OrList = pairWiseOr(XorList); 1956 1957 // Pairwise OR the OR results until one result left. 
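// For example, five xor results reduce as
//   [a, b, c, d, e] -> [a|b, c|d, e] -> [(a|b)|(c|d), e] -> one value.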
while (OrList.size() != 1) {
1959 OrList = pairWiseOr(OrList);
1960 }
1961 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
1962 }
1963
1964 return Cmp;
1965 }
1966
1967 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
1968 unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
1969 Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed);
1970
1971 BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
1972 ? EndBlock
1973 : LoadCmpBlocks[Index + 1];
1974 // Early exit branch if difference found to ResultBlock. Otherwise,
1975 // continue to next LoadCmpBlock or EndBlock.
1976 BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
1977 Builder.Insert(CmpBr);
1978
1979 // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
1980 // since early exit to ResultBlock was not taken (no difference was found in
1981 // any of the bytes).
1982 if (Index == LoadCmpBlocks.size() - 1) {
1983 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
1984 PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
1985 }
1986 }
1987
1988 // This function creates the IR instructions for loading and comparing using the
1989 // given LoadSize. It loads the number of bytes specified by LoadSize from each
1990 // source of the memcmp parameters. It then does a subtract to see if there was
1991 // a difference in the loaded values. If a difference is found, it branches
1992 // with an early exit to the ResultBlock for calculating which source was
1993 // larger. Otherwise, it falls through to either the next LoadCmpBlock or
1994 // the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
1995 // a special case through emitLoadCompareByteBlock. The special handling simply
1996 // subtracts the loaded values and adds the difference to the result phi node.
1997 void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize,
1998 unsigned GEPIndex) {
1999 if (LoadSize == 1) {
2000 MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
2001 return;
2002 }
2003
2004 Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
2005 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
2006 assert(LoadSize <= MaxLoadSize && "Unexpected load type");
2007
2008 Value *Source1 = CI->getArgOperand(0);
2009 Value *Source2 = CI->getArgOperand(1);
2010
2011 Builder.SetInsertPoint(LoadCmpBlocks[Index]);
2012 // Cast source to LoadSizeType*.
2013 if (Source1->getType() != LoadSizeType)
2014 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
2015 if (Source2->getType() != LoadSizeType)
2016 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
2017
2018 // Get the base address using the GEPIndex.
2019 if (GEPIndex != 0) {
2020 Source1 = Builder.CreateGEP(LoadSizeType, Source1,
2021 ConstantInt::get(LoadSizeType, GEPIndex));
2022 Source2 = Builder.CreateGEP(LoadSizeType, Source2,
2023 ConstantInt::get(LoadSizeType, GEPIndex));
2024 }
2025
2026 // Load LoadSizeType from the base address.
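// Note that memcmp orders the buffers byte-by-byte, i.e. as big-endian
// integers, so on little-endian targets the loaded values are byte-swapped
// below before being compared; e.g. the bytes {0x01, 0x02} load as 0x0201 on
// a little-endian target but must compare as 0x0102.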
2027 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); 2028 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); 2029 2030 if (DL.isLittleEndian()) { 2031 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), 2032 Intrinsic::bswap, LoadSizeType); 2033 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); 2034 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); 2035 } 2036 2037 if (LoadSizeType != MaxLoadType) { 2038 LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); 2039 LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); 2040 } 2041 2042 // Add the loaded values to the phi nodes for calculating memcmp result only 2043 // if result is not used in a zero equality. 2044 if (!IsUsedForZeroCmp) { 2045 ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); 2046 ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); 2047 } 2048 2049 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); 2050 BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) 2051 ? EndBlock 2052 : LoadCmpBlocks[Index + 1]; 2053 // Early exit branch if difference found to ResultBlock. Otherwise, continue 2054 // to next LoadCmpBlock or EndBlock. 2055 BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); 2056 Builder.Insert(CmpBr); 2057 2058 // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 2059 // since early exit to ResultBlock was not taken (no difference was found in 2060 // any of the bytes). 2061 if (Index == LoadCmpBlocks.size() - 1) { 2062 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); 2063 PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); 2064 } 2065 } 2066 2067 // This function populates the ResultBlock with a sequence to calculate the 2068 // memcmp result. It compares the two loaded source values and returns -1 if 2069 // src1 < src2 and 1 if src1 > src2. 2070 void MemCmpExpansion::emitMemCmpResultBlock() { 2071 // Special case: if memcmp result is used in a zero equality, result does not 2072 // need to be calculated and can simply return 1. 2073 if (IsUsedForZeroCmp) { 2074 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); 2075 Builder.SetInsertPoint(ResBlock.BB, InsertPt); 2076 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); 2077 PhiRes->addIncoming(Res, ResBlock.BB); 2078 BranchInst *NewBr = BranchInst::Create(EndBlock); 2079 Builder.Insert(NewBr); 2080 return; 2081 } 2082 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); 2083 Builder.SetInsertPoint(ResBlock.BB, InsertPt); 2084 2085 Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, 2086 ResBlock.PhiSrc2); 2087 2088 Value *Res = 2089 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), 2090 ConstantInt::get(Builder.getInt32Ty(), 1)); 2091 2092 BranchInst *NewBr = BranchInst::Create(EndBlock); 2093 Builder.Insert(NewBr); 2094 PhiRes->addIncoming(Res, ResBlock.BB); 2095 } 2096 2097 unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { 2098 unsigned NumBlocks = 0; 2099 bool HaveOneByteLoad = false; 2100 unsigned RemainingSize = Size; 2101 unsigned LoadSize = MaxLoadSize; 2102 while (RemainingSize) { 2103 if (LoadSize == 1) 2104 HaveOneByteLoad = true; 2105 NumBlocks += RemainingSize / LoadSize; 2106 RemainingSize = RemainingSize % LoadSize; 2107 LoadSize = LoadSize / 2; 2108 } 2109 NumBlocksNonOneByte = HaveOneByteLoad ? 
(NumBlocks - 1) : NumBlocks; 2110 2111 if (IsUsedForZeroCmp) 2112 NumBlocks = NumBlocks / NumLoadsPerBlock + 2113 (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); 2114 2115 return NumBlocks; 2116 } 2117 2118 void MemCmpExpansion::setupResultBlockPHINodes() { 2119 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); 2120 Builder.SetInsertPoint(ResBlock.BB); 2121 ResBlock.PhiSrc1 = 2122 Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); 2123 ResBlock.PhiSrc2 = 2124 Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); 2125 } 2126 2127 void MemCmpExpansion::setupEndBlockPHINodes() { 2128 Builder.SetInsertPoint(&EndBlock->front()); 2129 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); 2130 } 2131 2132 Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) { 2133 unsigned NumBytesProcessed = 0; 2134 // This loop populates each of the LoadCmpBlocks with the IR sequence to 2135 // handle multiple loads per block. 2136 for (unsigned i = 0; i < NumBlocks; ++i) 2137 emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); 2138 2139 emitMemCmpResultBlock(); 2140 return PhiRes; 2141 } 2142 2143 /// A memcmp expansion that compares equality with 0 and only has one block of 2144 /// load and compare can bypass the compare, branch, and phi IR that is required 2145 /// in the general case. 2146 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { 2147 unsigned NumBytesProcessed = 0; 2148 Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed); 2149 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); 2150 } 2151 2152 /// A memcmp expansion that only has one block of load and compare can bypass 2153 /// the compare, branch, and phi IR that is required in the general case. 2154 Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) { 2155 assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); 2156 2157 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); 2158 Value *Source1 = CI->getArgOperand(0); 2159 Value *Source2 = CI->getArgOperand(1); 2160 2161 // Cast source to LoadSizeType*. 2162 if (Source1->getType() != LoadSizeType) 2163 Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); 2164 if (Source2->getType() != LoadSizeType) 2165 Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); 2166 2167 // Load LoadSizeType from the base address. 2168 Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); 2169 Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); 2170 2171 if (DL.isLittleEndian() && Size != 1) { 2172 Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), 2173 Intrinsic::bswap, LoadSizeType); 2174 LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); 2175 LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); 2176 } 2177 2178 if (Size < 4) { 2179 // The i8 and i16 cases don't need compares. We zext the loaded values and 2180 // subtract them to get the suitable negative, zero, or positive i32 result. 2181 LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); 2182 LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); 2183 return Builder.CreateSub(LoadSrc1, LoadSrc2); 2184 } 2185 2186 // The result of memcmp is negative, zero, or positive, so produce that by 2187 // subtracting 2 extended compare bits: sub (ugt, ult). 2188 // If a target prefers to use selects to get -1/0/1, they should be able 2189 // to transform this later. 
The inverse transform (going from selects to math) 2190 // may not be possible in the DAG because the selects got converted into 2191 // branches before we got there. 2192 Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); 2193 Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); 2194 Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); 2195 Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); 2196 return Builder.CreateSub(ZextUGT, ZextULT); 2197 } 2198 2199 // This function expands the memcmp call into an inline expansion and returns 2200 // the memcmp result. 2201 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { 2202 if (IsUsedForZeroCmp) 2203 return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : 2204 getMemCmpExpansionZeroCase(Size); 2205 2206 // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). 2207 if (NumBlocks == 1 && NumLoadsPerBlock == 1) 2208 return getMemCmpOneBlock(Size); 2209 2210 // This loop calls emitLoadCompareBlock for comparing Size bytes of the two 2211 // memcmp sources. It starts with loading using the maximum load size set by 2212 // the target. It processes any remaining bytes using a load size which is the 2213 // next smallest power of 2. 2214 unsigned LoadSize = MaxLoadSize; 2215 unsigned NumBytesToBeProcessed = Size; 2216 unsigned Index = 0; 2217 while (NumBytesToBeProcessed) { 2218 // Calculate how many blocks we can create with the current load size. 2219 unsigned NumBlocks = NumBytesToBeProcessed / LoadSize; 2220 unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; 2221 NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; 2222 2223 // For each NumBlocks, populate the instruction sequence for loading and 2224 // comparing LoadSize bytes. 2225 while (NumBlocks--) { 2226 emitLoadCompareBlock(Index, LoadSize, GEPIndex); 2227 Index++; 2228 GEPIndex++; 2229 } 2230 // Get the next LoadSize to use. 2231 LoadSize = LoadSize / 2; 2232 } 2233 2234 emitMemCmpResultBlock(); 2235 return PhiRes; 2236 } 2237 2238 // This function checks to see if an expansion of memcmp can be generated. 2239 // It checks for constant compare size that is less than the max inline size. 2240 // If an expansion cannot occur, returns false to leave as a library call. 2241 // Otherwise, the library call is replaced with a new IR instruction sequence. 
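// For instance, a 15-byte compare on a target reporting MaxLoadSize == 8 is
// split into 8-, 4-, 2- and 1-byte load blocks, as in the example below.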
2242 /// We want to transform: 2243 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) 2244 /// To: 2245 /// loadbb: 2246 /// %0 = bitcast i32* %buffer2 to i8* 2247 /// %1 = bitcast i32* %buffer1 to i8* 2248 /// %2 = bitcast i8* %1 to i64* 2249 /// %3 = bitcast i8* %0 to i64* 2250 /// %4 = load i64, i64* %2 2251 /// %5 = load i64, i64* %3 2252 /// %6 = call i64 @llvm.bswap.i64(i64 %4) 2253 /// %7 = call i64 @llvm.bswap.i64(i64 %5) 2254 /// %8 = sub i64 %6, %7 2255 /// %9 = icmp ne i64 %8, 0 2256 /// br i1 %9, label %res_block, label %loadbb1 2257 /// res_block: ; preds = %loadbb2, 2258 /// %loadbb1, %loadbb 2259 /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] 2260 /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] 2261 /// %10 = icmp ult i64 %phi.src1, %phi.src2 2262 /// %11 = select i1 %10, i32 -1, i32 1 2263 /// br label %endblock 2264 /// loadbb1: ; preds = %loadbb 2265 /// %12 = bitcast i32* %buffer2 to i8* 2266 /// %13 = bitcast i32* %buffer1 to i8* 2267 /// %14 = bitcast i8* %13 to i32* 2268 /// %15 = bitcast i8* %12 to i32* 2269 /// %16 = getelementptr i32, i32* %14, i32 2 2270 /// %17 = getelementptr i32, i32* %15, i32 2 2271 /// %18 = load i32, i32* %16 2272 /// %19 = load i32, i32* %17 2273 /// %20 = call i32 @llvm.bswap.i32(i32 %18) 2274 /// %21 = call i32 @llvm.bswap.i32(i32 %19) 2275 /// %22 = zext i32 %20 to i64 2276 /// %23 = zext i32 %21 to i64 2277 /// %24 = sub i64 %22, %23 2278 /// %25 = icmp ne i64 %24, 0 2279 /// br i1 %25, label %res_block, label %loadbb2 2280 /// loadbb2: ; preds = %loadbb1 2281 /// %26 = bitcast i32* %buffer2 to i8* 2282 /// %27 = bitcast i32* %buffer1 to i8* 2283 /// %28 = bitcast i8* %27 to i16* 2284 /// %29 = bitcast i8* %26 to i16* 2285 /// %30 = getelementptr i16, i16* %28, i16 6 2286 /// %31 = getelementptr i16, i16* %29, i16 6 2287 /// %32 = load i16, i16* %30 2288 /// %33 = load i16, i16* %31 2289 /// %34 = call i16 @llvm.bswap.i16(i16 %32) 2290 /// %35 = call i16 @llvm.bswap.i16(i16 %33) 2291 /// %36 = zext i16 %34 to i64 2292 /// %37 = zext i16 %35 to i64 2293 /// %38 = sub i64 %36, %37 2294 /// %39 = icmp ne i64 %38, 0 2295 /// br i1 %39, label %res_block, label %loadbb3 2296 /// loadbb3: ; preds = %loadbb2 2297 /// %40 = bitcast i32* %buffer2 to i8* 2298 /// %41 = bitcast i32* %buffer1 to i8* 2299 /// %42 = getelementptr i8, i8* %41, i8 14 2300 /// %43 = getelementptr i8, i8* %40, i8 14 2301 /// %44 = load i8, i8* %42 2302 /// %45 = load i8, i8* %43 2303 /// %46 = zext i8 %44 to i32 2304 /// %47 = zext i8 %45 to i32 2305 /// %48 = sub i32 %46, %47 2306 /// br label %endblock 2307 /// endblock: ; preds = %res_block, 2308 /// %loadbb3 2309 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] 2310 /// ret i32 %phi.res 2311 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, 2312 const TargetLowering *TLI, const DataLayout *DL) { 2313 NumMemCmpCalls++; 2314 2315 // TTI call to check if target would like to expand memcmp. Also, get the 2316 // MaxLoadSize. 2317 unsigned MaxLoadSize; 2318 if (!TTI->expandMemCmp(CI, MaxLoadSize)) 2319 return false; 2320 2321 // Early exit from expansion if -Oz. 2322 if (CI->getFunction()->optForMinSize()) 2323 return false; 2324 2325 // Early exit from expansion if size is not a constant. 
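// (Illustrative numbers: a call like memcmp(p, q, 3) has the constant size 3;
// on a target reporting MaxLoadSize == 8, the max load size is scaled down to
// 2 below and the expansion needs two load pairs, one i16 and one i8.)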
2326 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); 2327 if (!SizeCast) { 2328 NumMemCmpNotConstant++; 2329 return false; 2330 } 2331 2332 // Scale the max size down if the target can load more bytes than we need. 2333 uint64_t SizeVal = SizeCast->getZExtValue(); 2334 if (MaxLoadSize > SizeVal) 2335 MaxLoadSize = 1 << SizeCast->getValue().logBase2(); 2336 2337 // Calculate how many load pairs are needed for the constant size. 2338 unsigned NumLoads = 0; 2339 unsigned RemainingSize = SizeVal; 2340 unsigned LoadSize = MaxLoadSize; 2341 while (RemainingSize) { 2342 NumLoads += RemainingSize / LoadSize; 2343 RemainingSize = RemainingSize % LoadSize; 2344 LoadSize = LoadSize / 2; 2345 } 2346 2347 // Don't expand if this will require more loads than desired by the target. 2348 if (NumLoads > TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) { 2349 NumMemCmpGreaterThanMax++; 2350 return false; 2351 } 2352 2353 NumMemCmpInlined++; 2354 2355 // MemCmpHelper object creates and sets up basic blocks required for 2356 // expanding memcmp with size SizeVal. 2357 unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock; 2358 MemCmpExpansion MemCmpHelper(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL); 2359 2360 Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal); 2361 2362 // Replace call with result of expansion and erase call. 2363 CI->replaceAllUsesWith(Res); 2364 CI->eraseFromParent(); 2365 2366 return true; 2367 } 2368 2369 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { 2370 BasicBlock *BB = CI->getParent(); 2371 2372 // Lower inline assembly if we can. 2373 // If we found an inline asm expession, and if the target knows how to 2374 // lower it to normal LLVM code, do so now. 2375 if (TLI && isa<InlineAsm>(CI->getCalledValue())) { 2376 if (TLI->ExpandInlineAsm(CI)) { 2377 // Avoid invalidating the iterator. 2378 CurInstIterator = BB->begin(); 2379 // Avoid processing instructions out of order, which could cause 2380 // reuse before a value is defined. 2381 SunkAddrs.clear(); 2382 return true; 2383 } 2384 // Sink address computing for memory operands into the block. 2385 if (optimizeInlineAsmInst(CI)) 2386 return true; 2387 } 2388 2389 // Align the pointer arguments to this call if the target thinks it's a good 2390 // idea 2391 unsigned MinSize, PrefAlign; 2392 if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { 2393 for (auto &Arg : CI->arg_operands()) { 2394 // We want to align both objects whose address is used directly and 2395 // objects whose address is used in casts and GEPs, though it only makes 2396 // sense for GEPs if the offset is a multiple of the desired alignment and 2397 // if size - offset meets the size threshold. 2398 if (!Arg->getType()->isPointerTy()) 2399 continue; 2400 APInt Offset(DL->getPointerSizeInBits( 2401 cast<PointerType>(Arg->getType())->getAddressSpace()), 2402 0); 2403 Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); 2404 uint64_t Offset2 = Offset.getLimitedValue(); 2405 if ((Offset2 & (PrefAlign-1)) != 0) 2406 continue; 2407 AllocaInst *AI; 2408 if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign && 2409 DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) 2410 AI->setAlignment(PrefAlign); 2411 // Global variables can only be aligned if they are defined in this 2412 // object (i.e. they are uniquely initialized in this object), and 2413 // over-aligning global variables that have an explicit section is 2414 // forbidden. 
2415 GlobalVariable *GV; 2416 if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() && 2417 GV->getPointerAlignment(*DL) < PrefAlign && 2418 DL->getTypeAllocSize(GV->getValueType()) >= 2419 MinSize + Offset2) 2420 GV->setAlignment(PrefAlign); 2421 } 2422 // If this is a memcpy (or similar) then we may be able to improve the 2423 // alignment 2424 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) { 2425 unsigned Align = getKnownAlignment(MI->getDest(), *DL); 2426 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) 2427 Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL)); 2428 if (Align > MI->getAlignment()) 2429 MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align)); 2430 } 2431 } 2432 2433 // If we have a cold call site, try to sink addressing computation into the 2434 // cold block. This interacts with our handling for loads and stores to 2435 // ensure that we can fold all uses of a potential addressing computation 2436 // into their uses. TODO: generalize this to work over profiling data 2437 if (!OptSize && CI->hasFnAttr(Attribute::Cold)) 2438 for (auto &Arg : CI->arg_operands()) { 2439 if (!Arg->getType()->isPointerTy()) 2440 continue; 2441 unsigned AS = Arg->getType()->getPointerAddressSpace(); 2442 return optimizeMemoryInst(CI, Arg, Arg->getType(), AS); 2443 } 2444 2445 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); 2446 if (II) { 2447 switch (II->getIntrinsicID()) { 2448 default: break; 2449 case Intrinsic::objectsize: { 2450 // Lower all uses of llvm.objectsize.* 2451 ConstantInt *RetVal = 2452 lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true); 2453 // Substituting this can cause recursive simplifications, which can 2454 // invalidate our iterator. Use a WeakTrackingVH to hold onto it in case 2455 // this 2456 // happens. 2457 Value *CurValue = &*CurInstIterator; 2458 WeakTrackingVH IterHandle(CurValue); 2459 2460 replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); 2461 2462 // If the iterator instruction was recursively deleted, start over at the 2463 // start of the block. 2464 if (IterHandle != CurValue) { 2465 CurInstIterator = BB->begin(); 2466 SunkAddrs.clear(); 2467 } 2468 return true; 2469 } 2470 case Intrinsic::aarch64_stlxr: 2471 case Intrinsic::aarch64_stxr: { 2472 ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); 2473 if (!ExtVal || !ExtVal->hasOneUse() || 2474 ExtVal->getParent() == CI->getParent()) 2475 return false; 2476 // Sink a zext feeding stlxr/stxr before it, so it can be folded into it. 2477 ExtVal->moveBefore(CI); 2478 // Mark this instruction as "inserted by CGP", so that other 2479 // optimizations don't touch it. 2480 InsertedInsts.insert(ExtVal); 2481 return true; 2482 } 2483 case Intrinsic::invariant_group_barrier: 2484 II->replaceAllUsesWith(II->getArgOperand(0)); 2485 II->eraseFromParent(); 2486 return true; 2487 2488 case Intrinsic::cttz: 2489 case Intrinsic::ctlz: 2490 // If counting zeros is expensive, try to avoid it. 2491 return despeculateCountZeros(II, TLI, DL, ModifiedDT); 2492 } 2493 2494 if (TLI) { 2495 SmallVector<Value*, 2> PtrOps; 2496 Type *AccessTy; 2497 if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) 2498 while (!PtrOps.empty()) { 2499 Value *PtrVal = PtrOps.pop_back_val(); 2500 unsigned AS = PtrVal->getType()->getPointerAddressSpace(); 2501 if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) 2502 return true; 2503 } 2504 } 2505 } 2506 2507 // From here on out we're working with named functions. 
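// (For example, a fortified call such as __memcpy_chk(dst, src, n, (size_t)-1),
// i.e. with the unknown-object-size sentinel, is simplified below to a plain
// memcpy, and a memcmp with a small constant size may be expanded inline.)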
2508 if (!CI->getCalledFunction()) return false; 2509 2510 // Lower all default uses of _chk calls. This is very similar 2511 // to what InstCombineCalls does, but here we are only lowering calls 2512 // to fortified library functions (e.g. __memcpy_chk) that have the default 2513 // "don't know" as the objectsize. Anything else should be left alone. 2514 FortifiedLibCallSimplifier Simplifier(TLInfo, true); 2515 if (Value *V = Simplifier.optimizeCall(CI)) { 2516 CI->replaceAllUsesWith(V); 2517 CI->eraseFromParent(); 2518 return true; 2519 } 2520 2521 LibFunc Func; 2522 if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && 2523 Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { 2524 ModifiedDT = true; 2525 return true; 2526 } 2527 return false; 2528 } 2529 2530 /// Look for opportunities to duplicate return instructions to the predecessor 2531 /// to enable tail call optimizations. The case it is currently looking for is: 2532 /// @code 2533 /// bb0: 2534 /// %tmp0 = tail call i32 @f0() 2535 /// br label %return 2536 /// bb1: 2537 /// %tmp1 = tail call i32 @f1() 2538 /// br label %return 2539 /// bb2: 2540 /// %tmp2 = tail call i32 @f2() 2541 /// br label %return 2542 /// return: 2543 /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] 2544 /// ret i32 %retval 2545 /// @endcode 2546 /// 2547 /// => 2548 /// 2549 /// @code 2550 /// bb0: 2551 /// %tmp0 = tail call i32 @f0() 2552 /// ret i32 %tmp0 2553 /// bb1: 2554 /// %tmp1 = tail call i32 @f1() 2555 /// ret i32 %tmp1 2556 /// bb2: 2557 /// %tmp2 = tail call i32 @f2() 2558 /// ret i32 %tmp2 2559 /// @endcode 2560 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { 2561 if (!TLI) 2562 return false; 2563 2564 ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator()); 2565 if (!RetI) 2566 return false; 2567 2568 PHINode *PN = nullptr; 2569 BitCastInst *BCI = nullptr; 2570 Value *V = RetI->getReturnValue(); 2571 if (V) { 2572 BCI = dyn_cast<BitCastInst>(V); 2573 if (BCI) 2574 V = BCI->getOperand(0); 2575 2576 PN = dyn_cast<PHINode>(V); 2577 if (!PN) 2578 return false; 2579 } 2580 2581 if (PN && PN->getParent() != BB) 2582 return false; 2583 2584 // Make sure there are no instructions between the PHI and return, or that the 2585 // return is the first instruction in the block. 2586 if (PN) { 2587 BasicBlock::iterator BI = BB->begin(); 2588 do { ++BI; } while (isa<DbgInfoIntrinsic>(BI)); 2589 if (&*BI == BCI) 2590 // Also skip over the bitcast. 2591 ++BI; 2592 if (&*BI != RetI) 2593 return false; 2594 } else { 2595 BasicBlock::iterator BI = BB->begin(); 2596 while (isa<DbgInfoIntrinsic>(BI)) ++BI; 2597 if (&*BI != RetI) 2598 return false; 2599 } 2600 2601 /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail 2602 /// call. 2603 const Function *F = BB->getParent(); 2604 SmallVector<CallInst*, 4> TailCalls; 2605 if (PN) { 2606 for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { 2607 CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I)); 2608 // Make sure the phi value is indeed produced by the tail call. 
2609 if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && 2610 TLI->mayBeEmittedAsTailCall(CI) && 2611 attributesPermitTailCall(F, CI, RetI, *TLI)) 2612 TailCalls.push_back(CI); 2613 } 2614 } else { 2615 SmallPtrSet<BasicBlock*, 4> VisitedBBs; 2616 for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { 2617 if (!VisitedBBs.insert(*PI).second) 2618 continue; 2619 2620 BasicBlock::InstListType &InstList = (*PI)->getInstList(); 2621 BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); 2622 BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); 2623 do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)); 2624 if (RI == RE) 2625 continue; 2626 2627 CallInst *CI = dyn_cast<CallInst>(&*RI); 2628 if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) && 2629 attributesPermitTailCall(F, CI, RetI, *TLI)) 2630 TailCalls.push_back(CI); 2631 } 2632 } 2633 2634 bool Changed = false; 2635 for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { 2636 CallInst *CI = TailCalls[i]; 2637 CallSite CS(CI); 2638 2639 // Conservatively require the attributes of the call to match those of the 2640 // return. Ignore noalias because it doesn't affect the call sequence. 2641 AttributeList CalleeAttrs = CS.getAttributes(); 2642 if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) 2643 .removeAttribute(Attribute::NoAlias) != 2644 AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) 2645 .removeAttribute(Attribute::NoAlias)) 2646 continue; 2647 2648 // Make sure the call instruction is followed by an unconditional branch to 2649 // the return block. 2650 BasicBlock *CallBB = CI->getParent(); 2651 BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator()); 2652 if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) 2653 continue; 2654 2655 // Duplicate the return into CallBB. 2656 (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB); 2657 ModifiedDT = Changed = true; 2658 ++NumRetsDup; 2659 } 2660 2661 // If we eliminated all predecessors of the block, delete the block now. 2662 if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) 2663 BB->eraseFromParent(); 2664 2665 return Changed; 2666 } 2667 2668 //===----------------------------------------------------------------------===// 2669 // Memory Optimization 2670 //===----------------------------------------------------------------------===// 2671 2672 namespace { 2673 2674 /// This is an extended version of TargetLowering::AddrMode 2675 /// which holds actual Value*'s for register values. 2676 struct ExtAddrMode : public TargetLowering::AddrMode { 2677 Value *BaseReg = nullptr; 2678 Value *ScaledReg = nullptr; 2679 2680 ExtAddrMode() = default; 2681 2682 void print(raw_ostream &OS) const; 2683 void dump() const; 2684 2685 bool operator==(const ExtAddrMode& O) const { 2686 return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && 2687 (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && 2688 (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale); 2689 } 2690 }; 2691 2692 } // end anonymous namespace 2693 2694 #ifndef NDEBUG 2695 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { 2696 AM.print(OS); 2697 return OS; 2698 } 2699 #endif 2700 2701 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2702 void ExtAddrMode::print(raw_ostream &OS) const { 2703 bool NeedPlus = false; 2704 OS << "["; 2705 if (BaseGV) { 2706 OS << (NeedPlus ? 
" + " : "") 2707 << "GV:"; 2708 BaseGV->printAsOperand(OS, /*PrintType=*/false); 2709 NeedPlus = true; 2710 } 2711 2712 if (BaseOffs) { 2713 OS << (NeedPlus ? " + " : "") 2714 << BaseOffs; 2715 NeedPlus = true; 2716 } 2717 2718 if (BaseReg) { 2719 OS << (NeedPlus ? " + " : "") 2720 << "Base:"; 2721 BaseReg->printAsOperand(OS, /*PrintType=*/false); 2722 NeedPlus = true; 2723 } 2724 if (Scale) { 2725 OS << (NeedPlus ? " + " : "") 2726 << Scale << "*"; 2727 ScaledReg->printAsOperand(OS, /*PrintType=*/false); 2728 } 2729 2730 OS << ']'; 2731 } 2732 2733 LLVM_DUMP_METHOD void ExtAddrMode::dump() const { 2734 print(dbgs()); 2735 dbgs() << '\n'; 2736 } 2737 #endif 2738 2739 namespace { 2740 2741 /// \brief This class provides transaction based operation on the IR. 2742 /// Every change made through this class is recorded in the internal state and 2743 /// can be undone (rollback) until commit is called. 2744 class TypePromotionTransaction { 2745 /// \brief This represents the common interface of the individual transaction. 2746 /// Each class implements the logic for doing one specific modification on 2747 /// the IR via the TypePromotionTransaction. 2748 class TypePromotionAction { 2749 protected: 2750 /// The Instruction modified. 2751 Instruction *Inst; 2752 2753 public: 2754 /// \brief Constructor of the action. 2755 /// The constructor performs the related action on the IR. 2756 TypePromotionAction(Instruction *Inst) : Inst(Inst) {} 2757 2758 virtual ~TypePromotionAction() = default; 2759 2760 /// \brief Undo the modification done by this action. 2761 /// When this method is called, the IR must be in the same state as it was 2762 /// before this action was applied. 2763 /// \pre Undoing the action works if and only if the IR is in the exact same 2764 /// state as it was directly after this action was applied. 2765 virtual void undo() = 0; 2766 2767 /// \brief Advocate every change made by this action. 2768 /// When the results on the IR of the action are to be kept, it is important 2769 /// to call this function, otherwise hidden information may be kept forever. 2770 virtual void commit() { 2771 // Nothing to be done, this action is not doing anything. 2772 } 2773 }; 2774 2775 /// \brief Utility to remember the position of an instruction. 2776 class InsertionHandler { 2777 /// Position of an instruction. 2778 /// Either an instruction: 2779 /// - Is the first in a basic block: BB is used. 2780 /// - Has a previous instructon: PrevInst is used. 2781 union { 2782 Instruction *PrevInst; 2783 BasicBlock *BB; 2784 } Point; 2785 2786 /// Remember whether or not the instruction had a previous instruction. 2787 bool HasPrevInstruction; 2788 2789 public: 2790 /// \brief Record the position of \p Inst. 2791 InsertionHandler(Instruction *Inst) { 2792 BasicBlock::iterator It = Inst->getIterator(); 2793 HasPrevInstruction = (It != (Inst->getParent()->begin())); 2794 if (HasPrevInstruction) 2795 Point.PrevInst = &*--It; 2796 else 2797 Point.BB = Inst->getParent(); 2798 } 2799 2800 /// \brief Insert \p Inst at the recorded position. 2801 void insert(Instruction *Inst) { 2802 if (HasPrevInstruction) { 2803 if (Inst->getParent()) 2804 Inst->removeFromParent(); 2805 Inst->insertAfter(Point.PrevInst); 2806 } else { 2807 Instruction *Position = &*Point.BB->getFirstInsertionPt(); 2808 if (Inst->getParent()) 2809 Inst->moveBefore(Position); 2810 else 2811 Inst->insertBefore(Position); 2812 } 2813 } 2814 }; 2815 2816 /// \brief Move an instruction before another. 
2817 class InstructionMoveBefore : public TypePromotionAction { 2818 /// Original position of the instruction. 2819 InsertionHandler Position; 2820 2821 public: 2822 /// \brief Move \p Inst before \p Before. 2823 InstructionMoveBefore(Instruction *Inst, Instruction *Before) 2824 : TypePromotionAction(Inst), Position(Inst) { 2825 DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n"); 2826 Inst->moveBefore(Before); 2827 } 2828 2829 /// \brief Move the instruction back to its original position. 2830 void undo() override { 2831 DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n"); 2832 Position.insert(Inst); 2833 } 2834 }; 2835 2836 /// \brief Set the operand of an instruction with a new value. 2837 class OperandSetter : public TypePromotionAction { 2838 /// Original operand of the instruction. 2839 Value *Origin; 2840 2841 /// Index of the modified instruction. 2842 unsigned Idx; 2843 2844 public: 2845 /// \brief Set \p Idx operand of \p Inst with \p NewVal. 2846 OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal) 2847 : TypePromotionAction(Inst), Idx(Idx) { 2848 DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n" 2849 << "for:" << *Inst << "\n" 2850 << "with:" << *NewVal << "\n"); 2851 Origin = Inst->getOperand(Idx); 2852 Inst->setOperand(Idx, NewVal); 2853 } 2854 2855 /// \brief Restore the original value of the instruction. 2856 void undo() override { 2857 DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n" 2858 << "for: " << *Inst << "\n" 2859 << "with: " << *Origin << "\n"); 2860 Inst->setOperand(Idx, Origin); 2861 } 2862 }; 2863 2864 /// \brief Hide the operands of an instruction. 2865 /// Do as if this instruction was not using any of its operands. 2866 class OperandsHider : public TypePromotionAction { 2867 /// The list of original operands. 2868 SmallVector<Value *, 4> OriginalValues; 2869 2870 public: 2871 /// \brief Remove \p Inst from the uses of the operands of \p Inst. 2872 OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) { 2873 DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n"); 2874 unsigned NumOpnds = Inst->getNumOperands(); 2875 OriginalValues.reserve(NumOpnds); 2876 for (unsigned It = 0; It < NumOpnds; ++It) { 2877 // Save the current operand. 2878 Value *Val = Inst->getOperand(It); 2879 OriginalValues.push_back(Val); 2880 // Set a dummy one. 2881 // We could use OperandSetter here, but that would imply an overhead 2882 // that we are not willing to pay. 2883 Inst->setOperand(It, UndefValue::get(Val->getType())); 2884 } 2885 } 2886 2887 /// \brief Restore the original list of uses. 2888 void undo() override { 2889 DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n"); 2890 for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It) 2891 Inst->setOperand(It, OriginalValues[It]); 2892 } 2893 }; 2894 2895 /// \brief Build a truncate instruction. 2896 class TruncBuilder : public TypePromotionAction { 2897 Value *Val; 2898 2899 public: 2900 /// \brief Build a truncate instruction of \p Opnd producing a \p Ty 2901 /// result. 2902 /// trunc Opnd to Ty. 2903 TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { 2904 IRBuilder<> Builder(Opnd); 2905 Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); 2906 DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); 2907 } 2908 2909 /// \brief Get the built value. 2910 Value *getBuiltValue() { return Val; } 2911 2912 /// \brief Remove the built instruction. 
2913 void undo() override { 2914 DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n"); 2915 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2916 IVal->eraseFromParent(); 2917 } 2918 }; 2919 2920 /// \brief Build a sign extension instruction. 2921 class SExtBuilder : public TypePromotionAction { 2922 Value *Val; 2923 2924 public: 2925 /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty 2926 /// result. 2927 /// sext Opnd to Ty. 2928 SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) 2929 : TypePromotionAction(InsertPt) { 2930 IRBuilder<> Builder(InsertPt); 2931 Val = Builder.CreateSExt(Opnd, Ty, "promoted"); 2932 DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n"); 2933 } 2934 2935 /// \brief Get the built value. 2936 Value *getBuiltValue() { return Val; } 2937 2938 /// \brief Remove the built instruction. 2939 void undo() override { 2940 DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n"); 2941 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2942 IVal->eraseFromParent(); 2943 } 2944 }; 2945 2946 /// \brief Build a zero extension instruction. 2947 class ZExtBuilder : public TypePromotionAction { 2948 Value *Val; 2949 2950 public: 2951 /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty 2952 /// result. 2953 /// zext Opnd to Ty. 2954 ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) 2955 : TypePromotionAction(InsertPt) { 2956 IRBuilder<> Builder(InsertPt); 2957 Val = Builder.CreateZExt(Opnd, Ty, "promoted"); 2958 DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); 2959 } 2960 2961 /// \brief Get the built value. 2962 Value *getBuiltValue() { return Val; } 2963 2964 /// \brief Remove the built instruction. 2965 void undo() override { 2966 DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n"); 2967 if (Instruction *IVal = dyn_cast<Instruction>(Val)) 2968 IVal->eraseFromParent(); 2969 } 2970 }; 2971 2972 /// \brief Mutate an instruction to another type. 2973 class TypeMutator : public TypePromotionAction { 2974 /// Record the original type. 2975 Type *OrigTy; 2976 2977 public: 2978 /// \brief Mutate the type of \p Inst into \p NewTy. 2979 TypeMutator(Instruction *Inst, Type *NewTy) 2980 : TypePromotionAction(Inst), OrigTy(Inst->getType()) { 2981 DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy 2982 << "\n"); 2983 Inst->mutateType(NewTy); 2984 } 2985 2986 /// \brief Mutate the instruction back to its original type. 2987 void undo() override { 2988 DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy 2989 << "\n"); 2990 Inst->mutateType(OrigTy); 2991 } 2992 }; 2993 2994 /// \brief Replace the uses of an instruction by another instruction. 2995 class UsesReplacer : public TypePromotionAction { 2996 /// Helper structure to keep track of the replaced uses. 2997 struct InstructionAndIdx { 2998 /// The instruction using the instruction. 2999 Instruction *Inst; 3000 3001 /// The index where this instruction is used for Inst. 3002 unsigned Idx; 3003 3004 InstructionAndIdx(Instruction *Inst, unsigned Idx) 3005 : Inst(Inst), Idx(Idx) {} 3006 }; 3007 3008 /// Keep track of the original uses (pair Instruction, Index). 3009 SmallVector<InstructionAndIdx, 4> OriginalUses; 3010 3011 using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator; 3012 3013 public: 3014 /// \brief Replace all the use of \p Inst by \p New. 
UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
3016 DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
3017 << "\n");
3018 // Record the original uses.
3019 for (Use &U : Inst->uses()) {
3020 Instruction *UserI = cast<Instruction>(U.getUser());
3021 OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
3022 }
3023 // Now, we can replace the uses.
3024 Inst->replaceAllUsesWith(New);
3025 }
3026
3027 /// \brief Reassign the original uses of Inst to Inst.
3028 void undo() override {
3029 DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
3030 for (use_iterator UseIt = OriginalUses.begin(),
3031 EndIt = OriginalUses.end();
3032 UseIt != EndIt; ++UseIt) {
3033 UseIt->Inst->setOperand(UseIt->Idx, Inst);
3034 }
3035 }
3036 };
3037
3038 /// \brief Remove an instruction from the IR.
3039 class InstructionRemover : public TypePromotionAction {
3040 /// Original position of the instruction.
3041 InsertionHandler Inserter;
3042
3043 /// Helper structure to hide all the links to the instruction. In other
3044 /// words, this helps to pretend that the instruction was removed.
3045 OperandsHider Hider;
3046
3047 /// Keep track of the uses replaced, if any.
3048 UsesReplacer *Replacer = nullptr;
3049
3050 /// Keep track of instructions removed.
3051 SetOfInstrs &RemovedInsts;
3052
3053 public:
3054 /// \brief Remove all references to \p Inst and optionally replace all its
3055 /// uses with New.
3056 /// \p RemovedInsts Keep track of the instructions removed by this Action.
3057 /// \pre If !Inst->use_empty(), then New != nullptr
3058 InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
3059 Value *New = nullptr)
3060 : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
3061 RemovedInsts(RemovedInsts) {
3062 if (New)
3063 Replacer = new UsesReplacer(Inst, New);
3064 DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
3065 RemovedInsts.insert(Inst);
3066 /// The instructions removed here will be freed after completing
3067 /// optimizeBlock() for all blocks as we need to keep track of the
3068 /// removed instructions during promotion.
3069 Inst->removeFromParent();
3070 }
3071
3072 ~InstructionRemover() override { delete Replacer; }
3073
3074 /// \brief Resurrect the instruction and reassign it to the proper uses if
3075 /// a new value was provided when building this action.
3076 void undo() override {
3077 DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
3078 Inserter.insert(Inst);
3079 if (Replacer)
3080 Replacer->undo();
3081 Hider.undo();
3082 RemovedInsts.erase(Inst);
3083 }
3084 };
3085
3086 public:
3087 /// Restoration point.
3088 /// The restoration point is a pointer to an action instead of an iterator
3089 /// because the iterator may be invalidated but not the pointer.
3090 using ConstRestorationPt = const TypePromotionAction *;
3091
3092 TypePromotionTransaction(SetOfInstrs &RemovedInsts)
3093 : RemovedInsts(RemovedInsts) {}
3094
3095 /// Advocate every change made in this transaction.
3096 void commit();
3097
3098 /// Undo all the changes made after the given point.
3099 void rollback(ConstRestorationPt Point);
3100
3101 /// Get the current restoration point.
3102 ConstRestorationPt getRestorationPoint() const;
3103
3104 /// \name API for IR modification with state keeping to support rollback.
3105 /// @{
3106 /// Same as Instruction::setOperand.
3107 void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
3108
3109 /// Same as Instruction::eraseFromParent.
3110 void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr); 3111 3112 /// Same as Value::replaceAllUsesWith. 3113 void replaceAllUsesWith(Instruction *Inst, Value *New); 3114 3115 /// Same as Value::mutateType. 3116 void mutateType(Instruction *Inst, Type *NewTy); 3117 3118 /// Same as IRBuilder::createTrunc. 3119 Value *createTrunc(Instruction *Opnd, Type *Ty); 3120 3121 /// Same as IRBuilder::createSExt. 3122 Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); 3123 3124 /// Same as IRBuilder::createZExt. 3125 Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty); 3126 3127 /// Same as Instruction::moveBefore. 3128 void moveBefore(Instruction *Inst, Instruction *Before); 3129 /// @} 3130 3131 private: 3132 /// The ordered list of actions made so far. 3133 SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; 3134 3135 using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator; 3136 3137 SetOfInstrs &RemovedInsts; 3138 }; 3139 3140 } // end anonymous namespace 3141 3142 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, 3143 Value *NewVal) { 3144 Actions.push_back(llvm::make_unique<TypePromotionTransaction::OperandSetter>( 3145 Inst, Idx, NewVal)); 3146 } 3147 3148 void TypePromotionTransaction::eraseInstruction(Instruction *Inst, 3149 Value *NewVal) { 3150 Actions.push_back( 3151 llvm::make_unique<TypePromotionTransaction::InstructionRemover>( 3152 Inst, RemovedInsts, NewVal)); 3153 } 3154 3155 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, 3156 Value *New) { 3157 Actions.push_back( 3158 llvm::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New)); 3159 } 3160 3161 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { 3162 Actions.push_back( 3163 llvm::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy)); 3164 } 3165 3166 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, 3167 Type *Ty) { 3168 std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty)); 3169 Value *Val = Ptr->getBuiltValue(); 3170 Actions.push_back(std::move(Ptr)); 3171 return Val; 3172 } 3173 3174 Value *TypePromotionTransaction::createSExt(Instruction *Inst, 3175 Value *Opnd, Type *Ty) { 3176 std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty)); 3177 Value *Val = Ptr->getBuiltValue(); 3178 Actions.push_back(std::move(Ptr)); 3179 return Val; 3180 } 3181 3182 Value *TypePromotionTransaction::createZExt(Instruction *Inst, 3183 Value *Opnd, Type *Ty) { 3184 std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty)); 3185 Value *Val = Ptr->getBuiltValue(); 3186 Actions.push_back(std::move(Ptr)); 3187 return Val; 3188 } 3189 3190 void TypePromotionTransaction::moveBefore(Instruction *Inst, 3191 Instruction *Before) { 3192 Actions.push_back( 3193 llvm::make_unique<TypePromotionTransaction::InstructionMoveBefore>( 3194 Inst, Before)); 3195 } 3196 3197 TypePromotionTransaction::ConstRestorationPt 3198 TypePromotionTransaction::getRestorationPoint() const { 3199 return !Actions.empty() ? 
Actions.back().get() : nullptr; 3200 } 3201 3202 void TypePromotionTransaction::commit() { 3203 for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; 3204 ++It) 3205 (*It)->commit(); 3206 Actions.clear(); 3207 } 3208 3209 void TypePromotionTransaction::rollback( 3210 TypePromotionTransaction::ConstRestorationPt Point) { 3211 while (!Actions.empty() && Point != Actions.back().get()) { 3212 std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val(); 3213 Curr->undo(); 3214 } 3215 } 3216 3217 namespace { 3218 3219 /// \brief A helper class for matching addressing modes. 3220 /// 3221 /// This encapsulates the logic for matching the target-legal addressing modes. 3222 class AddressingModeMatcher { 3223 SmallVectorImpl<Instruction*> &AddrModeInsts; 3224 const TargetLowering &TLI; 3225 const TargetRegisterInfo &TRI; 3226 const DataLayout &DL; 3227 3228 /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and 3229 /// the memory instruction that we're computing this address for. 3230 Type *AccessTy; 3231 unsigned AddrSpace; 3232 Instruction *MemoryInst; 3233 3234 /// This is the addressing mode that we're building up. This is 3235 /// part of the return value of this addressing mode matching stuff. 3236 ExtAddrMode &AddrMode; 3237 3238 /// The instructions inserted by other CodeGenPrepare optimizations. 3239 const SetOfInstrs &InsertedInsts; 3240 3241 /// A map from the instructions to their type before promotion. 3242 InstrToOrigTy &PromotedInsts; 3243 3244 /// The ongoing transaction where every action should be registered. 3245 TypePromotionTransaction &TPT; 3246 3247 /// This is set to true when we should not do profitability checks. 3248 /// When true, IsProfitableToFoldIntoAddressingMode always returns true. 3249 bool IgnoreProfitability; 3250 3251 AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, 3252 const TargetLowering &TLI, 3253 const TargetRegisterInfo &TRI, 3254 Type *AT, unsigned AS, 3255 Instruction *MI, ExtAddrMode &AM, 3256 const SetOfInstrs &InsertedInsts, 3257 InstrToOrigTy &PromotedInsts, 3258 TypePromotionTransaction &TPT) 3259 : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), 3260 DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), 3261 MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), 3262 PromotedInsts(PromotedInsts), TPT(TPT) { 3263 IgnoreProfitability = false; 3264 } 3265 3266 public: 3267 /// Find the maximal addressing mode that a load/store of V can fold, 3268 /// give an access type of AccessTy. This returns a list of involved 3269 /// instructions in AddrModeInsts. 3270 /// \p InsertedInsts The instructions inserted by other CodeGenPrepare 3271 /// optimizations. 3272 /// \p PromotedInsts maps the instructions to their type before promotion. 3273 /// \p The ongoing transaction where every action should be registered. 
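/// Illustrative sketch (hypothetical IR, not taken from a test): matching
/// the address of a load such as
///   %addr = getelementptr i8, i8* %base, i64 16
///   %val = load i8, i8* %addr
/// would typically produce an ExtAddrMode with BaseReg == %base and
/// BaseOffs == 16, provided TLI.isLegalAddressingMode accepts that
/// combination for the target.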
3274 static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, 3275 Instruction *MemoryInst, 3276 SmallVectorImpl<Instruction*> &AddrModeInsts, 3277 const TargetLowering &TLI, 3278 const TargetRegisterInfo &TRI, 3279 const SetOfInstrs &InsertedInsts, 3280 InstrToOrigTy &PromotedInsts, 3281 TypePromotionTransaction &TPT) { 3282 ExtAddrMode Result; 3283 3284 bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, 3285 AccessTy, AS, 3286 MemoryInst, Result, InsertedInsts, 3287 PromotedInsts, TPT).matchAddr(V, 0); 3288 (void)Success; assert(Success && "Couldn't select *anything*?"); 3289 return Result; 3290 } 3291 3292 private: 3293 bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); 3294 bool matchAddr(Value *V, unsigned Depth); 3295 bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, 3296 bool *MovedAway = nullptr); 3297 bool isProfitableToFoldIntoAddressingMode(Instruction *I, 3298 ExtAddrMode &AMBefore, 3299 ExtAddrMode &AMAfter); 3300 bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); 3301 bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, 3302 Value *PromotedOperand) const; 3303 }; 3304 3305 } // end anonymous namespace 3306 3307 /// Try adding ScaleReg*Scale to the current addressing mode. 3308 /// Return true and update AddrMode if this addr mode is legal for the target, 3309 /// false if not. 3310 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, 3311 unsigned Depth) { 3312 // If Scale is 1, then this is the same as adding ScaleReg to the addressing 3313 // mode. Just process that directly. 3314 if (Scale == 1) 3315 return matchAddr(ScaleReg, Depth); 3316 3317 // If the scale is 0, it takes nothing to add this. 3318 if (Scale == 0) 3319 return true; 3320 3321 // If we already have a scale of this value, we can add to it, otherwise, we 3322 // need an available scale field. 3323 if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) 3324 return false; 3325 3326 ExtAddrMode TestAddrMode = AddrMode; 3327 3328 // Add scale to turn X*4+X*3 -> X*7. This could also do things like 3329 // [A+B + A*7] -> [B+A*8]. 3330 TestAddrMode.Scale += Scale; 3331 TestAddrMode.ScaledReg = ScaleReg; 3332 3333 // If the new address isn't legal, bail out. 3334 if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) 3335 return false; 3336 3337 // It was legal, so commit it. 3338 AddrMode = TestAddrMode; 3339 3340 // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now 3341 // to see if ScaleReg is actually X+C. If so, we can turn this into adding 3342 // X*Scale + C*Scale to addr mode. 3343 ConstantInt *CI = nullptr; Value *AddLHS = nullptr; 3344 if (isa<Instruction>(ScaleReg) && // not a constant expr. 3345 match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { 3346 TestAddrMode.ScaledReg = AddLHS; 3347 TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; 3348 3349 // If this addressing mode is legal, commit it and remember that we folded 3350 // this instruction. 3351 if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) { 3352 AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); 3353 AddrMode = TestAddrMode; 3354 return true; 3355 } 3356 } 3357 3358 // Otherwise, not (x+c)*scale, just return what we have. 3359 return true; 3360 } 3361 3362 /// This is a little filter, which returns true if an addressing computation 3363 /// involving I might be folded into a load/store accessing it. 
3364 /// This doesn't need to be perfect, but needs to accept at least 3365 /// the set of instructions that MatchOperationAddr can. 3366 static bool MightBeFoldableInst(Instruction *I) { 3367 switch (I->getOpcode()) { 3368 case Instruction::BitCast: 3369 case Instruction::AddrSpaceCast: 3370 // Don't touch identity bitcasts. 3371 if (I->getType() == I->getOperand(0)->getType()) 3372 return false; 3373 return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); 3374 case Instruction::PtrToInt: 3375 // PtrToInt is always a noop, as we know that the int type is pointer sized. 3376 return true; 3377 case Instruction::IntToPtr: 3378 // We know the input is intptr_t, so this is foldable. 3379 return true; 3380 case Instruction::Add: 3381 return true; 3382 case Instruction::Mul: 3383 case Instruction::Shl: 3384 // Can only handle X*C and X << C. 3385 return isa<ConstantInt>(I->getOperand(1)); 3386 case Instruction::GetElementPtr: 3387 return true; 3388 default: 3389 return false; 3390 } 3391 } 3392 3393 /// \brief Check whether or not \p Val is a legal instruction for \p TLI. 3394 /// \note \p Val is assumed to be the product of some type promotion. 3395 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed 3396 /// to be legal, as the non-promoted value would have had the same state. 3397 static bool isPromotedInstructionLegal(const TargetLowering &TLI, 3398 const DataLayout &DL, Value *Val) { 3399 Instruction *PromotedInst = dyn_cast<Instruction>(Val); 3400 if (!PromotedInst) 3401 return false; 3402 int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); 3403 // If the ISDOpcode is undefined, it was undefined before the promotion. 3404 if (!ISDOpcode) 3405 return true; 3406 // Otherwise, check if the promoted instruction is legal or not. 3407 return TLI.isOperationLegalOrCustom( 3408 ISDOpcode, TLI.getValueType(DL, PromotedInst->getType())); 3409 } 3410 3411 namespace { 3412 3413 /// \brief Helper class to perform type promotion. 3414 class TypePromotionHelper { 3415 /// \brief Utility function to check whether or not a sign or zero extension 3416 /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by 3417 /// either using the operands of \p Inst or promoting \p Inst. 3418 /// The type of the extension is defined by \p IsSExt. 3419 /// In other words, check if: 3420 /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType. 3421 /// #1 Promotion applies: 3422 /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...). 3423 /// #2 Operand reuses: 3424 /// ext opnd1 to ConsideredExtType. 3425 /// \p PromotedInsts maps the instructions to their type before promotion. 3426 static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType, 3427 const InstrToOrigTy &PromotedInsts, bool IsSExt); 3428 3429 /// \brief Utility function to determine if \p OpIdx should be promoted when 3430 /// promoting \p Inst. 3431 static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { 3432 return !(isa<SelectInst>(Inst) && OpIdx == 0); 3433 } 3434 3435 /// \brief Utility function to promote the operand of \p Ext when this 3436 /// operand is a promotable trunc or sext or zext. 3437 /// \p PromotedInsts maps the instructions to their type before promotion. 3438 /// \p CreatedInstsCost[out] contains the cost of all instructions 3439 /// created to promote the operand of Ext. 3440 /// Newly added extensions are inserted in \p Exts. 3441 /// Newly added truncates are inserted in \p Truncs. 3442 /// Should never be called directly.
3443 /// \return The promoted value which is used instead of Ext. 3444 static Value *promoteOperandForTruncAndAnyExt( 3445 Instruction *Ext, TypePromotionTransaction &TPT, 3446 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3447 SmallVectorImpl<Instruction *> *Exts, 3448 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI); 3449 3450 /// \brief Utility function to promote the operand of \p Ext when this 3451 /// operand is promotable and is not a supported trunc or sext. 3452 /// \p PromotedInsts maps the instructions to their type before promotion. 3453 /// \p CreatedInstsCost[out] contains the cost of all the instructions 3454 /// created to promote the operand of Ext. 3455 /// Newly added extensions are inserted in \p Exts. 3456 /// Newly added truncates are inserted in \p Truncs. 3457 /// Should never be called directly. 3458 /// \return The promoted value which is used instead of Ext. 3459 static Value *promoteOperandForOther(Instruction *Ext, 3460 TypePromotionTransaction &TPT, 3461 InstrToOrigTy &PromotedInsts, 3462 unsigned &CreatedInstsCost, 3463 SmallVectorImpl<Instruction *> *Exts, 3464 SmallVectorImpl<Instruction *> *Truncs, 3465 const TargetLowering &TLI, bool IsSExt); 3466 3467 /// \see promoteOperandForOther. 3468 static Value *signExtendOperandForOther( 3469 Instruction *Ext, TypePromotionTransaction &TPT, 3470 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3471 SmallVectorImpl<Instruction *> *Exts, 3472 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 3473 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, 3474 Exts, Truncs, TLI, true); 3475 } 3476 3477 /// \see promoteOperandForOther. 3478 static Value *zeroExtendOperandForOther( 3479 Instruction *Ext, TypePromotionTransaction &TPT, 3480 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3481 SmallVectorImpl<Instruction *> *Exts, 3482 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 3483 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, 3484 Exts, Truncs, TLI, false); 3485 } 3486 3487 public: 3488 /// Type for the utility function that promotes the operand of Ext. 3489 using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT, 3490 InstrToOrigTy &PromotedInsts, 3491 unsigned &CreatedInstsCost, 3492 SmallVectorImpl<Instruction *> *Exts, 3493 SmallVectorImpl<Instruction *> *Truncs, 3494 const TargetLowering &TLI); 3495 3496 /// \brief Given a sign/zero extend instruction \p Ext, return the appropriate 3497 /// action to promote the operand of \p Ext instead of using Ext. 3498 /// \return NULL if no promotable action is possible with the current 3499 /// sign extension. 3500 /// \p InsertedInsts keeps track of all the instructions inserted by the 3501 /// other CodeGenPrepare optimizations. This information is important 3502 /// because we do not want to promote these instructions as CodeGenPrepare 3503 /// will reinsert them later, thus creating an infinite loop: create/remove. 3504 /// \p PromotedInsts maps the instructions to their type before promotion.
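/// Illustrative sketch (hypothetical IR): for
///   %a = add nsw i32 %x, %y   ; single use
///   %e = sext i32 %a to i64
/// this returns signExtendOperandForOther, whereas when the operand of the
/// extension is itself a sext, zext or promotable trunc,
/// promoteOperandForTruncAndAnyExt is returned (see the implementation
/// below).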
3505 static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts, 3506 const TargetLowering &TLI, 3507 const InstrToOrigTy &PromotedInsts); 3508 }; 3509 3510 } // end anonymous namespace 3511 3512 bool TypePromotionHelper::canGetThrough(const Instruction *Inst, 3513 Type *ConsideredExtType, 3514 const InstrToOrigTy &PromotedInsts, 3515 bool IsSExt) { 3516 // The promotion helper does not know how to deal with vector types yet. 3517 // To be able to fix that, we would need to fix the places where we 3518 // statically extend, e.g., constants and such. 3519 if (Inst->getType()->isVectorTy()) 3520 return false; 3521 3522 // We can always get through zext. 3523 if (isa<ZExtInst>(Inst)) 3524 return true; 3525 3526 // sext(sext) is ok too. 3527 if (IsSExt && isa<SExtInst>(Inst)) 3528 return true; 3529 3530 // We can get through binary operator, if it is legal. In other words, the 3531 // binary operator must have a nuw or nsw flag. 3532 const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 3533 if (BinOp && isa<OverflowingBinaryOperator>(BinOp) && 3534 ((!IsSExt && BinOp->hasNoUnsignedWrap()) || 3535 (IsSExt && BinOp->hasNoSignedWrap()))) 3536 return true; 3537 3538 // Check if we can do the following simplification. 3539 // ext(trunc(opnd)) --> ext(opnd) 3540 if (!isa<TruncInst>(Inst)) 3541 return false; 3542 3543 Value *OpndVal = Inst->getOperand(0); 3544 // Check if we can use this operand in the extension. 3545 // If the type is larger than the result type of the extension, we cannot. 3546 if (!OpndVal->getType()->isIntegerTy() || 3547 OpndVal->getType()->getIntegerBitWidth() > 3548 ConsideredExtType->getIntegerBitWidth()) 3549 return false; 3550 3551 // If the operand of the truncate is not an instruction, we will not have 3552 // any information on the dropped bits. 3553 // (Actually we could for constant but it is not worth the extra logic). 3554 Instruction *Opnd = dyn_cast<Instruction>(OpndVal); 3555 if (!Opnd) 3556 return false; 3557 3558 // Check if the source of the type is narrow enough. 3559 // I.e., check that trunc just drops extended bits of the same kind of 3560 // the extension. 3561 // #1 get the type of the operand and check the kind of the extended bits. 3562 const Type *OpndType; 3563 InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); 3564 if (It != PromotedInsts.end() && It->second.getInt() == IsSExt) 3565 OpndType = It->second.getPointer(); 3566 else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd))) 3567 OpndType = Opnd->getOperand(0)->getType(); 3568 else 3569 return false; 3570 3571 // #2 check that the truncate just drops extended bits. 3572 return Inst->getType()->getIntegerBitWidth() >= 3573 OpndType->getIntegerBitWidth(); 3574 } 3575 3576 TypePromotionHelper::Action TypePromotionHelper::getAction( 3577 Instruction *Ext, const SetOfInstrs &InsertedInsts, 3578 const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) { 3579 assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && 3580 "Unexpected instruction type"); 3581 Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0)); 3582 Type *ExtTy = Ext->getType(); 3583 bool IsSExt = isa<SExtInst>(Ext); 3584 // If the operand of the extension is not an instruction, we cannot 3585 // get through. 3586 // If it, check we can get through. 3587 if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt)) 3588 return nullptr; 3589 3590 // Do not promote if the operand has been added by codegenprepare. 
3591 // Otherwise, it means we are undoing an optimization that is likely to be 3592 // redone, thus causing potential infinite loop. 3593 if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd)) 3594 return nullptr; 3595 3596 // SExt or Trunc instructions. 3597 // Return the related handler. 3598 if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) || 3599 isa<ZExtInst>(ExtOpnd)) 3600 return promoteOperandForTruncAndAnyExt; 3601 3602 // Regular instruction. 3603 // Abort early if we will have to insert non-free instructions. 3604 if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType())) 3605 return nullptr; 3606 return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther; 3607 } 3608 3609 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( 3610 Instruction *SExt, TypePromotionTransaction &TPT, 3611 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3612 SmallVectorImpl<Instruction *> *Exts, 3613 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { 3614 // By construction, the operand of SExt is an instruction. Otherwise we cannot 3615 // get through it and this method should not be called. 3616 Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); 3617 Value *ExtVal = SExt; 3618 bool HasMergedNonFreeExt = false; 3619 if (isa<ZExtInst>(SExtOpnd)) { 3620 // Replace s|zext(zext(opnd)) 3621 // => zext(opnd). 3622 HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd); 3623 Value *ZExt = 3624 TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); 3625 TPT.replaceAllUsesWith(SExt, ZExt); 3626 TPT.eraseInstruction(SExt); 3627 ExtVal = ZExt; 3628 } else { 3629 // Replace z|sext(trunc(opnd)) or sext(sext(opnd)) 3630 // => z|sext(opnd). 3631 TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); 3632 } 3633 CreatedInstsCost = 0; 3634 3635 // Remove dead code. 3636 if (SExtOpnd->use_empty()) 3637 TPT.eraseInstruction(SExtOpnd); 3638 3639 // Check if the extension is still needed. 3640 Instruction *ExtInst = dyn_cast<Instruction>(ExtVal); 3641 if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { 3642 if (ExtInst) { 3643 if (Exts) 3644 Exts->push_back(ExtInst); 3645 CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt; 3646 } 3647 return ExtVal; 3648 } 3649 3650 // At this point we have: ext ty opnd to ty. 3651 // Reassign the uses of ExtInst to the opnd and remove ExtInst. 3652 Value *NextVal = ExtInst->getOperand(0); 3653 TPT.eraseInstruction(ExtInst, NextVal); 3654 return NextVal; 3655 } 3656 3657 Value *TypePromotionHelper::promoteOperandForOther( 3658 Instruction *Ext, TypePromotionTransaction &TPT, 3659 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, 3660 SmallVectorImpl<Instruction *> *Exts, 3661 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI, 3662 bool IsSExt) { 3663 // By construction, the operand of Ext is an instruction. Otherwise we cannot 3664 // get through it and this method should not be called. 3665 Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0)); 3666 CreatedInstsCost = 0; 3667 if (!ExtOpnd->hasOneUse()) { 3668 // ExtOpnd will be promoted. 3669 // All its uses, but Ext, will need to use a truncated value of the 3670 // promoted version. 3671 // Create the truncate now. 3672 Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType()); 3673 if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) { 3674 // Insert it just after the definition. 
3675 ITrunc->moveAfter(ExtOpnd); 3676 if (Truncs) 3677 Truncs->push_back(ITrunc); 3678 } 3679 3680 TPT.replaceAllUsesWith(ExtOpnd, Trunc); 3681 // Restore the operand of Ext (which has been replaced by the previous call 3682 // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. 3683 TPT.setOperand(Ext, 0, ExtOpnd); 3684 } 3685 3686 // Get through the Instruction: 3687 // 1. Update its type. 3688 // 2. Replace the uses of Ext by Inst. 3689 // 3. Extend each operand that needs to be extended. 3690 3691 // Remember the original type of the instruction before promotion. 3692 // This is useful to know that the high bits are sign extended bits. 3693 PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>( 3694 ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt))); 3695 // Step #1. 3696 TPT.mutateType(ExtOpnd, Ext->getType()); 3697 // Step #2. 3698 TPT.replaceAllUsesWith(Ext, ExtOpnd); 3699 // Step #3. 3700 Instruction *ExtForOpnd = Ext; 3701 3702 DEBUG(dbgs() << "Propagate Ext to operands\n"); 3703 for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx; 3704 ++OpIdx) { 3705 DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n'); 3706 if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() || 3707 !shouldExtOperand(ExtOpnd, OpIdx)) { 3708 DEBUG(dbgs() << "No need to propagate\n"); 3709 continue; 3710 } 3711 // Check if we can statically extend the operand. 3712 Value *Opnd = ExtOpnd->getOperand(OpIdx); 3713 if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) { 3714 DEBUG(dbgs() << "Statically extend\n"); 3715 unsigned BitWidth = Ext->getType()->getIntegerBitWidth(); 3716 APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth) 3717 : Cst->getValue().zext(BitWidth); 3718 TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal)); 3719 continue; 3720 } 3721 // UndefValue are typed, so we have to statically sign extend them. 3722 if (isa<UndefValue>(Opnd)) { 3723 DEBUG(dbgs() << "Statically extend\n"); 3724 TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType())); 3725 continue; 3726 } 3727 3728 // Otherwise we have to explicity sign extend the operand. 3729 // Check if Ext was reused to extend an operand. 3730 if (!ExtForOpnd) { 3731 // If yes, create a new one. 3732 DEBUG(dbgs() << "More operands to ext\n"); 3733 Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) 3734 : TPT.createZExt(Ext, Opnd, Ext->getType()); 3735 if (!isa<Instruction>(ValForExtOpnd)) { 3736 TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); 3737 continue; 3738 } 3739 ExtForOpnd = cast<Instruction>(ValForExtOpnd); 3740 } 3741 if (Exts) 3742 Exts->push_back(ExtForOpnd); 3743 TPT.setOperand(ExtForOpnd, 0, Opnd); 3744 3745 // Move the sign extension before the insertion point. 3746 TPT.moveBefore(ExtForOpnd, ExtOpnd); 3747 TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); 3748 CreatedInstsCost += !TLI.isExtFree(ExtForOpnd); 3749 // If more sext are required, new instructions will have to be created. 3750 ExtForOpnd = nullptr; 3751 } 3752 if (ExtForOpnd == Ext) { 3753 DEBUG(dbgs() << "Extension is useless now\n"); 3754 TPT.eraseInstruction(Ext); 3755 } 3756 return ExtOpnd; 3757 } 3758 3759 /// Check whether or not promoting an instruction to a wider type is profitable. 3760 /// \p NewCost gives the cost of extension instructions created by the 3761 /// promotion. 
3762 /// \p OldCost gives the cost of extension instructions before the promotion 3763 /// plus the number of instructions that have been 3764 /// matched in the addressing mode by the promotion. 3765 /// \p PromotedOperand is the value that has been promoted. 3766 /// \return True if the promotion is profitable, false otherwise. 3767 bool AddressingModeMatcher::isPromotionProfitable( 3768 unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const { 3769 DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n'); 3770 // The cost of the new extensions is greater than the cost of the 3771 // old extension plus what we folded. 3772 // This is not profitable. 3773 if (NewCost > OldCost) 3774 return false; 3775 if (NewCost < OldCost) 3776 return true; 3777 // The promotion is neutral but it may help folding the sign extension in 3778 // loads for instance. 3779 // Check that we did not create an illegal instruction. 3780 return isPromotedInstructionLegal(TLI, DL, PromotedOperand); 3781 } 3782 3783 /// Given an instruction or constant expr, see if we can fold the operation 3784 /// into the addressing mode. If so, update the addressing mode and return 3785 /// true, otherwise return false without modifying AddrMode. 3786 /// If \p MovedAway is not NULL, it is set on success to indicate whether or 3787 /// not AddrInst has been moved away rather than folded into the addressing mode. 3788 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing mode 3789 /// because it has been moved away. 3790 /// Thus AddrInst must not be added to the matched instructions. 3791 /// This state can happen when AddrInst is a sext, since it may be moved away. 3792 /// Therefore, AddrInst may not be valid when MovedAway is true and it must 3793 /// not be referenced anymore. 3794 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, 3795 unsigned Depth, 3796 bool *MovedAway) { 3797 // Avoid exponential behavior on extremely deep expression trees. 3798 if (Depth >= 5) return false; 3799 3800 // By default, all matched instructions stay in place. 3801 if (MovedAway) 3802 *MovedAway = false; 3803 3804 switch (Opcode) { 3805 case Instruction::PtrToInt: 3806 // PtrToInt is always a noop, as we know that the int type is pointer sized. 3807 return matchAddr(AddrInst->getOperand(0), Depth); 3808 case Instruction::IntToPtr: { 3809 auto AS = AddrInst->getType()->getPointerAddressSpace(); 3810 auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); 3811 // This inttoptr is a no-op if the integer type is pointer sized. 3812 if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy) 3813 return matchAddr(AddrInst->getOperand(0), Depth); 3814 return false; 3815 } 3816 case Instruction::BitCast: 3817 // BitCast is always a noop, and we can handle it as long as it is 3818 // int->int or pointer->pointer (we don't want int<->fp or something). 3819 if ((AddrInst->getOperand(0)->getType()->isPointerTy() || 3820 AddrInst->getOperand(0)->getType()->isIntegerTy()) && 3821 // Don't touch identity bitcasts. These were probably put here by LSR, 3822 // and we don't want to mess around with them. Assume it knows what it 3823 // is doing.
3824 AddrInst->getOperand(0)->getType() != AddrInst->getType()) 3825 return matchAddr(AddrInst->getOperand(0), Depth); 3826 return false; 3827 case Instruction::AddrSpaceCast: { 3828 unsigned SrcAS 3829 = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); 3830 unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); 3831 if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) 3832 return matchAddr(AddrInst->getOperand(0), Depth); 3833 return false; 3834 } 3835 case Instruction::Add: { 3836 // Check to see if we can merge in the RHS then the LHS. If so, we win. 3837 ExtAddrMode BackupAddrMode = AddrMode; 3838 unsigned OldSize = AddrModeInsts.size(); 3839 // Start a transaction at this point. 3840 // The LHS may match but not the RHS. 3841 // Therefore, we need a higher level restoration point to undo partially 3842 // matched operation. 3843 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 3844 TPT.getRestorationPoint(); 3845 3846 if (matchAddr(AddrInst->getOperand(1), Depth+1) && 3847 matchAddr(AddrInst->getOperand(0), Depth+1)) 3848 return true; 3849 3850 // Restore the old addr mode info. 3851 AddrMode = BackupAddrMode; 3852 AddrModeInsts.resize(OldSize); 3853 TPT.rollback(LastKnownGood); 3854 3855 // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. 3856 if (matchAddr(AddrInst->getOperand(0), Depth+1) && 3857 matchAddr(AddrInst->getOperand(1), Depth+1)) 3858 return true; 3859 3860 // Otherwise we definitely can't merge the ADD in. 3861 AddrMode = BackupAddrMode; 3862 AddrModeInsts.resize(OldSize); 3863 TPT.rollback(LastKnownGood); 3864 break; 3865 } 3866 //case Instruction::Or: 3867 // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. 3868 //break; 3869 case Instruction::Mul: 3870 case Instruction::Shl: { 3871 // Can only handle X*C and X << C. 3872 ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); 3873 if (!RHS) 3874 return false; 3875 int64_t Scale = RHS->getSExtValue(); 3876 if (Opcode == Instruction::Shl) 3877 Scale = 1LL << Scale; 3878 3879 return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); 3880 } 3881 case Instruction::GetElementPtr: { 3882 // Scan the GEP. We check it if it contains constant offsets and at most 3883 // one variable offset. 3884 int VariableOperand = -1; 3885 unsigned VariableScale = 0; 3886 3887 int64_t ConstantOffset = 0; 3888 gep_type_iterator GTI = gep_type_begin(AddrInst); 3889 for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { 3890 if (StructType *STy = GTI.getStructTypeOrNull()) { 3891 const StructLayout *SL = DL.getStructLayout(STy); 3892 unsigned Idx = 3893 cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); 3894 ConstantOffset += SL->getElementOffset(Idx); 3895 } else { 3896 uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); 3897 if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { 3898 ConstantOffset += CI->getSExtValue()*TypeSize; 3899 } else if (TypeSize) { // Scales of zero don't do anything. 3900 // We only allow one variable index at the moment. 3901 if (VariableOperand != -1) 3902 return false; 3903 3904 // Remember the variable index. 3905 VariableOperand = i; 3906 VariableScale = TypeSize; 3907 } 3908 } 3909 } 3910 3911 // A common case is for the GEP to only do a constant offset. In this case, 3912 // just add it to the disp field and check validity. 
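// Illustrative example (hypothetical IR): for
//   %f = getelementptr %struct.S, %struct.S* %p, i64 0, i32 2
// every index is a ConstantInt, so the loop above folds the entire field
// offset into ConstantOffset and VariableOperand stays -1.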
3913 if (VariableOperand == -1) { 3914 AddrMode.BaseOffs += ConstantOffset; 3915 if (ConstantOffset == 0 || 3916 TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) { 3917 // Check to see if we can fold the base pointer in too. 3918 if (matchAddr(AddrInst->getOperand(0), Depth+1)) 3919 return true; 3920 } 3921 AddrMode.BaseOffs -= ConstantOffset; 3922 return false; 3923 } 3924 3925 // Save the valid addressing mode in case we can't match. 3926 ExtAddrMode BackupAddrMode = AddrMode; 3927 unsigned OldSize = AddrModeInsts.size(); 3928 3929 // See if the scale and offset amount is valid for this target. 3930 AddrMode.BaseOffs += ConstantOffset; 3931 3932 // Match the base operand of the GEP. 3933 if (!matchAddr(AddrInst->getOperand(0), Depth+1)) { 3934 // If it couldn't be matched, just stuff the value in a register. 3935 if (AddrMode.HasBaseReg) { 3936 AddrMode = BackupAddrMode; 3937 AddrModeInsts.resize(OldSize); 3938 return false; 3939 } 3940 AddrMode.HasBaseReg = true; 3941 AddrMode.BaseReg = AddrInst->getOperand(0); 3942 } 3943 3944 // Match the remaining variable portion of the GEP. 3945 if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, 3946 Depth)) { 3947 // If it couldn't be matched, try stuffing the base into a register 3948 // instead of matching it, and retrying the match of the scale. 3949 AddrMode = BackupAddrMode; 3950 AddrModeInsts.resize(OldSize); 3951 if (AddrMode.HasBaseReg) 3952 return false; 3953 AddrMode.HasBaseReg = true; 3954 AddrMode.BaseReg = AddrInst->getOperand(0); 3955 AddrMode.BaseOffs += ConstantOffset; 3956 if (!matchScaledValue(AddrInst->getOperand(VariableOperand), 3957 VariableScale, Depth)) { 3958 // If even that didn't work, bail. 3959 AddrMode = BackupAddrMode; 3960 AddrModeInsts.resize(OldSize); 3961 return false; 3962 } 3963 } 3964 3965 return true; 3966 } 3967 case Instruction::SExt: 3968 case Instruction::ZExt: { 3969 Instruction *Ext = dyn_cast<Instruction>(AddrInst); 3970 if (!Ext) 3971 return false; 3972 3973 // Try to move this ext out of the way of the addressing mode. 3974 // Ask for a method for doing so. 3975 TypePromotionHelper::Action TPH = 3976 TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts); 3977 if (!TPH) 3978 return false; 3979 3980 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 3981 TPT.getRestorationPoint(); 3982 unsigned CreatedInstsCost = 0; 3983 unsigned ExtCost = !TLI.isExtFree(Ext); 3984 Value *PromotedOperand = 3985 TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI); 3986 // SExt has been moved away. 3987 // Thus either it will be rematched later in the recursive calls or it is 3988 // gone. Anyway, we must not fold it into the addressing mode at this point. 3989 // E.g., 3990 // op = add opnd, 1 3991 // idx = ext op 3992 // addr = gep base, idx 3993 // is now: 3994 // promotedOpnd = ext opnd <- no match here 3995 // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls) 3996 // addr = gep base, op <- match 3997 if (MovedAway) 3998 *MovedAway = true; 3999 4000 assert(PromotedOperand && 4001 "TypePromotionHelper should have filtered out those cases"); 4002 4003 ExtAddrMode BackupAddrMode = AddrMode; 4004 unsigned OldSize = AddrModeInsts.size(); 4005 4006 if (!matchAddr(PromotedOperand, Depth) || 4007 // The total of the new cost is equal to the cost of the created 4008 // instructions. 4009 // The total of the old cost is equal to the cost of the extension plus 4010 // what we have saved in the addressing mode. 
4011 !isPromotionProfitable(CreatedInstsCost, 4012 ExtCost + (AddrModeInsts.size() - OldSize), 4013 PromotedOperand)) { 4014 AddrMode = BackupAddrMode; 4015 AddrModeInsts.resize(OldSize); 4016 DEBUG(dbgs() << "Sign extension does not pay off: rollback\n"); 4017 TPT.rollback(LastKnownGood); 4018 return false; 4019 } 4020 return true; 4021 } 4022 } 4023 return false; 4024 } 4025 4026 /// If we can, try to add the value of 'Addr' into the current addressing mode. 4027 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode 4028 /// unmodified. This assumes that Addr is either a pointer type or intptr_t 4029 /// for the target. 4030 /// 4031 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { 4032 // Start a transaction at this point that we will rollback if the matching 4033 // fails. 4034 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4035 TPT.getRestorationPoint(); 4036 if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { 4037 // Fold in immediates if legal for the target. 4038 AddrMode.BaseOffs += CI->getSExtValue(); 4039 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4040 return true; 4041 AddrMode.BaseOffs -= CI->getSExtValue(); 4042 } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { 4043 // If this is a global variable, try to fold it into the addressing mode. 4044 if (!AddrMode.BaseGV) { 4045 AddrMode.BaseGV = GV; 4046 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4047 return true; 4048 AddrMode.BaseGV = nullptr; 4049 } 4050 } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { 4051 ExtAddrMode BackupAddrMode = AddrMode; 4052 unsigned OldSize = AddrModeInsts.size(); 4053 4054 // Check to see if it is possible to fold this operation. 4055 bool MovedAway = false; 4056 if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { 4057 // This instruction may have been moved away. If so, there is nothing 4058 // to check here. 4059 if (MovedAway) 4060 return true; 4061 // Okay, it's possible to fold this. Check to see if it is actually 4062 // *profitable* to do so. We use a simple cost model to avoid increasing 4063 // register pressure too much. 4064 if (I->hasOneUse() || 4065 isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { 4066 AddrModeInsts.push_back(I); 4067 return true; 4068 } 4069 4070 // It isn't profitable to do this, roll back. 4071 //cerr << "NOT FOLDING: " << *I; 4072 AddrMode = BackupAddrMode; 4073 AddrModeInsts.resize(OldSize); 4074 TPT.rollback(LastKnownGood); 4075 } 4076 } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { 4077 if (matchOperationAddr(CE, CE->getOpcode(), Depth)) 4078 return true; 4079 TPT.rollback(LastKnownGood); 4080 } else if (isa<ConstantPointerNull>(Addr)) { 4081 // Null pointer gets folded without affecting the addressing mode. 4082 return true; 4083 } 4084 4085 // Worse case, the target should support [reg] addressing modes. :) 4086 if (!AddrMode.HasBaseReg) { 4087 AddrMode.HasBaseReg = true; 4088 AddrMode.BaseReg = Addr; 4089 // Still check for legality in case the target supports [imm] but not [i+r]. 4090 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4091 return true; 4092 AddrMode.HasBaseReg = false; 4093 AddrMode.BaseReg = nullptr; 4094 } 4095 4096 // If the base register is already taken, see if we can do [r+r]. 
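// That is, try to fall back to an addressing mode of the form
// [BaseReg + Addr*1] by using the scaled-register slot with a scale of 1,
// again subject to TLI.isLegalAddressingMode.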
4097 if (AddrMode.Scale == 0) { 4098 AddrMode.Scale = 1; 4099 AddrMode.ScaledReg = Addr; 4100 if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) 4101 return true; 4102 AddrMode.Scale = 0; 4103 AddrMode.ScaledReg = nullptr; 4104 } 4105 // Couldn't match. 4106 TPT.rollback(LastKnownGood); 4107 return false; 4108 } 4109 4110 /// Check to see if all uses of OpVal by the specified inline asm call are due 4111 /// to memory operands. If so, return true, otherwise return false. 4112 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, 4113 const TargetLowering &TLI, 4114 const TargetRegisterInfo &TRI) { 4115 const Function *F = CI->getFunction(); 4116 TargetLowering::AsmOperandInfoVector TargetConstraints = 4117 TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, 4118 ImmutableCallSite(CI)); 4119 4120 for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { 4121 TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; 4122 4123 // Compute the constraint code and ConstraintType to use. 4124 TLI.ComputeConstraintToUse(OpInfo, SDValue()); 4125 4126 // If this asm operand is our Value*, and if it isn't an indirect memory 4127 // operand, we can't fold it! 4128 if (OpInfo.CallOperandVal == OpVal && 4129 (OpInfo.ConstraintType != TargetLowering::C_Memory || 4130 !OpInfo.isIndirect)) 4131 return false; 4132 } 4133 4134 return true; 4135 } 4136 4137 // Max number of memory uses to look at before aborting the search to conserve 4138 // compile time. 4139 static constexpr int MaxMemoryUsesToScan = 20; 4140 4141 /// Recursively walk all the uses of I until we find a memory use. 4142 /// If we find an obviously non-foldable instruction, return true. 4143 /// Add the ultimately found memory instructions to MemoryUses. 4144 static bool FindAllMemoryUses( 4145 Instruction *I, 4146 SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, 4147 SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI, 4148 const TargetRegisterInfo &TRI, int SeenInsts = 0) { 4149 // If we already considered this instruction, we're done. 4150 if (!ConsideredInsts.insert(I).second) 4151 return false; 4152 4153 // If this is an obviously unfoldable instruction, bail out. 4154 if (!MightBeFoldableInst(I)) 4155 return true; 4156 4157 const bool OptSize = I->getFunction()->optForSize(); 4158 4159 // Loop over all the uses, recursively processing them. 4160 for (Use &U : I->uses()) { 4161 // Conservatively return true if we're seeing a large number or a deep chain 4162 // of users. This avoids excessive compilation times in pathological cases. 4163 if (SeenInsts++ >= MaxMemoryUsesToScan) 4164 return true; 4165 4166 Instruction *UserI = cast<Instruction>(U.getUser()); 4167 if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) { 4168 MemoryUses.push_back(std::make_pair(LI, U.getOperandNo())); 4169 continue; 4170 } 4171 4172 if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { 4173 unsigned opNo = U.getOperandNo(); 4174 if (opNo != StoreInst::getPointerOperandIndex()) 4175 return true; // Storing addr, not into addr. 4176 MemoryUses.push_back(std::make_pair(SI, opNo)); 4177 continue; 4178 } 4179 4180 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { 4181 unsigned opNo = U.getOperandNo(); 4182 if (opNo != AtomicRMWInst::getPointerOperandIndex()) 4183 return true; // Storing addr, not into addr. 
4184 MemoryUses.push_back(std::make_pair(RMW, opNo)); 4185 continue; 4186 } 4187 4188 if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { 4189 unsigned opNo = U.getOperandNo(); 4190 if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) 4191 return true; // Storing addr, not into addr. 4192 MemoryUses.push_back(std::make_pair(CmpX, opNo)); 4193 continue; 4194 } 4195 4196 if (CallInst *CI = dyn_cast<CallInst>(UserI)) { 4197 // If this is a cold call, we can sink the addressing calculation into 4198 // the cold path. See optimizeCallInst 4199 if (!OptSize && CI->hasFnAttr(Attribute::Cold)) 4200 continue; 4201 4202 InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); 4203 if (!IA) return true; 4204 4205 // If this is a memory operand, we're cool, otherwise bail out. 4206 if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) 4207 return true; 4208 continue; 4209 } 4210 4211 if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, 4212 SeenInsts)) 4213 return true; 4214 } 4215 4216 return false; 4217 } 4218 4219 /// Return true if Val is already known to be live at the use site that we're 4220 /// folding it into. If so, there is no cost to include it in the addressing 4221 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the 4222 /// instruction already. 4223 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, 4224 Value *KnownLive2) { 4225 // If Val is either of the known-live values, we know it is live! 4226 if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) 4227 return true; 4228 4229 // All values other than instructions and arguments (e.g. constants) are live. 4230 if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; 4231 4232 // If Val is a constant sized alloca in the entry block, it is live, this is 4233 // true because it is just a reference to the stack/frame pointer, which is 4234 // live for the whole function. 4235 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) 4236 if (AI->isStaticAlloca()) 4237 return true; 4238 4239 // Check to see if this value is already used in the memory instruction's 4240 // block. If so, it's already live into the block at the very least, so we 4241 // can reasonably fold it. 4242 return Val->isUsedInBasicBlock(MemoryInst->getParent()); 4243 } 4244 4245 /// It is possible for the addressing mode of the machine to fold the specified 4246 /// instruction into a load or store that ultimately uses it. 4247 /// However, the specified instruction has multiple uses. 4248 /// Given this, it may actually increase register pressure to fold it 4249 /// into the load. For example, consider this code: 4250 /// 4251 /// X = ... 4252 /// Y = X+1 4253 /// use(Y) -> nonload/store 4254 /// Z = Y+1 4255 /// load Z 4256 /// 4257 /// In this case, Y has multiple uses, and can be folded into the load of Z 4258 /// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to 4259 /// be live at the use(Y) line. If we don't fold Y into load Z, we use one 4260 /// fewer register. Since Y can't be folded into "use(Y)" we don't increase the 4261 /// number of computations either. 4262 /// 4263 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If 4264 /// X was live across 'load Z' for other reasons, we actually *would* want to 4265 /// fold the addressing mode in the Z case. This would make Y die earlier. 
4266 bool AddressingModeMatcher:: 4267 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, 4268 ExtAddrMode &AMAfter) { 4269 if (IgnoreProfitability) return true; 4270 4271 // AMBefore is the addressing mode before this instruction was folded into it, 4272 // and AMAfter is the addressing mode after the instruction was folded. Get 4273 // the set of registers referenced by AMAfter and subtract out those 4274 // referenced by AMBefore: this is the set of values which folding in this 4275 // address extends the lifetime of. 4276 // 4277 // Note that there are only two potential values being referenced here, 4278 // BaseReg and ScaleReg (global addresses are always available, as are any 4279 // folded immediates). 4280 Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; 4281 4282 // If the BaseReg or ScaledReg was referenced by the previous addrmode, their 4283 // lifetime wasn't extended by adding this instruction. 4284 if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) 4285 BaseReg = nullptr; 4286 if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) 4287 ScaledReg = nullptr; 4288 4289 // If folding this instruction (and its subexpressions) didn't extend any live 4290 // ranges, we're ok with it. 4291 if (!BaseReg && !ScaledReg) 4292 return true; 4293 4294 // If all uses of this instruction can have the address mode sunk into them, 4295 // we can remove the addressing mode and effectively trade one live register 4296 // for another (at worst). In this context, folding an addressing mode into 4297 // the use is just a particularly nice way of sinking it. 4298 SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; 4299 SmallPtrSet<Instruction*, 16> ConsideredInsts; 4300 if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) 4301 return false; // Has a non-memory, non-foldable use! 4302 4303 // Now that we know that all uses of this instruction are part of a chain of 4304 // computation involving only operations that could theoretically be folded 4305 // into a memory use, loop over each of these memory operation uses and see 4306 // if they could *actually* fold the instruction. The assumption is that 4307 // addressing modes are cheap and that duplicating the computation involved 4308 // many times is worthwhile, even on a fastpath. For sinking candidates 4309 // (i.e. cold call sites), this serves as a way to prevent excessive code 4310 // growth since most architectures have some reasonably small and fast way to 4311 // compute an effective address (e.g., LEA on x86). 4312 SmallVector<Instruction*, 32> MatchedAddrModeInsts; 4313 for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { 4314 Instruction *User = MemoryUses[i].first; 4315 unsigned OpNo = MemoryUses[i].second; 4316 4317 // Get the access type of this use. If the use isn't a pointer, we don't 4318 // know what it accesses. 4319 Value *Address = User->getOperand(OpNo); 4320 PointerType *AddrTy = dyn_cast<PointerType>(Address->getType()); 4321 if (!AddrTy) 4322 return false; 4323 Type *AddressAccessTy = AddrTy->getElementType(); 4324 unsigned AS = AddrTy->getAddressSpace(); 4325 4326 // Do a match against the root of this address, ignoring profitability. This 4327 // will tell us if the addressing mode for the memory operation will 4328 // *actually* cover the shared instruction.
4329 ExtAddrMode Result; 4330 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4331 TPT.getRestorationPoint(); 4332 AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, 4333 AddressAccessTy, AS, 4334 MemoryInst, Result, InsertedInsts, 4335 PromotedInsts, TPT); 4336 Matcher.IgnoreProfitability = true; 4337 bool Success = Matcher.matchAddr(Address, 0); 4338 (void)Success; assert(Success && "Couldn't select *anything*?"); 4339 4340 // The match was to check the profitability, the changes made are not 4341 // part of the original matcher. Therefore, they should be dropped 4342 // otherwise the original matcher will not present the right state. 4343 TPT.rollback(LastKnownGood); 4344 4345 // If the match didn't cover I, then it won't be shared by it. 4346 if (!is_contained(MatchedAddrModeInsts, I)) 4347 return false; 4348 4349 MatchedAddrModeInsts.clear(); 4350 } 4351 4352 return true; 4353 } 4354 4355 /// Return true if the specified values are defined in a 4356 /// different basic block than BB. 4357 static bool IsNonLocalValue(Value *V, BasicBlock *BB) { 4358 if (Instruction *I = dyn_cast<Instruction>(V)) 4359 return I->getParent() != BB; 4360 return false; 4361 } 4362 4363 /// Sink addressing mode computation immediate before MemoryInst if doing so 4364 /// can be done without increasing register pressure. The need for the 4365 /// register pressure constraint means this can end up being an all or nothing 4366 /// decision for all uses of the same addressing computation. 4367 /// 4368 /// Load and Store Instructions often have addressing modes that can do 4369 /// significant amounts of computation. As such, instruction selection will try 4370 /// to get the load or store to do as much computation as possible for the 4371 /// program. The problem is that isel can only see within a single block. As 4372 /// such, we sink as much legal addressing mode work into the block as possible. 4373 /// 4374 /// This method is used to optimize both load/store and inline asms with memory 4375 /// operands. It's also used to sink addressing computations feeding into cold 4376 /// call sites into their (cold) basic block. 4377 /// 4378 /// The motivation for handling sinking into cold blocks is that doing so can 4379 /// both enable other address mode sinking (by satisfying the register pressure 4380 /// constraint above), and reduce register pressure globally (by removing the 4381 /// addressing mode computation from the fast path entirely.). 4382 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, 4383 Type *AccessTy, unsigned AddrSpace) { 4384 Value *Repl = Addr; 4385 4386 // Try to collapse single-value PHI nodes. This is necessary to undo 4387 // unprofitable PRE transformations. 4388 SmallVector<Value*, 8> worklist; 4389 SmallPtrSet<Value*, 16> Visited; 4390 worklist.push_back(Addr); 4391 4392 // Use a worklist to iteratively look through PHI nodes, and ensure that 4393 // the addressing mode obtained from the non-PHI roots of the graph 4394 // are equivalent. 4395 bool AddrModeFound = false; 4396 bool PhiSeen = false; 4397 SmallVector<Instruction*, 16> AddrModeInsts; 4398 ExtAddrMode AddrMode; 4399 TypePromotionTransaction TPT(RemovedInsts); 4400 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4401 TPT.getRestorationPoint(); 4402 while (!worklist.empty()) { 4403 Value *V = worklist.back(); 4404 worklist.pop_back(); 4405 4406 // We allow traversing cyclic Phi nodes. 
4407 // In case of success after this loop we ensure that traversing through 4408 // Phi nodes ends up with all cases to compute address of the form 4409 // BaseGV + Base + Scale * Index + Offset 4410 // where Scale and Offset are constans and BaseGV, Base and Index 4411 // are exactly the same Values in all cases. 4412 // It means that BaseGV, Scale and Offset dominate our memory instruction 4413 // and have the same value as they had in address computation represented 4414 // as Phi. So we can safely sink address computation to memory instruction. 4415 if (!Visited.insert(V).second) 4416 continue; 4417 4418 // For a PHI node, push all of its incoming values. 4419 if (PHINode *P = dyn_cast<PHINode>(V)) { 4420 for (Value *IncValue : P->incoming_values()) 4421 worklist.push_back(IncValue); 4422 PhiSeen = true; 4423 continue; 4424 } 4425 4426 // For non-PHIs, determine the addressing mode being computed. Note that 4427 // the result may differ depending on what other uses our candidate 4428 // addressing instructions might have. 4429 AddrModeInsts.clear(); 4430 ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( 4431 V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, 4432 InsertedInsts, PromotedInsts, TPT); 4433 4434 if (!AddrModeFound) { 4435 AddrModeFound = true; 4436 AddrMode = NewAddrMode; 4437 continue; 4438 } 4439 if (NewAddrMode == AddrMode) 4440 continue; 4441 4442 AddrModeFound = false; 4443 break; 4444 } 4445 4446 // If the addressing mode couldn't be determined, or if multiple different 4447 // ones were determined, bail out now. 4448 if (!AddrModeFound) { 4449 TPT.rollback(LastKnownGood); 4450 return false; 4451 } 4452 TPT.commit(); 4453 4454 // If all the instructions matched are already in this BB, don't do anything. 4455 // If we saw Phi node then it is not local definitely. 4456 if (!PhiSeen && none_of(AddrModeInsts, [&](Value *V) { 4457 return IsNonLocalValue(V, MemoryInst->getParent()); 4458 })) { 4459 DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); 4460 return false; 4461 } 4462 4463 // Insert this computation right after this user. Since our caller is 4464 // scanning from the top of the BB to the bottom, reuse of the expr are 4465 // guaranteed to happen later. 4466 IRBuilder<> Builder(MemoryInst); 4467 4468 // Now that we determined the addressing expression we want to use and know 4469 // that we have to sink it into this block. Check to see if we have already 4470 // done this for some other load/store instr in this block. If so, reuse the 4471 // computation. 4472 Value *&SunkAddr = SunkAddrs[Addr]; 4473 if (SunkAddr) { 4474 DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " 4475 << *MemoryInst << "\n"); 4476 if (SunkAddr->getType() != Addr->getType()) 4477 SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); 4478 } else if (AddrSinkUsingGEPs || 4479 (!AddrSinkUsingGEPs.getNumOccurrences() && TM && 4480 SubtargetInfo->useAA())) { 4481 // By default, we use the GEP-based method when AA is used later. This 4482 // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. 4483 DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " 4484 << *MemoryInst << "\n"); 4485 Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); 4486 Value *ResultPtr = nullptr, *ResultIndex = nullptr; 4487 4488 // First, find the pointer. 
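// The sunk address is ultimately emitted as a GEP of the form
//   getelementptr i8, i8* ResultPtr, ResultIndex
// (see the CreateGEP calls below), so look for a pointer-typed component of
// the addressing mode to serve as ResultPtr first.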
4489 if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) { 4490 ResultPtr = AddrMode.BaseReg; 4491 AddrMode.BaseReg = nullptr; 4492 } 4493 4494 if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) { 4495 // We can't add more than one pointer together, nor can we scale a 4496 // pointer (both of which seem meaningless). 4497 if (ResultPtr || AddrMode.Scale != 1) 4498 return false; 4499 4500 ResultPtr = AddrMode.ScaledReg; 4501 AddrMode.Scale = 0; 4502 } 4503 4504 // It is only safe to sign extend the BaseReg if we know that the math 4505 // required to create it did not overflow before we extend it. Since 4506 // the original IR value was tossed in favor of a constant back when 4507 // the AddrMode was created we need to bail out gracefully if widths 4508 // do not match instead of extending it. 4509 // 4510 // (See below for code to add the scale.) 4511 if (AddrMode.Scale) { 4512 Type *ScaledRegTy = AddrMode.ScaledReg->getType(); 4513 if (cast<IntegerType>(IntPtrTy)->getBitWidth() > 4514 cast<IntegerType>(ScaledRegTy)->getBitWidth()) 4515 return false; 4516 } 4517 4518 if (AddrMode.BaseGV) { 4519 if (ResultPtr) 4520 return false; 4521 4522 ResultPtr = AddrMode.BaseGV; 4523 } 4524 4525 // If the real base value actually came from an inttoptr, then the matcher 4526 // will look through it and provide only the integer value. In that case, 4527 // use it here. 4528 if (!DL->isNonIntegralPointerType(Addr->getType())) { 4529 if (!ResultPtr && AddrMode.BaseReg) { 4530 ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), 4531 "sunkaddr"); 4532 AddrMode.BaseReg = nullptr; 4533 } else if (!ResultPtr && AddrMode.Scale == 1) { 4534 ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), 4535 "sunkaddr"); 4536 AddrMode.Scale = 0; 4537 } 4538 } 4539 4540 if (!ResultPtr && 4541 !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { 4542 SunkAddr = Constant::getNullValue(Addr->getType()); 4543 } else if (!ResultPtr) { 4544 return false; 4545 } else { 4546 Type *I8PtrTy = 4547 Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); 4548 Type *I8Ty = Builder.getInt8Ty(); 4549 4550 // Start with the base register. Do this first so that subsequent address 4551 // matching finds it last, which will prevent it from trying to match it 4552 // as the scaled value in case it happens to be a mul. That would be 4553 // problematic if we've sunk a different mul for the scale, because then 4554 // we'd end up sinking both muls. 4555 if (AddrMode.BaseReg) { 4556 Value *V = AddrMode.BaseReg; 4557 if (V->getType() != IntPtrTy) 4558 V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); 4559 4560 ResultIndex = V; 4561 } 4562 4563 // Add the scale value. 4564 if (AddrMode.Scale) { 4565 Value *V = AddrMode.ScaledReg; 4566 if (V->getType() == IntPtrTy) { 4567 // done. 4568 } else { 4569 assert(cast<IntegerType>(IntPtrTy)->getBitWidth() < 4570 cast<IntegerType>(V->getType())->getBitWidth() && 4571 "We can't transform if ScaledReg is too narrow"); 4572 V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); 4573 } 4574 4575 if (AddrMode.Scale != 1) 4576 V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), 4577 "sunkaddr"); 4578 if (ResultIndex) 4579 ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr"); 4580 else 4581 ResultIndex = V; 4582 } 4583 4584 // Add in the Base Offset if present. 
4585 if (AddrMode.BaseOffs) { 4586 Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); 4587 if (ResultIndex) { 4588 // We need to add this separately from the scale above to help with 4589 // SDAG consecutive load/store merging. 4590 if (ResultPtr->getType() != I8PtrTy) 4591 ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); 4592 ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); 4593 } 4594 4595 ResultIndex = V; 4596 } 4597 4598 if (!ResultIndex) { 4599 SunkAddr = ResultPtr; 4600 } else { 4601 if (ResultPtr->getType() != I8PtrTy) 4602 ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); 4603 SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); 4604 } 4605 4606 if (SunkAddr->getType() != Addr->getType()) 4607 SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); 4608 } 4609 } else { 4610 // We'd require a ptrtoint/inttoptr down the line, which we can't do for 4611 // non-integral pointers, so in that case bail out now. 4612 Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr; 4613 Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr; 4614 PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); 4615 PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); 4616 if (DL->isNonIntegralPointerType(Addr->getType()) || 4617 (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) || 4618 (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || 4619 (AddrMode.BaseGV && 4620 DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) 4621 return false; 4622 4623 DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " 4624 << *MemoryInst << "\n"); 4625 Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); 4626 Value *Result = nullptr; 4627 4628 // Start with the base register. Do this first so that subsequent address 4629 // matching finds it last, which will prevent it from trying to match it 4630 // as the scaled value in case it happens to be a mul. That would be 4631 // problematic if we've sunk a different mul for the scale, because then 4632 // we'd end up sinking both muls. 4633 if (AddrMode.BaseReg) { 4634 Value *V = AddrMode.BaseReg; 4635 if (V->getType()->isPointerTy()) 4636 V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); 4637 if (V->getType() != IntPtrTy) 4638 V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); 4639 Result = V; 4640 } 4641 4642 // Add the scale value. 4643 if (AddrMode.Scale) { 4644 Value *V = AddrMode.ScaledReg; 4645 if (V->getType() == IntPtrTy) { 4646 // done. 4647 } else if (V->getType()->isPointerTy()) { 4648 V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); 4649 } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < 4650 cast<IntegerType>(V->getType())->getBitWidth()) { 4651 V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); 4652 } else { 4653 // It is only safe to sign extend the BaseReg if we know that the math 4654 // required to create it did not overflow before we extend it. Since 4655 // the original IR value was tossed in favor of a constant back when 4656 // the AddrMode was created we need to bail out gracefully if widths 4657 // do not match instead of extending it. 
4658 Instruction *I = dyn_cast_or_null<Instruction>(Result); 4659 if (I && (Result != AddrMode.BaseReg)) 4660 I->eraseFromParent(); 4661 return false; 4662 } 4663 if (AddrMode.Scale != 1) 4664 V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), 4665 "sunkaddr"); 4666 if (Result) 4667 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 4668 else 4669 Result = V; 4670 } 4671 4672 // Add in the BaseGV if present. 4673 if (AddrMode.BaseGV) { 4674 Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); 4675 if (Result) 4676 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 4677 else 4678 Result = V; 4679 } 4680 4681 // Add in the Base Offset if present. 4682 if (AddrMode.BaseOffs) { 4683 Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); 4684 if (Result) 4685 Result = Builder.CreateAdd(Result, V, "sunkaddr"); 4686 else 4687 Result = V; 4688 } 4689 4690 if (!Result) 4691 SunkAddr = Constant::getNullValue(Addr->getType()); 4692 else 4693 SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); 4694 } 4695 4696 MemoryInst->replaceUsesOfWith(Repl, SunkAddr); 4697 4698 // If we have no uses, recursively delete the value and all dead instructions 4699 // using it. 4700 if (Repl->use_empty()) { 4701 // This can cause recursive deletion, which can invalidate our iterator. 4702 // Use a WeakTrackingVH to hold onto it in case this happens. 4703 Value *CurValue = &*CurInstIterator; 4704 WeakTrackingVH IterHandle(CurValue); 4705 BasicBlock *BB = CurInstIterator->getParent(); 4706 4707 RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); 4708 4709 if (IterHandle != CurValue) { 4710 // If the iterator instruction was recursively deleted, start over at the 4711 // start of the block. 4712 CurInstIterator = BB->begin(); 4713 SunkAddrs.clear(); 4714 } 4715 } 4716 ++NumMemoryInsts; 4717 return true; 4718 } 4719 4720 /// If there are any memory operands, use OptimizeMemoryInst to sink their 4721 /// address computing into the block when possible / profitable. 4722 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { 4723 bool MadeChange = false; 4724 4725 const TargetRegisterInfo *TRI = 4726 TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); 4727 TargetLowering::AsmOperandInfoVector TargetConstraints = 4728 TLI->ParseConstraints(*DL, TRI, CS); 4729 unsigned ArgNo = 0; 4730 for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { 4731 TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; 4732 4733 // Compute the constraint code and ConstraintType to use. 4734 TLI->ComputeConstraintToUse(OpInfo, SDValue()); 4735 4736 if (OpInfo.ConstraintType == TargetLowering::C_Memory && 4737 OpInfo.isIndirect) { 4738 Value *OpVal = CS->getArgOperand(ArgNo++); 4739 MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u); 4740 } else if (OpInfo.Type == InlineAsm::isInput) 4741 ArgNo++; 4742 } 4743 4744 return MadeChange; 4745 } 4746 4747 /// \brief Check if all the uses of \p Val are equivalent (or free) zero or 4748 /// sign extensions. 
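/// For example (illustrative), if \p Val has exactly the two users
/// \code
///   %e1 = zext i8 %val to i32
///   %e2 = zext i8 %val to i64
/// \endcode
/// this returns true when the target reports the zext from i32 to i64 as free,
/// since %e2 can then be rebuilt from %e1 at no cost; mixing sext and zext
/// users, or having two sexts to different widths, returns false.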
4749 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { 4750 assert(!Val->use_empty() && "Input must have at least one use"); 4751 const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); 4752 bool IsSExt = isa<SExtInst>(FirstUser); 4753 Type *ExtTy = FirstUser->getType(); 4754 for (const User *U : Val->users()) { 4755 const Instruction *UI = cast<Instruction>(U); 4756 if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) 4757 return false; 4758 Type *CurTy = UI->getType(); 4759 // Same input and output types: Same instruction after CSE. 4760 if (CurTy == ExtTy) 4761 continue; 4762 4763 // If IsSExt is true, we are in this situation: 4764 // a = Val 4765 // b = sext ty1 a to ty2 4766 // c = sext ty1 a to ty3 4767 // Assuming ty2 is shorter than ty3, this could be turned into: 4768 // a = Val 4769 // b = sext ty1 a to ty2 4770 // c = sext ty2 b to ty3 4771 // However, the last sext is not free. 4772 if (IsSExt) 4773 return false; 4774 4775 // This is a ZExt, maybe this is free to extend from one type to another. 4776 // In that case, we would not account for a different use. 4777 Type *NarrowTy; 4778 Type *LargeTy; 4779 if (ExtTy->getScalarType()->getIntegerBitWidth() > 4780 CurTy->getScalarType()->getIntegerBitWidth()) { 4781 NarrowTy = CurTy; 4782 LargeTy = ExtTy; 4783 } else { 4784 NarrowTy = ExtTy; 4785 LargeTy = CurTy; 4786 } 4787 4788 if (!TLI.isZExtFree(NarrowTy, LargeTy)) 4789 return false; 4790 } 4791 // All uses are the same or can be derived from one another for free. 4792 return true; 4793 } 4794 4795 /// \brief Try to speculatively promote extensions in \p Exts and continue 4796 /// promoting through newly promoted operands recursively as far as doing so is 4797 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. 4798 /// When some promotion happened, \p TPT contains the proper state to revert 4799 /// them. 4800 /// 4801 /// \return true if some promotion happened, false otherwise. 4802 bool CodeGenPrepare::tryToPromoteExts( 4803 TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, 4804 SmallVectorImpl<Instruction *> &ProfitablyMovedExts, 4805 unsigned CreatedInstsCost) { 4806 bool Promoted = false; 4807 4808 // Iterate over all the extensions to try to promote them. 4809 for (auto I : Exts) { 4810 // Early check if we directly have ext(load). 4811 if (isa<LoadInst>(I->getOperand(0))) { 4812 ProfitablyMovedExts.push_back(I); 4813 continue; 4814 } 4815 4816 // Check whether or not we want to do any promotion. The reason we have 4817 // this check inside the for loop is to catch the case where an extension 4818 // is directly fed by a load because in such case the extension can be moved 4819 // up without any promotion on its operands. 4820 if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) 4821 return false; 4822 4823 // Get the action to perform the promotion. 4824 TypePromotionHelper::Action TPH = 4825 TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); 4826 // Check if we can promote. 4827 if (!TPH) { 4828 // Save the current extension as we cannot move up through its operand. 4829 ProfitablyMovedExts.push_back(I); 4830 continue; 4831 } 4832 4833 // Save the current state. 4834 TypePromotionTransaction::ConstRestorationPt LastKnownGood = 4835 TPT.getRestorationPoint(); 4836 SmallVector<Instruction *, 4> NewExts; 4837 unsigned NewCreatedInstsCost = 0; 4838 unsigned ExtCost = !TLI->isExtFree(I); 4839 // Promote. 
4840 Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
4841 &NewExts, nullptr, *TLI);
4842 assert(PromotedVal &&
4843 "TypePromotionHelper should have filtered out those cases");
4844
4845 // We can merge at most one extension into a load.
4846 // Therefore, if we have more than 1 new extension we heuristically
4847 // cut this search path, because it means we degrade the code quality.
4848 // With exactly 2, the transformation is neutral, because we will merge
4849 // one extension but leave one. However, we optimistically keep going,
4850 // because the new extension may be removed too.
4851 long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
4852 // FIXME: It would be possible to propagate a negative value instead of
4853 // conservatively ceiling it to 0.
4854 TotalCreatedInstsCost =
4855 std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
4856 if (!StressExtLdPromotion &&
4857 (TotalCreatedInstsCost > 1 ||
4858 !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
4859 // This promotion is not profitable, roll back to the previous state, and
4860 // save the current extension in ProfitablyMovedExts as the latest
4861 // speculative promotion turned out to be unprofitable.
4862 TPT.rollback(LastKnownGood);
4863 ProfitablyMovedExts.push_back(I);
4864 continue;
4865 }
4866 // Continue promoting NewExts as far as doing so is profitable.
4867 SmallVector<Instruction *, 2> NewlyMovedExts;
4868 (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
4869 bool NewPromoted = false;
4870 for (auto ExtInst : NewlyMovedExts) {
4871 Instruction *MovedExt = cast<Instruction>(ExtInst);
4872 Value *ExtOperand = MovedExt->getOperand(0);
4873 // If we have reached a load, we need this extra profitability check
4874 // as it could potentially be merged into an ext(load).
4875 if (isa<LoadInst>(ExtOperand) &&
4876 !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
4877 (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
4878 continue;
4879
4880 ProfitablyMovedExts.push_back(MovedExt);
4881 NewPromoted = true;
4882 }
4883
4884 // If none of the speculative promotions for NewExts is profitable, roll back
4885 // and save the current extension (I) as the last profitable extension.
4886 if (!NewPromoted) {
4887 TPT.rollback(LastKnownGood);
4888 ProfitablyMovedExts.push_back(I);
4889 continue;
4890 }
4891 // The promotion is profitable.
4892 Promoted = true;
4893 }
4894 return Promoted;
4895 }
4896
4897 /// Merge redundant sexts when one dominates the other.
4898 bool CodeGenPrepare::mergeSExts(Function &F) {
4899 DominatorTree DT(F);
4900 bool Changed = false;
4901 for (auto &Entry : ValToSExtendedUses) {
4902 SExts &Insts = Entry.second;
4903 SExts CurPts;
4904 for (Instruction *Inst : Insts) {
4905 if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
4906 Inst->getOperand(0) != Entry.first)
4907 continue;
4908 bool inserted = false;
4909 for (auto &Pt : CurPts) {
4910 if (DT.dominates(Inst, Pt)) {
4911 Pt->replaceAllUsesWith(Inst);
4912 RemovedInsts.insert(Pt);
4913 Pt->removeFromParent();
4914 Pt = Inst;
4915 inserted = true;
4916 Changed = true;
4917 break;
4918 }
4919 if (!DT.dominates(Pt, Inst))
4920 // Give up if we need to merge in a common dominator as the
4921 // experiments show it is not profitable.
4922 continue;
4923 Inst->replaceAllUsesWith(Pt);
4924 RemovedInsts.insert(Inst);
4925 Inst->removeFromParent();
4926 inserted = true;
4927 Changed = true;
4928 break;
4929 }
4930 if (!inserted)
4931 CurPts.push_back(Inst);
4932 }
4933 }
4934 return Changed;
4935 }
4936
4937 /// Return true if an ext(load) can be formed from an extension in
4938 /// \p MovedExts.
4939 bool CodeGenPrepare::canFormExtLd(
4940 const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
4941 Instruction *&Inst, bool HasPromoted) {
4942 for (auto *MovedExtInst : MovedExts) {
4943 if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
4944 LI = cast<LoadInst>(MovedExtInst->getOperand(0));
4945 Inst = MovedExtInst;
4946 break;
4947 }
4948 }
4949 if (!LI)
4950 return false;
4951
4952 // If they're already in the same block, there's nothing to do.
4953 // Make the cheap checks first if we did not promote.
4954 // If we promoted, we need to check if it is indeed profitable.
4955 if (!HasPromoted && LI->getParent() == Inst->getParent())
4956 return false;
4957
4958 return TLI->isExtLoad(LI, Inst, *DL);
4959 }
4960
4961 /// Move a zext or sext fed by a load into the same basic block as the load,
4962 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
4963 /// extend into the load.
4964 ///
4965 /// E.g.,
4966 /// \code
4967 /// %ld = load i32* %addr
4968 /// %add = add nuw i32 %ld, 4
4969 /// %zext = zext i32 %add to i64
4970 /// \endcode
4971 /// =>
4972 /// \code
4973 /// %ld = load i32* %addr
4974 /// %zext = zext i32 %ld to i64
4975 /// %add = add nuw i64 %zext, 4
4976 /// \endcode
4977 /// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
4978 /// allows us to match zext(load i32*) to i64.
4979 ///
4980 /// Also, try to promote the computations used to obtain a sign extended
4981 /// value used in memory accesses.
4982 /// E.g.,
4983 /// \code
4984 /// a = add nsw i32 b, 3
4985 /// d = sext i32 a to i64
4986 /// e = getelementptr ..., i64 d
4987 /// \endcode
4988 /// =>
4989 /// \code
4990 /// f = sext i32 b to i64
4991 /// a = add nsw i64 f, 3
4992 /// e = getelementptr ..., i64 a
4993 /// \endcode
4994 ///
4995 /// \p Inst[in/out] the extension may be modified during the process if some
4996 /// promotions apply.
4997 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
4998 // The ExtLoad formation and address type promotion infrastructure requires
4999 // TLI to be effective.
5000 if (!TLI)
5001 return false;
5002
5003 bool AllowPromotionWithoutCommonHeader = false;
5004 /// See if it is an interesting sext operation for the address type
5005 /// promotion before trying to promote it, e.g., one with the right
5006 /// type and used in memory accesses.
5007 bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
5008 *Inst, AllowPromotionWithoutCommonHeader);
5009 TypePromotionTransaction TPT(RemovedInsts);
5010 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5011 TPT.getRestorationPoint();
5012 SmallVector<Instruction *, 1> Exts;
5013 SmallVector<Instruction *, 2> SpeculativelyMovedExts;
5014 Exts.push_back(Inst);
5015
5016 bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
5017
5018 // Look for a load being extended.
5019 LoadInst *LI = nullptr;
5020 Instruction *ExtFedByLoad;
5021
5022 // Try to promote a chain of computation if doing so allows an extended
5023 // load to be formed.
5024 if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
5025 assert(LI && ExtFedByLoad && "Expect a valid load and extension");
5026 TPT.commit();
5027 // Move the extend into the same block as the load.
5028 ExtFedByLoad->moveAfter(LI);
5029 // CGP does not check if the zext would be speculatively executed when moved
5030 // to the same basic block as the load. Preserving its original location
5031 // would pessimize the debugging experience, as well as negatively impact
5032 // the quality of sample pgo. We don't want to use "line 0" as that has a
5033 // size cost in the line-table section and logically the zext can be seen as
5034 // part of the load. Therefore we conservatively reuse the same debug
5035 // location for the load and the zext.
5036 ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
5037 ++NumExtsMoved;
5038 Inst = ExtFedByLoad;
5039 return true;
5040 }
5041
5042 // Continue promoting SExts if the target considers address type promotion worthwhile.
5043 if (ATPConsiderable &&
5044 performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
5045 HasPromoted, TPT, SpeculativelyMovedExts))
5046 return true;
5047
5048 TPT.rollback(LastKnownGood);
5049 return false;
5050 }
5051
5052 // Perform address type promotion if doing so is profitable.
5053 // If AllowPromotionWithoutCommonHeader == false, we also need to find other
5054 // sext instructions that sign extended the same initial value. However, if
5055 // AllowPromotionWithoutCommonHeader == true, we expect promoting the
5056 // extension to be profitable on its own.
5057 bool CodeGenPrepare::performAddressTypePromotion(
5058 Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
5059 bool HasPromoted, TypePromotionTransaction &TPT,
5060 SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
5061 bool Promoted = false;
5062 SmallPtrSet<Instruction *, 1> UnhandledExts;
5063 bool AllSeenFirst = true;
5064 for (auto I : SpeculativelyMovedExts) {
5065 Value *HeadOfChain = I->getOperand(0);
5066 DenseMap<Value *, Instruction *>::iterator AlreadySeen =
5067 SeenChainsForSExt.find(HeadOfChain);
5068 // If there is an unhandled SExt which has the same header, try to promote
5069 // it as well.
5070 if (AlreadySeen != SeenChainsForSExt.end()) {
5071 if (AlreadySeen->second != nullptr)
5072 UnhandledExts.insert(AlreadySeen->second);
5073 AllSeenFirst = false;
5074 }
5075 }
5076
5077 if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
5078 SpeculativelyMovedExts.size() == 1)) {
5079 TPT.commit();
5080 if (HasPromoted)
5081 Promoted = true;
5082 for (auto I : SpeculativelyMovedExts) {
5083 Value *HeadOfChain = I->getOperand(0);
5084 SeenChainsForSExt[HeadOfChain] = nullptr;
5085 ValToSExtendedUses[HeadOfChain].push_back(I);
5086 }
5087 // Update Inst as promotion happened.
5088 Inst = SpeculativelyMovedExts.pop_back_val();
5089 } else {
5090 // This is the first chain visited from the header; keep the current chain
5091 // as unhandled. Defer promoting it until we encounter another SExt
5092 // chain derived from the same header.
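// (Illustrative, hypothetical names: the first chain rooted at %x records
//  SeenChainsForSExt[%x] = Inst here; when a later sext chain with the same
//  root %x is visited, that recorded instruction is added to UnhandledExts
//  and promoted by the loop at the end of this function.)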
5093 for (auto I : SpeculativelyMovedExts) { 5094 Value *HeadOfChain = I->getOperand(0); 5095 SeenChainsForSExt[HeadOfChain] = Inst; 5096 } 5097 return false; 5098 } 5099 5100 if (!AllSeenFirst && !UnhandledExts.empty()) 5101 for (auto VisitedSExt : UnhandledExts) { 5102 if (RemovedInsts.count(VisitedSExt)) 5103 continue; 5104 TypePromotionTransaction TPT(RemovedInsts); 5105 SmallVector<Instruction *, 1> Exts; 5106 SmallVector<Instruction *, 2> Chains; 5107 Exts.push_back(VisitedSExt); 5108 bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); 5109 TPT.commit(); 5110 if (HasPromoted) 5111 Promoted = true; 5112 for (auto I : Chains) { 5113 Value *HeadOfChain = I->getOperand(0); 5114 // Mark this as handled. 5115 SeenChainsForSExt[HeadOfChain] = nullptr; 5116 ValToSExtendedUses[HeadOfChain].push_back(I); 5117 } 5118 } 5119 return Promoted; 5120 } 5121 5122 bool CodeGenPrepare::optimizeExtUses(Instruction *I) { 5123 BasicBlock *DefBB = I->getParent(); 5124 5125 // If the result of a {s|z}ext and its source are both live out, rewrite all 5126 // other uses of the source with result of extension. 5127 Value *Src = I->getOperand(0); 5128 if (Src->hasOneUse()) 5129 return false; 5130 5131 // Only do this xform if truncating is free. 5132 if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) 5133 return false; 5134 5135 // Only safe to perform the optimization if the source is also defined in 5136 // this block. 5137 if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent()) 5138 return false; 5139 5140 bool DefIsLiveOut = false; 5141 for (User *U : I->users()) { 5142 Instruction *UI = cast<Instruction>(U); 5143 5144 // Figure out which BB this ext is used in. 5145 BasicBlock *UserBB = UI->getParent(); 5146 if (UserBB == DefBB) continue; 5147 DefIsLiveOut = true; 5148 break; 5149 } 5150 if (!DefIsLiveOut) 5151 return false; 5152 5153 // Make sure none of the uses are PHI nodes. 5154 for (User *U : Src->users()) { 5155 Instruction *UI = cast<Instruction>(U); 5156 BasicBlock *UserBB = UI->getParent(); 5157 if (UserBB == DefBB) continue; 5158 // Be conservative. We don't want this xform to end up introducing 5159 // reloads just before load / store instructions. 5160 if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI)) 5161 return false; 5162 } 5163 5164 // InsertedTruncs - Only insert one trunc in each block once. 5165 DenseMap<BasicBlock*, Instruction*> InsertedTruncs; 5166 5167 bool MadeChange = false; 5168 for (Use &U : Src->uses()) { 5169 Instruction *User = cast<Instruction>(U.getUser()); 5170 5171 // Figure out which BB this ext is used in. 5172 BasicBlock *UserBB = User->getParent(); 5173 if (UserBB == DefBB) continue; 5174 5175 // Both src and def are live in this block. Rewrite the use. 5176 Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; 5177 5178 if (!InsertedTrunc) { 5179 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); 5180 assert(InsertPt != UserBB->end()); 5181 InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); 5182 InsertedInsts.insert(InsertedTrunc); 5183 } 5184 5185 // Replace a use of the {s|z}ext source with a use of the result. 5186 U = InsertedTrunc; 5187 ++NumExtUses; 5188 MadeChange = true; 5189 } 5190 5191 return MadeChange; 5192 } 5193 5194 // Find loads whose uses only use some of the loaded value's bits. 
Add an "and" 5195 // just after the load if the target can fold this into one extload instruction, 5196 // with the hope of eliminating some of the other later "and" instructions using 5197 // the loaded value. "and"s that are made trivially redundant by the insertion 5198 // of the new "and" are removed by this function, while others (e.g. those whose 5199 // path from the load goes through a phi) are left for isel to potentially 5200 // remove. 5201 // 5202 // For example: 5203 // 5204 // b0: 5205 // x = load i32 5206 // ... 5207 // b1: 5208 // y = and x, 0xff 5209 // z = use y 5210 // 5211 // becomes: 5212 // 5213 // b0: 5214 // x = load i32 5215 // x' = and x, 0xff 5216 // ... 5217 // b1: 5218 // z = use x' 5219 // 5220 // whereas: 5221 // 5222 // b0: 5223 // x1 = load i32 5224 // ... 5225 // b1: 5226 // x2 = load i32 5227 // ... 5228 // b2: 5229 // x = phi x1, x2 5230 // y = and x, 0xff 5231 // 5232 // becomes (after a call to optimizeLoadExt for each load): 5233 // 5234 // b0: 5235 // x1 = load i32 5236 // x1' = and x1, 0xff 5237 // ... 5238 // b1: 5239 // x2 = load i32 5240 // x2' = and x2, 0xff 5241 // ... 5242 // b2: 5243 // x = phi x1', x2' 5244 // y = and x, 0xff 5245 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { 5246 if (!Load->isSimple() || 5247 !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) 5248 return false; 5249 5250 // Skip loads we've already transformed. 5251 if (Load->hasOneUse() && 5252 InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) 5253 return false; 5254 5255 // Look at all uses of Load, looking through phis, to determine how many bits 5256 // of the loaded value are needed. 5257 SmallVector<Instruction *, 8> WorkList; 5258 SmallPtrSet<Instruction *, 16> Visited; 5259 SmallVector<Instruction *, 8> AndsToMaybeRemove; 5260 for (auto *U : Load->users()) 5261 WorkList.push_back(cast<Instruction>(U)); 5262 5263 EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); 5264 unsigned BitWidth = LoadResultVT.getSizeInBits(); 5265 APInt DemandBits(BitWidth, 0); 5266 APInt WidestAndBits(BitWidth, 0); 5267 5268 while (!WorkList.empty()) { 5269 Instruction *I = WorkList.back(); 5270 WorkList.pop_back(); 5271 5272 // Break use-def graph loops. 5273 if (!Visited.insert(I).second) 5274 continue; 5275 5276 // For a PHI node, push all of its users. 5277 if (auto *Phi = dyn_cast<PHINode>(I)) { 5278 for (auto *U : Phi->users()) 5279 WorkList.push_back(cast<Instruction>(U)); 5280 continue; 5281 } 5282 5283 switch (I->getOpcode()) { 5284 case Instruction::And: { 5285 auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1)); 5286 if (!AndC) 5287 return false; 5288 APInt AndBits = AndC->getValue(); 5289 DemandBits |= AndBits; 5290 // Keep track of the widest and mask we see. 
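// (Illustrative: given users 'and %x, 255' and 'and %x, 65535' of a loaded
//  value %x, DemandBits and WidestAndBits both end up as 0xffff, and only an
//  and whose mask equals that final DemandBits value is removed further
//  below; the narrower and is left for isel.)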
5291 if (AndBits.ugt(WidestAndBits)) 5292 WidestAndBits = AndBits; 5293 if (AndBits == WidestAndBits && I->getOperand(0) == Load) 5294 AndsToMaybeRemove.push_back(I); 5295 break; 5296 } 5297 5298 case Instruction::Shl: { 5299 auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1)); 5300 if (!ShlC) 5301 return false; 5302 uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); 5303 DemandBits.setLowBits(BitWidth - ShiftAmt); 5304 break; 5305 } 5306 5307 case Instruction::Trunc: { 5308 EVT TruncVT = TLI->getValueType(*DL, I->getType()); 5309 unsigned TruncBitWidth = TruncVT.getSizeInBits(); 5310 DemandBits.setLowBits(TruncBitWidth); 5311 break; 5312 } 5313 5314 default: 5315 return false; 5316 } 5317 } 5318 5319 uint32_t ActiveBits = DemandBits.getActiveBits(); 5320 // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the 5321 // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, 5322 // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but 5323 // (and (load x) 1) is not matched as a single instruction, rather as a LDR 5324 // followed by an AND. 5325 // TODO: Look into removing this restriction by fixing backends to either 5326 // return false for isLoadExtLegal for i1 or have them select this pattern to 5327 // a single instruction. 5328 // 5329 // Also avoid hoisting if we didn't see any ands with the exact DemandBits 5330 // mask, since these are the only ands that will be removed by isel. 5331 if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) || 5332 WidestAndBits != DemandBits) 5333 return false; 5334 5335 LLVMContext &Ctx = Load->getType()->getContext(); 5336 Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); 5337 EVT TruncVT = TLI->getValueType(*DL, TruncTy); 5338 5339 // Reject cases that won't be matched as extloads. 5340 if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || 5341 !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) 5342 return false; 5343 5344 IRBuilder<> Builder(Load->getNextNode()); 5345 auto *NewAnd = dyn_cast<Instruction>( 5346 Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); 5347 // Mark this instruction as "inserted by CGP", so that other 5348 // optimizations don't touch it. 5349 InsertedInsts.insert(NewAnd); 5350 5351 // Replace all uses of load with new and (except for the use of load in the 5352 // new and itself). 5353 Load->replaceAllUsesWith(NewAnd); 5354 NewAnd->setOperand(0, Load); 5355 5356 // Remove any and instructions that are now redundant. 5357 for (auto *And : AndsToMaybeRemove) 5358 // Check that the and mask is the same as the one we decided to put on the 5359 // new and. 5360 if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) { 5361 And->replaceAllUsesWith(NewAnd); 5362 if (&*CurInstIterator == And) 5363 CurInstIterator = std::next(And->getIterator()); 5364 And->eraseFromParent(); 5365 ++NumAndUses; 5366 } 5367 5368 ++NumAndsAdded; 5369 return true; 5370 } 5371 5372 /// Check if V (an operand of a select instruction) is an expensive instruction 5373 /// that is only used once. 5374 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { 5375 auto *I = dyn_cast<Instruction>(V); 5376 // If it's safe to speculatively execute, then it should not have side 5377 // effects; therefore, it's safe to sink and possibly *not* execute. 
5378 return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && 5379 TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; 5380 } 5381 5382 /// Returns true if a SelectInst should be turned into an explicit branch. 5383 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, 5384 const TargetLowering *TLI, 5385 SelectInst *SI) { 5386 // If even a predictable select is cheap, then a branch can't be cheaper. 5387 if (!TLI->isPredictableSelectExpensive()) 5388 return false; 5389 5390 // FIXME: This should use the same heuristics as IfConversion to determine 5391 // whether a select is better represented as a branch. 5392 5393 // If metadata tells us that the select condition is obviously predictable, 5394 // then we want to replace the select with a branch. 5395 uint64_t TrueWeight, FalseWeight; 5396 if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { 5397 uint64_t Max = std::max(TrueWeight, FalseWeight); 5398 uint64_t Sum = TrueWeight + FalseWeight; 5399 if (Sum != 0) { 5400 auto Probability = BranchProbability::getBranchProbability(Max, Sum); 5401 if (Probability > TLI->getPredictableBranchThreshold()) 5402 return true; 5403 } 5404 } 5405 5406 CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); 5407 5408 // If a branch is predictable, an out-of-order CPU can avoid blocking on its 5409 // comparison condition. If the compare has more than one use, there's 5410 // probably another cmov or setcc around, so it's not worth emitting a branch. 5411 if (!Cmp || !Cmp->hasOneUse()) 5412 return false; 5413 5414 // If either operand of the select is expensive and only needed on one side 5415 // of the select, we should form a branch. 5416 if (sinkSelectOperand(TTI, SI->getTrueValue()) || 5417 sinkSelectOperand(TTI, SI->getFalseValue())) 5418 return true; 5419 5420 return false; 5421 } 5422 5423 /// If \p isTrue is true, return the true value of \p SI, otherwise return 5424 /// false value of \p SI. If the true/false value of \p SI is defined by any 5425 /// select instructions in \p Selects, look through the defining select 5426 /// instruction until the true/false value is not defined in \p Selects. 5427 static Value *getTrueOrFalseValue( 5428 SelectInst *SI, bool isTrue, 5429 const SmallPtrSet<const Instruction *, 2> &Selects) { 5430 Value *V; 5431 5432 for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI); 5433 DefSI = dyn_cast<SelectInst>(V)) { 5434 assert(DefSI->getCondition() == SI->getCondition() && 5435 "The condition of DefSI does not match with SI"); 5436 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); 5437 } 5438 return V; 5439 } 5440 5441 /// If we have a SelectInst that will likely profit from branch prediction, 5442 /// turn it into a branch. 5443 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { 5444 // Find all consecutive select instructions that share the same condition. 5445 SmallVector<SelectInst *, 2> ASI; 5446 ASI.push_back(SI); 5447 for (BasicBlock::iterator It = ++BasicBlock::iterator(SI); 5448 It != SI->getParent()->end(); ++It) { 5449 SelectInst *I = dyn_cast<SelectInst>(&*It); 5450 if (I && SI->getCondition() == I->getCondition()) { 5451 ASI.push_back(I); 5452 } else { 5453 break; 5454 } 5455 } 5456 5457 SelectInst *LastSI = ASI.back(); 5458 // Increment the current iterator to skip all the rest of select instructions 5459 // because they will be either "not lowered" or "all lowered" to branch. 
5460 CurInstIterator = std::next(LastSI->getIterator()); 5461 5462 bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); 5463 5464 // Can we convert the 'select' to CF ? 5465 if (DisableSelectToBranch || OptSize || !TLI || VectorCond || 5466 SI->getMetadata(LLVMContext::MD_unpredictable)) 5467 return false; 5468 5469 TargetLowering::SelectSupportKind SelectKind; 5470 if (VectorCond) 5471 SelectKind = TargetLowering::VectorMaskSelect; 5472 else if (SI->getType()->isVectorTy()) 5473 SelectKind = TargetLowering::ScalarCondVectorVal; 5474 else 5475 SelectKind = TargetLowering::ScalarValSelect; 5476 5477 if (TLI->isSelectSupported(SelectKind) && 5478 !isFormingBranchFromSelectProfitable(TTI, TLI, SI)) 5479 return false; 5480 5481 ModifiedDT = true; 5482 5483 // Transform a sequence like this: 5484 // start: 5485 // %cmp = cmp uge i32 %a, %b 5486 // %sel = select i1 %cmp, i32 %c, i32 %d 5487 // 5488 // Into: 5489 // start: 5490 // %cmp = cmp uge i32 %a, %b 5491 // br i1 %cmp, label %select.true, label %select.false 5492 // select.true: 5493 // br label %select.end 5494 // select.false: 5495 // br label %select.end 5496 // select.end: 5497 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] 5498 // 5499 // In addition, we may sink instructions that produce %c or %d from 5500 // the entry block into the destination(s) of the new branch. 5501 // If the true or false blocks do not contain a sunken instruction, that 5502 // block and its branch may be optimized away. In that case, one side of the 5503 // first branch will point directly to select.end, and the corresponding PHI 5504 // predecessor block will be the start block. 5505 5506 // First, we split the block containing the select into 2 blocks. 5507 BasicBlock *StartBlock = SI->getParent(); 5508 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); 5509 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); 5510 5511 // Delete the unconditional branch that was just created by the split. 5512 StartBlock->getTerminator()->eraseFromParent(); 5513 5514 // These are the new basic blocks for the conditional branch. 5515 // At least one will become an actual new basic block. 5516 BasicBlock *TrueBlock = nullptr; 5517 BasicBlock *FalseBlock = nullptr; 5518 BranchInst *TrueBranch = nullptr; 5519 BranchInst *FalseBranch = nullptr; 5520 5521 // Sink expensive instructions into the conditional blocks to avoid executing 5522 // them speculatively. 5523 for (SelectInst *SI : ASI) { 5524 if (sinkSelectOperand(TTI, SI->getTrueValue())) { 5525 if (TrueBlock == nullptr) { 5526 TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", 5527 EndBlock->getParent(), EndBlock); 5528 TrueBranch = BranchInst::Create(EndBlock, TrueBlock); 5529 } 5530 auto *TrueInst = cast<Instruction>(SI->getTrueValue()); 5531 TrueInst->moveBefore(TrueBranch); 5532 } 5533 if (sinkSelectOperand(TTI, SI->getFalseValue())) { 5534 if (FalseBlock == nullptr) { 5535 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", 5536 EndBlock->getParent(), EndBlock); 5537 FalseBranch = BranchInst::Create(EndBlock, FalseBlock); 5538 } 5539 auto *FalseInst = cast<Instruction>(SI->getFalseValue()); 5540 FalseInst->moveBefore(FalseBranch); 5541 } 5542 } 5543 5544 // If there was nothing to sink, then arbitrarily choose the 'false' side 5545 // for a new input value to the PHI. 
5546 if (TrueBlock == FalseBlock) { 5547 assert(TrueBlock == nullptr && 5548 "Unexpected basic block transform while optimizing select"); 5549 5550 FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", 5551 EndBlock->getParent(), EndBlock); 5552 BranchInst::Create(EndBlock, FalseBlock); 5553 } 5554 5555 // Insert the real conditional branch based on the original condition. 5556 // If we did not create a new block for one of the 'true' or 'false' paths 5557 // of the condition, it means that side of the branch goes to the end block 5558 // directly and the path originates from the start block from the point of 5559 // view of the new PHI. 5560 BasicBlock *TT, *FT; 5561 if (TrueBlock == nullptr) { 5562 TT = EndBlock; 5563 FT = FalseBlock; 5564 TrueBlock = StartBlock; 5565 } else if (FalseBlock == nullptr) { 5566 TT = TrueBlock; 5567 FT = EndBlock; 5568 FalseBlock = StartBlock; 5569 } else { 5570 TT = TrueBlock; 5571 FT = FalseBlock; 5572 } 5573 IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI); 5574 5575 SmallPtrSet<const Instruction *, 2> INS; 5576 INS.insert(ASI.begin(), ASI.end()); 5577 // Use reverse iterator because later select may use the value of the 5578 // earlier select, and we need to propagate value through earlier select 5579 // to get the PHI operand. 5580 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { 5581 SelectInst *SI = *It; 5582 // The select itself is replaced with a PHI Node. 5583 PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); 5584 PN->takeName(SI); 5585 PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock); 5586 PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); 5587 5588 SI->replaceAllUsesWith(PN); 5589 SI->eraseFromParent(); 5590 INS.erase(SI); 5591 ++NumSelectsExpanded; 5592 } 5593 5594 // Instruct OptimizeBlock to skip to the next block. 5595 CurInstIterator = StartBlock->end(); 5596 return true; 5597 } 5598 5599 static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { 5600 SmallVector<int, 16> Mask(SVI->getShuffleMask()); 5601 int SplatElem = -1; 5602 for (unsigned i = 0; i < Mask.size(); ++i) { 5603 if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) 5604 return false; 5605 SplatElem = Mask[i]; 5606 } 5607 5608 return true; 5609 } 5610 5611 /// Some targets have expensive vector shifts if the lanes aren't all the same 5612 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases 5613 /// it's often worth sinking a shufflevector splat down to its use so that 5614 /// codegen can spot all lanes are identical. 5615 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { 5616 BasicBlock *DefBB = SVI->getParent(); 5617 5618 // Only do this xform if variable vector shifts are particularly expensive. 5619 if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType())) 5620 return false; 5621 5622 // We only expect better codegen by sinking a shuffle if we can recognise a 5623 // constant splat. 5624 if (!isBroadcastShuffle(SVI)) 5625 return false; 5626 5627 // InsertedShuffles - Only insert a shuffle in each block once. 5628 DenseMap<BasicBlock*, Instruction*> InsertedShuffles; 5629 5630 bool MadeChange = false; 5631 for (User *U : SVI->users()) { 5632 Instruction *UI = cast<Instruction>(U); 5633 5634 // Figure out which BB this ext is used in. 5635 BasicBlock *UserBB = UI->getParent(); 5636 if (UserBB == DefBB) continue; 5637 5638 // For now only apply this when the splat is used by a shift instruction. 
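// Illustrative example (hypothetical IR): given a splatted shift amount
//   %amt = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
// whose user in another block is
//   %shl = shl <4 x i32> %x, %amt
// a copy of the shuffle is recreated in that block so the backend can see
// that all shift-amount lanes are identical.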
5639 if (!UI->isShift()) continue;
5640
5641 // Everything checks out, sink the shuffle if the user's block doesn't
5642 // already have a copy.
5643 Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
5644
5645 if (!InsertedShuffle) {
5646 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
5647 assert(InsertPt != UserBB->end());
5648 InsertedShuffle =
5649 new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
5650 SVI->getOperand(2), "", &*InsertPt);
5651 }
5652
5653 UI->replaceUsesOfWith(SVI, InsertedShuffle);
5654 MadeChange = true;
5655 }
5656
5657 // If we removed all uses, nuke the shuffle.
5658 if (SVI->use_empty()) {
5659 SVI->eraseFromParent();
5660 MadeChange = true;
5661 }
5662
5663 return MadeChange;
5664 }
5665
5666 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
5667 if (!TLI || !DL)
5668 return false;
5669
5670 Value *Cond = SI->getCondition();
5671 Type *OldType = Cond->getType();
5672 LLVMContext &Context = Cond->getContext();
5673 MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
5674 unsigned RegWidth = RegType.getSizeInBits();
5675
5676 if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
5677 return false;
5678
5679 // If the register width is greater than the type width, expand the condition
5680 // of the switch instruction and each case constant to the width of the
5681 // register. By widening the type of the switch condition, subsequent
5682 // comparisons (for case comparisons) will not need to be extended to the
5683 // preferred register width, so we will potentially eliminate N-1 extends,
5684 // where N is the number of cases in the switch.
5685 auto *NewType = Type::getIntNTy(Context, RegWidth);
5686
5687 // Zero-extend the switch condition and case constants unless the switch
5688 // condition is a function argument that is already being sign-extended.
5689 // In that case, we can avoid an unnecessary mask/extension by sign-extending
5690 // everything instead.
5691 Instruction::CastOps ExtType = Instruction::ZExt;
5692 if (auto *Arg = dyn_cast<Argument>(Cond))
5693 if (Arg->hasSExtAttr())
5694 ExtType = Instruction::SExt;
5695
5696 auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
5697 ExtInst->insertBefore(SI);
5698 SI->setCondition(ExtInst);
5699 for (auto Case : SI->cases()) {
5700 APInt NarrowConst = Case.getCaseValue()->getValue();
5701 APInt WideConst = (ExtType == Instruction::ZExt) ?
5702 NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
5703 Case.setValue(ConstantInt::get(Context, WideConst));
5704 }
5705
5706 return true;
5707 }
5708
5709
5710 namespace {
5711
5712 /// \brief Helper class to promote a scalar operation to a vector one.
5713 /// This class is used to move an extractelement transition downward.
5714 /// E.g.,
5715 /// a = vector_op <2 x i32>
5716 /// b = extractelement <2 x i32> a, i32 0
5717 /// c = scalar_op b
5718 /// store c
5719 ///
5720 /// =>
5721 /// a = vector_op <2 x i32>
5722 /// c = vector_op a (equivalent to scalar_op on the related lane)
5723 /// * d = extractelement <2 x i32> c, i32 0
5724 /// * store d
5725 /// Assuming both the extractelement and the store can be combined, we get rid
5726 /// of the transition.
5727 class VectorPromoteHelper {
5728 /// DataLayout associated with the current module.
5729 const DataLayout &DL;
5730
5731 /// Used to perform some checks on the legality of vector operations.
5732 const TargetLowering &TLI;
5733
5734 /// Used to estimate the cost of the promoted chain.
5735 const TargetTransformInfo &TTI; 5736 5737 /// The transition being moved downwards. 5738 Instruction *Transition; 5739 5740 /// The sequence of instructions to be promoted. 5741 SmallVector<Instruction *, 4> InstsToBePromoted; 5742 5743 /// Cost of combining a store and an extract. 5744 unsigned StoreExtractCombineCost; 5745 5746 /// Instruction that will be combined with the transition. 5747 Instruction *CombineInst = nullptr; 5748 5749 /// \brief The instruction that represents the current end of the transition. 5750 /// Since we are faking the promotion until we reach the end of the chain 5751 /// of computation, we need a way to get the current end of the transition. 5752 Instruction *getEndOfTransition() const { 5753 if (InstsToBePromoted.empty()) 5754 return Transition; 5755 return InstsToBePromoted.back(); 5756 } 5757 5758 /// \brief Return the index of the original value in the transition. 5759 /// E.g., for "extractelement <2 x i32> c, i32 1" the original value, 5760 /// c, is at index 0. 5761 unsigned getTransitionOriginalValueIdx() const { 5762 assert(isa<ExtractElementInst>(Transition) && 5763 "Other kind of transitions are not supported yet"); 5764 return 0; 5765 } 5766 5767 /// \brief Return the index of the index in the transition. 5768 /// E.g., for "extractelement <2 x i32> c, i32 0" the index 5769 /// is at index 1. 5770 unsigned getTransitionIdx() const { 5771 assert(isa<ExtractElementInst>(Transition) && 5772 "Other kind of transitions are not supported yet"); 5773 return 1; 5774 } 5775 5776 /// \brief Get the type of the transition. 5777 /// This is the type of the original value. 5778 /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the 5779 /// transition is <2 x i32>. 5780 Type *getTransitionType() const { 5781 return Transition->getOperand(getTransitionOriginalValueIdx())->getType(); 5782 } 5783 5784 /// \brief Promote \p ToBePromoted by moving \p Def downward through. 5785 /// I.e., we have the following sequence: 5786 /// Def = Transition <ty1> a to <ty2> 5787 /// b = ToBePromoted <ty2> Def, ... 5788 /// => 5789 /// b = ToBePromoted <ty1> a, ... 5790 /// Def = Transition <ty1> ToBePromoted to <ty2> 5791 void promoteImpl(Instruction *ToBePromoted); 5792 5793 /// \brief Check whether or not it is profitable to promote all the 5794 /// instructions enqueued to be promoted. 5795 bool isProfitableToPromote() { 5796 Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx()); 5797 unsigned Index = isa<ConstantInt>(ValIdx) 5798 ? cast<ConstantInt>(ValIdx)->getZExtValue() 5799 : -1; 5800 Type *PromotedType = getTransitionType(); 5801 5802 StoreInst *ST = cast<StoreInst>(CombineInst); 5803 unsigned AS = ST->getPointerAddressSpace(); 5804 unsigned Align = ST->getAlignment(); 5805 // Check if this store is supported. 5806 if (!TLI.allowsMisalignedMemoryAccesses( 5807 TLI.getValueType(DL, ST->getValueOperand()->getType()), AS, 5808 Align)) { 5809 // If this is not supported, there is no way we can combine 5810 // the extract with the store. 5811 return false; 5812 } 5813 5814 // The scalar chain of computation has to pay for the transition 5815 // scalar to vector. 5816 // The vector chain has to account for the combining cost. 5817 uint64_t ScalarCost = 5818 TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); 5819 uint64_t VectorCost = StoreExtractCombineCost; 5820 for (const auto &Inst : InstsToBePromoted) { 5821 // Compute the cost. 5822 // By construction, all instructions being promoted are arithmetic ones. 
5823 // Moreover, one argument is a constant that can be viewed as a splat 5824 // constant. 5825 Value *Arg0 = Inst->getOperand(0); 5826 bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) || 5827 isa<ConstantFP>(Arg0); 5828 TargetTransformInfo::OperandValueKind Arg0OVK = 5829 IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue 5830 : TargetTransformInfo::OK_AnyValue; 5831 TargetTransformInfo::OperandValueKind Arg1OVK = 5832 !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue 5833 : TargetTransformInfo::OK_AnyValue; 5834 ScalarCost += TTI.getArithmeticInstrCost( 5835 Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); 5836 VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, 5837 Arg0OVK, Arg1OVK); 5838 } 5839 DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: " 5840 << ScalarCost << "\nVector: " << VectorCost << '\n'); 5841 return ScalarCost > VectorCost; 5842 } 5843 5844 /// \brief Generate a constant vector with \p Val with the same 5845 /// number of elements as the transition. 5846 /// \p UseSplat defines whether or not \p Val should be replicated 5847 /// across the whole vector. 5848 /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>, 5849 /// otherwise we generate a vector with as many undef as possible: 5850 /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only 5851 /// used at the index of the extract. 5852 Value *getConstantVector(Constant *Val, bool UseSplat) const { 5853 unsigned ExtractIdx = std::numeric_limits<unsigned>::max(); 5854 if (!UseSplat) { 5855 // If we cannot determine where the constant must be, we have to 5856 // use a splat constant. 5857 Value *ValExtractIdx = Transition->getOperand(getTransitionIdx()); 5858 if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx)) 5859 ExtractIdx = CstVal->getSExtValue(); 5860 else 5861 UseSplat = true; 5862 } 5863 5864 unsigned End = getTransitionType()->getVectorNumElements(); 5865 if (UseSplat) 5866 return ConstantVector::getSplat(End, Val); 5867 5868 SmallVector<Constant *, 4> ConstVec; 5869 UndefValue *UndefVal = UndefValue::get(Val->getType()); 5870 for (unsigned Idx = 0; Idx != End; ++Idx) { 5871 if (Idx == ExtractIdx) 5872 ConstVec.push_back(Val); 5873 else 5874 ConstVec.push_back(UndefVal); 5875 } 5876 return ConstantVector::get(ConstVec); 5877 } 5878 5879 /// \brief Check if promoting to a vector type an operand at \p OperandIdx 5880 /// in \p Use can trigger undefined behavior. 5881 static bool canCauseUndefinedBehavior(const Instruction *Use, 5882 unsigned OperandIdx) { 5883 // This is not safe to introduce undef when the operand is on 5884 // the right hand side of a division-like instruction. 5885 if (OperandIdx != 1) 5886 return false; 5887 switch (Use->getOpcode()) { 5888 default: 5889 return false; 5890 case Instruction::SDiv: 5891 case Instruction::UDiv: 5892 case Instruction::SRem: 5893 case Instruction::URem: 5894 return true; 5895 case Instruction::FDiv: 5896 case Instruction::FRem: 5897 return !Use->hasNoNaNs(); 5898 } 5899 llvm_unreachable(nullptr); 5900 } 5901 5902 public: 5903 VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI, 5904 const TargetTransformInfo &TTI, Instruction *Transition, 5905 unsigned CombineCost) 5906 : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition), 5907 StoreExtractCombineCost(CombineCost) { 5908 assert(Transition && "Do not know how to promote null"); 5909 } 5910 5911 /// \brief Check if we can promote \p ToBePromoted to \p Type. 
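  /// For instance (illustrative), given
  /// \code
  ///   %c = extractelement <2 x i32> %a, i32 0
  ///   %d = or i32 %c, 7
  /// \endcode
  /// the 'or' is a candidate for promotion, whereas a load, call or cast user
  /// is currently rejected.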
5912 bool canPromote(const Instruction *ToBePromoted) const {
5913 // We could support CastInst too.
5914 return isa<BinaryOperator>(ToBePromoted);
5915 }
5916
5917 /// \brief Check if it is profitable to promote \p ToBePromoted
5918 /// by moving the transition downward through it.
5919 bool shouldPromote(const Instruction *ToBePromoted) const {
5920 // Promote only if all the operands can be statically expanded.
5921 // Indeed, we do not want to introduce any new kind of transitions.
5922 for (const Use &U : ToBePromoted->operands()) {
5923 const Value *Val = U.get();
5924 if (Val == getEndOfTransition()) {
5925 // If the use is a division and the transition is on the rhs,
5926 // we cannot promote the operation, otherwise we may create a
5927 // division by zero.
5928 if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
5929 return false;
5930 continue;
5931 }
5932 if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
5933 !isa<ConstantFP>(Val))
5934 return false;
5935 }
5936 // Check that the resulting operation is legal.
5937 int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
5938 if (!ISDOpcode)
5939 return false;
5940 return StressStoreExtract ||
5941 TLI.isOperationLegalOrCustom(
5942 ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
5943 }
5944
5945 /// \brief Check whether or not \p Use can be combined
5946 /// with the transition.
5947 /// I.e., is it possible to do Use(Transition) => AnotherUse?
5948 bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
5949
5950 /// \brief Record \p ToBePromoted as part of the chain to be promoted.
5951 void enqueueForPromotion(Instruction *ToBePromoted) {
5952 InstsToBePromoted.push_back(ToBePromoted);
5953 }
5954
5955 /// \brief Set the instruction that will be combined with the transition.
5956 void recordCombineInstruction(Instruction *ToBeCombined) {
5957 assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
5958 CombineInst = ToBeCombined;
5959 }
5960
5961 /// \brief Promote all the instructions enqueued for promotion if it
5962 /// is profitable.
5963 /// \return True if the promotion happened, false otherwise.
5964 bool promote() {
5965 // Check if there is something to promote.
5966 // Right now, if we do not have anything to combine with,
5967 // we assume the promotion is not profitable.
5968 if (InstsToBePromoted.empty() || !CombineInst)
5969 return false;
5970
5971 // Check cost.
5972 if (!StressStoreExtract && !isProfitableToPromote())
5973 return false;
5974
5975 // Promote.
5976 for (auto &ToBePromoted : InstsToBePromoted)
5977 promoteImpl(ToBePromoted);
5978 InstsToBePromoted.clear();
5979 return true;
5980 }
5981 };
5982
5983 } // end anonymous namespace
5984
5985 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
5986 // At this point, we know that all the operands of ToBePromoted except Def
5987 // can be statically promoted.
5988 // For Def, we need to use its parameter in ToBePromoted:
5989 // b = ToBePromoted ty1 a
5990 // Def = Transition ty1 b to ty2
5991 // Move the transition down.
5992 // 1. Replace all uses of the promoted operation by the transition.
5993 // = ... b => = ... Def.
5994 assert(ToBePromoted->getType() == Transition->getType() &&
5995 "The type of the result of the transition does not match "
5996 "the final type");
5997 ToBePromoted->replaceAllUsesWith(Transition);
5998 // 2. Update the type of the uses.
5999 // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
  // 1. Replace all uses of the promoted operation by the transition.
  // = ... b => = ... Def.
  assert(ToBePromoted->getType() == Transition->getType() &&
         "The type of the result of the transition does not match "
         "the final type");
  ToBePromoted->replaceAllUsesWith(Transition);
  // 2. Update the type of the uses.
  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
  Type *TransitionTy = getTransitionType();
  ToBePromoted->mutateType(TransitionTy);
  // 3. Update all the operands of the promoted operation with promoted
  // operands.
  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
  for (Use &U : ToBePromoted->operands()) {
    Value *Val = U.get();
    Value *NewVal = nullptr;
    if (Val == Transition)
      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
    else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
             isa<ConstantFP>(Val)) {
      // Use a splat constant if it is not safe to use undef.
      NewVal = getConstantVector(
          cast<Constant>(Val),
          isa<UndefValue>(Val) ||
              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
    } else
      llvm_unreachable("Did you modify shouldPromote and forget to update "
                       "this?");
    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
  }
  Transition->moveAfter(ToBePromoted);
  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
}

/// Some targets can do store(extractelement) with one instruction.
/// Try to push the extractelement towards the stores when the target
/// has this feature and this is profitable.
bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
  unsigned CombineCost = std::numeric_limits<unsigned>::max();
  if (DisableStoreExtract || !TLI ||
      (!StressStoreExtract &&
       !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
                                       Inst->getOperand(1), CombineCost)))
    return false;

  // At this point we know that Inst is a vector to scalar transition.
  // Try to move it down the def-use chain, until:
  // - We can combine the transition with its single use
  //   => we got rid of the transition.
  // - We escape the current basic block
  //   => we would need to check that we are moving it to a cheaper place,
  //      and we do not do that for now.
  BasicBlock *Parent = Inst->getParent();
  DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
  // If the transition has more than one use, assume this is not going to be
  // beneficial.
  while (Inst->hasOneUse()) {
    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');

    if (ToBePromoted->getParent() != Parent) {
      DEBUG(dbgs() << "Instruction to promote is in a different block ("
                   << ToBePromoted->getParent()->getName()
                   << ") than the transition (" << Parent->getName() << ").\n");
      return false;
    }

    if (VPH.canCombine(ToBePromoted)) {
      DEBUG(dbgs() << "Assume " << *Inst << '\n'
                   << "will be combined with: " << *ToBePromoted << '\n');
      VPH.recordCombineInstruction(ToBePromoted);
      bool Changed = VPH.promote();
      NumStoreExtractExposed += Changed;
      return Changed;
    }

    DEBUG(dbgs() << "Try promoting.\n");
    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
      return false;

    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");

    VPH.enqueueForPromotion(ToBePromoted);
    Inst = ToBePromoted;
  }
  return false;
}

/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
///            (shl (zext I to i64), 32)), addr)  -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting for other merged stores can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8},  i32 store --> two i16 stores.
/// For pair of {i8, i8},   i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is SROA-transformed before being inlined into
/// hoo.
/// void goo(const std::pair<int, float> &);
/// void hoo() {
///   ...
///   goo(std::make_pair(tmp, ftmp));
///   ...
/// }
///
/// Although we already have similar splitting in DAG Combine, we duplicate
/// it in CodeGenPrepare to catch the case in which the pattern is split
/// across multiple BBs. The logic in DAG Combine is kept to catch cases
/// generated during code expansion.
static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
                                const TargetLowering &TLI) {
  // Handle simple but common cases only.
  Type *StoreType = SI.getValueOperand()->getType();
  if (DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType) ||
      DL.getTypeSizeInBits(StoreType) == 0)
    return false;

  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
  if (DL.getTypeStoreSizeInBits(SplitStoreType) !=
      DL.getTypeSizeInBits(SplitStoreType))
    return false;

  // Match the following patterns on the stored value:
  // (or (zext LValue to i64),
  //     (shl (zext HValue to i64), HalfValBitSize))
  // or
  // (or (shl (zext HValue to i64), HalfValBitSize),
  //     (zext LValue to i64))
  // Expect both operands of the OR and the first operand of the SHL to have
  // only one use.
  Value *LValue, *HValue;
  if (!match(SI.getValueOperand(),
             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
                                   m_SpecificInt(HalfValBitSize))))))
    return false;

  // Check that LValue and HValue are ints with size less than or equal to
  // HalfValBitSize.
  if (!LValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
      !HValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
    return false;

  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
  // as the input of target query.
  auto *LBC = dyn_cast<BitCastInst>(LValue);
  auto *HBC = dyn_cast<BitCastInst>(HValue);
  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
                  : EVT::getEVT(LValue->getType());
  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
                   : EVT::getEVT(HValue->getType());
  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
    return false;

  // Start to split the store.
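  // Illustrative sketch (value names invented) for an i64 store built from
  // two i32 halves:
  //   %lo   = zext i32 %F32 to i64
  //   %iext = zext i32 %I to i64
  //   %hi   = shl i64 %iext, 32
  //   %v    = or i64 %lo, %hi
  //   store i64 %v, i64* %addr
  // the code below emits:
  //   %p0 = bitcast i64* %addr to i32*
  //   store i32 %F32, i32* %p0
  //   %p1 = getelementptr i32, i32* %p0, i32 1
  //   store i32 %I, i32* %p1
  // where the upper store is given half of the original alignment.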
  IRBuilder<> Builder(SI.getContext());
  Builder.SetInsertPoint(&SI);

  // If LValue/HValue is a bitcast in another BB, create a new one in the
  // current BB so it may be merged with the split stores by the DAG combiner.
  if (LBC && LBC->getParent() != SI.getParent())
    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
  if (HBC && HBC->getParent() != SI.getParent())
    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());

  auto CreateSplitStore = [&](Value *V, bool Upper) {
    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
    Value *Addr = Builder.CreateBitCast(
        SI.getOperand(1),
        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
    if (Upper)
      Addr = Builder.CreateGEP(
          SplitStoreType, Addr,
          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
    Builder.CreateAlignedStore(
        V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
  };

  CreateSplitStore(LValue, false);
  CreateSplitStore(HValue, true);

  // Delete the old store.
  SI.eraseFromParent();
  return true;
}

bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
  // Bail out if we inserted the instruction to prevent optimizations from
  // stepping on each other's toes.
  if (InsertedInsts.count(I))
    return false;

  if (PHINode *P = dyn_cast<PHINode>(I)) {
    // It is possible for very late stage optimizations (such as SimplifyCFG)
    // to introduce PHI nodes too late to be cleaned up. If we detect such a
    // trivial PHI, go ahead and zap it here.
    if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
      P->replaceAllUsesWith(V);
      P->eraseFromParent();
      ++NumPHIsElim;
      return true;
    }
    return false;
  }

  if (CastInst *CI = dyn_cast<CastInst>(I)) {
    // If the source of the cast is a constant, then this should have
    // already been constant folded. The only reason NOT to constant fold
    // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
    // the address of globals out of a loop). If this is the case, we don't
    // want to forward-subst the cast.
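    // Illustrative sketch (names invented): LSR may have hoisted
    //   %addr = ptrtoint i32* @some_global to i64
    // into a loop preheader while the cast's only use sits inside the loop;
    // re-materializing the cast at the use would undo that placement.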
    if (isa<Constant>(CI->getOperand(0)))
      return false;

    if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
      return true;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
      /// Sink a zext or sext into its user blocks if the target type doesn't
      /// fit in one register
      if (TLI &&
          TLI->getTypeAction(CI->getContext(),
                             TLI->getValueType(*DL, CI->getType())) ==
              TargetLowering::TypeExpandInteger) {
        return SinkCast(CI);
      } else {
        bool MadeChange = optimizeExt(I);
        return MadeChange | optimizeExtUses(I);
      }
    }
    return false;
  }

  if (CmpInst *CI = dyn_cast<CmpInst>(I))
    if (!TLI || !TLI->hasMultipleConditionRegisters())
      return OptimizeCmpExpression(CI, TLI);

  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    if (TLI) {
      bool Modified = optimizeLoadExt(LI);
      unsigned AS = LI->getPointerAddressSpace();
      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
      return Modified;
    }
    return false;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
      return true;
    SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    if (TLI) {
      unsigned AS = SI->getPointerAddressSpace();
      return optimizeMemoryInst(I, SI->getOperand(1),
                                SI->getOperand(0)->getType(), AS);
    }
    return false;
  }

  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
    unsigned AS = RMW->getPointerAddressSpace();
    return optimizeMemoryInst(I, RMW->getPointerOperand(),
                              RMW->getType(), AS);
  }

  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
    unsigned AS = CmpX->getPointerAddressSpace();
    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
                              CmpX->getCompareOperand()->getType(), AS);
  }

  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
      EnableAndCmpSinking && TLI)
    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);

  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                BinOp->getOpcode() == Instruction::LShr)) {
    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (TLI && CI && TLI->hasExtractBitsInsn())
      return OptimizeExtractBits(BinOp, CI, *TLI, *DL);

    return false;
  }

  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
    if (GEPI->hasAllZeroIndices()) {
      /// The GEP operand must be a pointer, so must its result -> BitCast
      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
                                        GEPI->getName(), GEPI);
      GEPI->replaceAllUsesWith(NC);
      GEPI->eraseFromParent();
      ++NumGEPsElim;
      optimizeInst(NC, ModifiedDT);
      return true;
    }
    return false;
  }

  if (CallInst *CI = dyn_cast<CallInst>(I))
    return optimizeCallInst(CI, ModifiedDT);

  if (SelectInst *SI = dyn_cast<SelectInst>(I))
    return optimizeSelectInst(SI);

  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
    return optimizeShuffleVectorInst(SVI);

  if (auto *Switch = dyn_cast<SwitchInst>(I))
    return optimizeSwitchInst(Switch);

  if (isa<ExtractElementInst>(I))
    return optimizeExtractElementInst(I);

  return false;
}

/// Given an OR instruction, check to see if this is a bitreverse
/// idiom. If so, insert the new intrinsic and return true.
static bool makeBitReverse(Instruction &I, const DataLayout &DL,
                           const TargetLowering &TLI) {
  if (!I.getType()->isIntegerTy() ||
      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
                                    TLI.getValueType(DL, I.getType(), true)))
    return false;

  SmallVector<Instruction*, 4> Insts;
  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
    return false;
  Instruction *LastInst = Insts.back();
  I.replaceAllUsesWith(LastInst);
  RecursivelyDeleteTriviallyDeadInstructions(&I);
  return true;
}

// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
  SunkAddrs.clear();
  bool MadeChange = false;

  CurInstIterator = BB.begin();
  while (CurInstIterator != BB.end()) {
    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
    if (ModifiedDT)
      return true;
  }

  bool MadeBitReverse = true;
  while (TLI && MadeBitReverse) {
    MadeBitReverse = false;
    for (auto &I : reverse(BB)) {
      if (makeBitReverse(I, *DL, *TLI)) {
        MadeBitReverse = MadeChange = true;
        ModifiedDT = true;
        break;
      }
    }
  }
  MadeChange |= dupRetToEnableTailCallOpts(&BB);

  return MadeChange;
}

// If llvm.dbg.value is far away from the value then ISel may not be able
// to handle it properly. ISel will drop llvm.dbg.value if it cannot find a
// node corresponding to the value.
bool CodeGenPrepare::placeDbgValues(Function &F) {
  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    Instruction *PrevNonDbgInst = nullptr;
    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
      Instruction *Insn = &*BI++;
      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
      // Leave dbg.values that refer to an alloca alone. These
      // intrinsics describe the address of a variable (= the alloca)
      // being taken. They should not be moved next to the alloca
      // (and to the beginning of the scope), but rather stay close to
      // where said address is used.
      if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
        PrevNonDbgInst = Insn;
        continue;
      }

      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
        // If VI is a phi in a block with an EHPad terminator, we can't insert
        // after it.
        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
          continue;
        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
        DVI->removeFromParent();
        if (isa<PHINode>(VI))
          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
        else
          DVI->insertAfter(VI);
        MadeChange = true;
        ++NumDbgValueMoved;
      }
    }
  }
  return MadeChange;
}

/// \brief Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ?
                    NewTrue : NewFalse;
  uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
  NewTrue = NewTrue / Scale;
  NewFalse = NewFalse / Scale;
}

/// \brief Some targets prefer to split a conditional branch like:
/// \code
/// %0 = icmp ne i32 %a, 0
/// %1 = icmp ne i32 %b, 0
/// %or.cond = or i1 %0, %1
/// br i1 %or.cond, label %TrueBB, label %FalseBB
/// \endcode
/// into multiple branch instructions like:
/// \code
/// bb1:
///   %0 = icmp ne i32 %a, 0
///   br i1 %0, label %TrueBB, label %bb2
/// bb2:
///   %1 = icmp ne i32 %b, 0
///   br i1 %1, label %TrueBB, label %FalseBB
/// \endcode
/// This usually allows instruction selection to do even further optimizations
/// and combine the compare with the branch instruction. Currently this is
/// applied for targets which have "cheap" jump instructions.
///
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F) {
  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
    return false;

  bool MadeChange = false;
  for (auto &BB : F) {
    // Does this BB end with the following?
    //   %cond1 = icmp|fcmp|binary instruction ...
    //   %cond2 = icmp|fcmp|binary instruction ...
    //   %cond.or = or|and i1 %cond1, %cond2
    //   br i1 %cond.or label %dest1, label %dest2
    BinaryOperator *LogicOp;
    BasicBlock *TBB, *FBB;
    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
      continue;

    auto *Br1 = cast<BranchInst>(BB.getTerminator());
    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
      continue;

    unsigned Opc;
    Value *Cond1, *Cond2;
    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
                             m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::And;
    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
                                 m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::Or;
    else
      continue;

    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
      continue;

    DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

    // Create a new BB.
    auto TmpBB =
        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
                           BB.getParent(), BB.getNextNode());

    // Update the original basic block to use the first condition directly in
    // the branch instruction and remove the no longer needed and/or
    // instruction.
    Br1->setCondition(Cond1);
    LogicOp->eraseFromParent();

    // Depending on the condition we have to either replace the true or the
    // false successor of the original branch instruction.
    if (Opc == Instruction::And)
      Br1->setSuccessor(0, TmpBB);
    else
      Br1->setSuccessor(1, TmpBB);

    // Fill in the new basic block.
    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
    if (auto *I = dyn_cast<Instruction>(Cond2)) {
      I->removeFromParent();
      I->insertBefore(Br2);
    }

    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch now comes
    // from the newly generated BB (TmpBB). In the other successor we need to
    // add one incoming edge to the PHI nodes, because both branch
    // instructions now target the same successor.
    // Depending on the original branch condition (and/or) we have to swap
    // the successors (TrueDest, FalseDest), so that we perform the correct
    // update for the PHI nodes.
    // This doesn't change the successor order of the just created branch
    // instruction (or any other instruction).
    if (Opc == Instruction::Or)
      std::swap(TBB, FBB);

    // Replace the old BB with the new BB.
    for (auto &I : *TBB) {
      PHINode *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;
      int i;
      while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
        PN->setIncomingBlock(i, TmpBB);
    }

    // Add another incoming edge from the new BB.
    for (auto &I : *FBB) {
      PHINode *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;
      auto *Val = PN->getIncomingValueForBlock(&BB);
      PN->addIncoming(Val, TmpBB);
    }

    // Update the branch weights (from SelectionDAGBuilder::
    // FindMergedConditions).
    if (Opc == Instruction::Or) {
      // Codegen X | Y as:
      // BB1:
      //   jmp_if_X TBB
      //   jmp TmpBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
      //     == TrueProb for the original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This
      // choice assumes that
      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
      // Another choice is to assume TrueProb for BB1 equals TrueProb for
      // TmpBB, but the math is more complicated.
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = TrueWeight;
        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        NewTrueWeight = TrueWeight;
        NewFalseWeight = 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    } else {
      // Codegen X & Y as:
      // BB1:
      //   jmp_if_X TmpBB
      //   jmp FBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //
      // This requires creation of TmpBB after CurBB.

      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
      //     == FalseProb for the original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This
      // choice assumes that
      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
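      // Illustrative example (numbers invented): with original weights
      // A = 6 (true) and B = 2 (false), BB1 gets {2A+B, B} = {14, 2} and
      // TmpBB gets {2A, B} = {12, 2}. Then
      //   FalseProb(BB1) + TrueProb(BB1) * FalseProb(TmpBB)
      //     = 2/16 + (14/16) * (2/14) = 1/4 = B / (A+B),
      // which matches the FalseProb of the original block, as required.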
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
        uint64_t NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        NewTrueWeight = 2 * TrueWeight;
        NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    }

    // Note: No point in getting fancy here, since the DT info is never
    // available to CodeGenPrepare.
    ModifiedDT = true;

    MadeChange = true;

    DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
          TmpBB->dump());
  }
  return MadeChange;
}