//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs various transformations related to eliminating memcpy
// calls, or transforming sets of stores into memsets.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "memcpyopt"

STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");

namespace {

/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
///   store 0 -> P+1
///   store 0 -> P+0
///   store 0 -> P+3
///   store 0 -> P+2
/// which sometimes happens with stores to arrays of structs etc. When we see
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
struct MemsetRange {
  // Start/End - A half-open range that describes the span this range covers.
  // The range is closed at the start and open at the end: [Start, End).
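  // Offsets are measured relative to the pointer of the store that starts the
  // scan (see tryMergingIntoMemset), so Start may be negative.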
  int64_t Start, End;

  /// StartPtr - The getelementptr instruction that points to the start of the
  /// range.
  Value *StartPtr;

  /// Alignment - The known alignment of the first store.
  unsigned Alignment;

  /// TheStores - The actual stores that make up this range.
  SmallVector<Instruction*, 16> TheStores;

  bool isProfitableToUseMemset(const DataLayout &DL) const;
};

} // end anonymous namespace

bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
  // If we found 4 or more stores to merge, or at least 16 bytes, use memset.
  if (TheStores.size() >= 4 || End-Start >= 16) return true;

  // If there is nothing to merge, don't do anything.
  if (TheStores.size() < 2) return false;

  // If any of the stores are a memset, then it is always good to extend the
  // memset.
  for (Instruction *SI : TheStores)
    if (!isa<StoreInst>(SI))
      return true;

  // Assume that the code generator is capable of merging pairs of stores
  // together if it wants to.
  if (TheStores.size() == 2) return false;

  // If we have fewer than 8 stores, it can still be worthwhile to do this.
  // For example, merging 4 i8 stores into an i32 store is useful almost always.
  // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
  // memset will be split into 2 32-bit stores anyway) and doing so can
  // pessimize the llvm optimizer.
  //
  // Since we don't have perfect knowledge here, make some assumptions: assume
  // the maximum GPR width is the same size as the largest legal integer
  // size. If so, check to see whether we will end up actually reducing the
  // number of stores used.
  unsigned Bytes = unsigned(End-Start);
  unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  unsigned NumPointerStores = Bytes / MaxIntSize;

  // Assume the remaining bytes, if any, are done a byte at a time.
  unsigned NumByteStores = Bytes % MaxIntSize;

  // If we will reduce the # stores (according to this heuristic), do the
  // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
  // etc.
  return TheStores.size() > NumPointerStores+NumByteStores;
}

namespace {

class MemsetRanges {
  using range_iterator = SmallVectorImpl<MemsetRange>::iterator;

  /// A sorted list of the memset ranges.
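  /// Ranges are kept sorted by Start and non-overlapping; addRange() merges
  /// existing ranges whenever a newly added store connects them.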
  SmallVector<MemsetRange, 8> Ranges;

  const DataLayout &DL;

public:
  MemsetRanges(const DataLayout &DL) : DL(DL) {}

  using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;

  const_iterator begin() const { return Ranges.begin(); }
  const_iterator end() const { return Ranges.end(); }
  bool empty() const { return Ranges.empty(); }

  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
      addStore(OffsetFromFirst, SI);
    else
      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
  }

  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());

    addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
             SI->getAlign().value(), SI);
  }

  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(),
             MSI);
  }

  void addRange(int64_t Start, int64_t Size, Value *Ptr,
                unsigned Alignment, Instruction *Inst);
};

} // end anonymous namespace

/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                            unsigned Alignment, Instruction *Inst) {
  int64_t End = Start+Size;

  range_iterator I = partition_point(
      Ranges, [=](const MemsetRange &O) { return O.End < Start; });

  // We now know that I == E, in which case we didn't find anything to merge
  // with, or that Start <= I->End. If End < I->Start or I == E, then we need
  // to insert a new range. Handle this now.
  if (I == Ranges.end() || End < I->Start) {
    MemsetRange &R = *Ranges.insert(I, MemsetRange());
    R.Start = Start;
    R.End = End;
    R.StartPtr = Ptr;
    R.Alignment = Alignment;
    R.TheStores.push_back(Inst);
    return;
  }

  // This store overlaps with I, add it.
  I->TheStores.push_back(Inst);

  // At this point, we may have an interval that completely contains our store.
  // If so, just add it to the interval and return.
  if (I->Start <= Start && I->End >= End)
    return;

  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
  // but is not entirely contained within the range.

  // See if this store extends the start of the range. In this case, it
  // couldn't possibly cause it to join the prior range, because otherwise we
  // would have stopped on *it*.
  if (Start < I->Start) {
    I->Start = Start;
    I->StartPtr = Ptr;
    I->Alignment = Alignment;
  }

  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
  // is in or right at the end of I), and that End >= I->Start. Extend I out to
  // End.
  if (End > I->End) {
    I->End = End;
    range_iterator NextI = I;
    while (++NextI != Ranges.end() && End >= NextI->Start) {
      // Merge the range in.
      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
      if (NextI->End > I->End)
        I->End = NextI->End;
      Ranges.erase(NextI);
      NextI = I;
    }
  }
}

//===----------------------------------------------------------------------===//
//                         MemCpyOptLegacyPass Pass
//===----------------------------------------------------------------------===//

namespace {

class MemCpyOptLegacyPass : public FunctionPass {
  MemCpyOptPass Impl;

public:
  static char ID; // Pass identification, replacement for typeid

  MemCpyOptLegacyPass() : FunctionPass(ID) {
    initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

private:
  // This transformation requires dominator tree info.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<MemoryDependenceWrapperPass>();
    AU.addPreserved<MemoryDependenceWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
  }
};

} // end anonymous namespace

char MemCpyOptLegacyPass::ID = 0;

/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }

INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                    false, false)

void MemCpyOptPass::eraseInstruction(Instruction *I) {
  if (MSSAU)
    MSSAU->removeMemoryAccess(I);
  MD->removeInstruction(I);
  I->eraseFromParent();
}

/// When scanning forward over instructions, we look for some other patterns to
/// fold away. In particular, this looks for stores to neighboring locations of
/// memory. If it sees enough consecutive ones, it attempts to merge them
/// together into a memcpy/memset.
Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
                                                 Value *StartPtr,
                                                 Value *ByteVal) {
  const DataLayout &DL = StartInst->getModule()->getDataLayout();

  // Okay, so we now have a single store of a splattable value. Scan to find
  // all subsequent stores of the same value to offsets from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous blocks
  // are stored.
  MemsetRanges Ranges(DL);

  BasicBlock::iterator BI(StartInst);

  // Keeps track of the last memory use or def before the insertion point for
  // the new memset. The new MemoryDef for the inserted memsets will be inserted
  // after MemInsertPoint. It points to either LastMemDef or to the last user
  // before the insertion point of the memset, if there are any such users.
  MemoryUseOrDef *MemInsertPoint = nullptr;
  // Keeps track of the last MemoryDef between StartInst and the insertion point
  // for the new memset. This will become the defining access of the inserted
  // memsets.
  MemoryDef *LastMemDef = nullptr;
  for (++BI; !BI->isTerminator(); ++BI) {
    if (MSSAU) {
      auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
          MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
      if (CurrentAcc) {
        MemInsertPoint = CurrentAcc;
        if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc))
          LastMemDef = CurrentDef;
      }
    }

    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
      // If the instruction is readnone, ignore it, otherwise bail out. We
      // don't even allow readonly here because we don't want something like:
      //   A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
        break;
      continue;
    }

    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple()) break;

      Value *StoredVal = NextStore->getValueOperand();

      // Don't convert stores of non-integral pointer types to memsets (which
      // store integers).
      if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
        break;

      // Check to see if this stored value is the same byte-splattable value.
      Value *StoredByte = isBytewiseValue(StoredVal, DL);
      if (isa<UndefValue>(ByteVal) && StoredByte)
        ByteVal = StoredByte;
      if (ByteVal != StoredByte)
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      Optional<int64_t> Offset =
          isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
      if (!Offset)
        break;

      Ranges.addStore(*Offset, NextStore);
    } else {
      MemSetInst *MSI = cast<MemSetInst>(BI);

      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
          !isa<ConstantInt>(MSI->getLength()))
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
      if (!Offset)
        break;

      Ranges.addMemSet(*Offset, MSI);
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in. This is a very common case of course.
  if (Ranges.empty())
    return nullptr;

  // If we had at least one store that could be merged in, add the starting
  // store as well. We try to avoid this unless there is at least something
  // interesting as a small compile-time optimization.
  Ranges.addInst(0, StartInst);

  // If we create any memsets, we put them right before the first instruction
  // that isn't part of the memset block. This ensures that the memset is
  // dominated by any addressing instruction needed by the start of the block.
  IRBuilder<> Builder(&*BI);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memsets for anything big enough to be worthwhile.
  Instruction *AMemSet = nullptr;
  for (const MemsetRange &Range : Ranges) {
    if (Range.TheStores.size() == 1) continue;

    // If it is profitable to lower this range to memset, do so now.
    if (!Range.isProfitableToUseMemset(DL))
      continue;

    // Otherwise, we do want to transform this! Create a new memset.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
                                   MaybeAlign(Range.Alignment));
    LLVM_DEBUG(dbgs() << "Replace stores:\n";
               for (Instruction *SI : Range.TheStores)
                 dbgs() << *SI << '\n';
               dbgs() << "With: " << *AMemSet << '\n');
    if (!Range.TheStores.empty())
      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    if (MSSAU) {
      assert(LastMemDef && MemInsertPoint &&
             "Both LastMemDef and MemInsertPoint need to be set");
      auto *NewDef =
          cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
                              ? MSSAU->createMemoryAccessBefore(
                                    AMemSet, LastMemDef, MemInsertPoint)
                              : MSSAU->createMemoryAccessAfter(
                                    AMemSet, LastMemDef, MemInsertPoint));
      MSSAU->insertDef(NewDef, /*RenameUses=*/true);
      LastMemDef = NewDef;
      MemInsertPoint = NewDef;
    }

    // Zap all the stores.
    for (Instruction *SI : Range.TheStores)
      eraseInstruction(SI);

    ++NumMemSetInfer;
  }

  return AMemSet;
}

// This method tries to lift a store instruction before position P.
// It will lift the store and its arguments, plus anything else that may alias
// with these. The method returns true if it was successful.
static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
                   const LoadInst *LI) {
  // If the store aliases this position, bail out early.
  MemoryLocation StoreLoc = MemoryLocation::get(SI);
  if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc)))
    return false;

  // Keep track of the arguments of all instructions we plan to lift
  // so we can make sure to lift them as well if appropriate.
  DenseSet<Instruction*> Args;
  if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
    if (Ptr->getParent() == SI->getParent())
      Args.insert(Ptr);

  // Instructions to lift before P.
  SmallVector<Instruction*, 8> ToLift;

  // Memory locations of lifted instructions.
  SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};

  // Lifted calls.
  SmallVector<const CallBase *, 8> Calls;

  const MemoryLocation LoadLoc = MemoryLocation::get(LI);

  for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
    auto *C = &*I;

    bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None));

    bool NeedLift = false;
    if (Args.erase(C))
      NeedLift = true;
    else if (MayAlias) {
      NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
        return isModOrRefSet(AA.getModRefInfo(C, ML));
      });

      if (!NeedLift)
        NeedLift = llvm::any_of(Calls, [C, &AA](const CallBase *Call) {
          return isModOrRefSet(AA.getModRefInfo(C, Call));
        });
    }

    if (!NeedLift)
      continue;

    if (MayAlias) {
      // Since LI is implicitly moved downwards past the lifted instructions,
      // none of them may modify its source.
      if (isModSet(AA.getModRefInfo(C, LoadLoc)))
        return false;
      else if (const auto *Call = dyn_cast<CallBase>(C)) {
        // If we can't lift this before P, it's game over.
        if (isModOrRefSet(AA.getModRefInfo(P, Call)))
          return false;

        Calls.push_back(Call);
      } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
        // If we can't lift this before P, it's game over.
        auto ML = MemoryLocation::get(C);
        if (isModOrRefSet(AA.getModRefInfo(P, ML)))
          return false;

        MemLocs.push_back(ML);
      } else
        // We don't know how to lift this instruction.
        return false;
    }

    ToLift.push_back(C);
    for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
      if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
        if (A->getParent() == SI->getParent()) {
          // Cannot hoist user of P above P.
          if (A == P) return false;
          Args.insert(A);
        }
      }
  }

  // We made it; now lift the instructions.
  for (auto *I : llvm::reverse(ToLift)) {
    LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
    I->moveBefore(P);
  }

  return true;
}

bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!SI->isSimple()) return false;

  // Avoid merging nontemporal stores since the resulting
  // memcpy/memset would not be able to preserve the nontemporal hint.
  // In theory we could teach this pass to propagate the !nontemporal metadata
  // to memset calls. However, that change would force the backend to
  // conservatively expand !nontemporal memset calls back to sequences of
  // store instructions (effectively undoing the merging).
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return false;

  const DataLayout &DL = SI->getModule()->getDataLayout();

  Value *StoredVal = SI->getValueOperand();

  // Not all the transforms below are correct for non-integral pointers, so bail
  // out until we've audited the individual pieces.
  if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return false;

  // Load to store forwarding can be interpreted as memcpy.
  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
    if (LI->isSimple() && LI->hasOneUse() &&
        LI->getParent() == SI->getParent()) {

      auto *T = LI->getType();
      if (T->isAggregateType()) {
        MemoryLocation LoadLoc = MemoryLocation::get(LI);

        // We use alias analysis to check if an instruction may store to
        // the memory we load from in between the load and the store. If
        // such an instruction is found, we try to promote there instead
        // of at the store position.
        Instruction *P = SI;
        for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
          if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
            P = &I;
            break;
          }
        }

        // We found an instruction that may write to the loaded memory.
        // We can try to promote at this position instead of the store
        // position if nothing aliases the store memory after this and the
        // store destination is not in the range.
        if (P && P != SI) {
          if (!moveUp(*AA, SI, P, LI))
            P = nullptr;
        }

        // If a valid insertion position is found, then we can promote
        // the load/store pair to a memcpy.
        if (P) {
          // If we load from memory that may alias the memory we store to,
          // memmove must be used to preserve semantics. If not, memcpy can
          // be used.
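          // For example (illustrative, simplified IR), an aggregate copy
          //   %v = load %struct.S, %struct.S* %src
          //   store %struct.S %v, %struct.S* %dst
          // becomes a single @llvm.memcpy of the struct's store size (or an
          // @llvm.memmove when %src and %dst may alias).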
          bool UseMemMove = false;
          if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc))
            UseMemMove = true;

          uint64_t Size = DL.getTypeStoreSize(T);

          IRBuilder<> Builder(P);
          Instruction *M;
          if (UseMemMove)
            M = Builder.CreateMemMove(
                SI->getPointerOperand(), SI->getAlign(),
                LI->getPointerOperand(), LI->getAlign(), Size);
          else
            M = Builder.CreateMemCpy(
                SI->getPointerOperand(), SI->getAlign(),
                LI->getPointerOperand(), LI->getAlign(), Size);

          LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
                            << *M << "\n");

          if (MSSAU) {
            assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(P)));
            auto *LastDef =
                cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(P));
            auto *NewAccess =
                MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
            MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
          }

          eraseInstruction(SI);
          eraseInstruction(LI);
          ++NumMemCpyInstr;

          // Make sure we do not invalidate the iterator.
          BBI = M->getIterator();
          return true;
        }
      }

      // Detect cases where we're performing call slot forwarding, but
      // happen to be using a load-store pair to implement it, rather than
      // a memcpy.
      MemDepResult ldep = MD->getDependency(LI);
      CallInst *C = nullptr;
      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
        C = dyn_cast<CallInst>(ldep.getInst());

      if (C) {
        // Check that nothing touches the dest of the "copy" between
        // the call and the store.
        Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
        bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
        MemoryLocation StoreLoc = MemoryLocation::get(SI);
        for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
             I != E; --I) {
          if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
            C = nullptr;
            break;
          }
          // The store to dest may never happen if an exception can be thrown
          // between the load and the store.
          if (I->mayThrow() && !CpyDestIsLocal) {
            C = nullptr;
            break;
          }
        }
      }

      if (C) {
        bool changed = performCallSlotOptzn(
            LI, SI->getPointerOperand()->stripPointerCasts(),
            LI->getPointerOperand()->stripPointerCasts(),
            DL.getTypeStoreSize(SI->getOperand(0)->getType()),
            commonAlignment(SI->getAlign(), LI->getAlign()), C);
        if (changed) {
          eraseInstruction(SI);
          eraseInstruction(LI);
          ++NumMemCpyInstr;
          return true;
        }
      }
    }
  }

  // There are two cases that are interesting for this code to handle: memcpy
  // and memset. Right now we only handle memset.

  // Ensure that the value being stored is something that can be memset a byte
  // at a time, like "0" or "-1" of any width, as well as things like
  // 0xA0A0A0A0 and 0.0.
  auto *V = SI->getOperand(0);
  if (Value *ByteVal = isBytewiseValue(V, DL)) {
    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
                                              ByteVal)) {
      BBI = I->getIterator(); // Don't invalidate iterator.
      return true;
    }

    // If we have an aggregate, we try to promote it to memset regardless
    // of opportunity for merging, as this can expose optimization opportunities
    // in subsequent passes.
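    // For example (illustrative IR), a store of a zero aggregate
    //   store %struct.S zeroinitializer, %struct.S* %p
    // becomes an @llvm.memset of the struct's store size with byte value 0.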
    auto *T = V->getType();
    if (T->isAggregateType()) {
      uint64_t Size = DL.getTypeStoreSize(T);
      IRBuilder<> Builder(SI);
      auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
                                     SI->getAlign());

      LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");

      if (MSSAU) {
        assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)));
        auto *LastDef =
            cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
        auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
        MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
      }

      eraseInstruction(SI);
      NumMemSetInfer++;

      // Make sure we do not invalidate the iterator.
      BBI = M->getIterator();
      return true;
    }
  }

  return false;
}

bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
  // See if there is another memset or store neighboring this memset which
  // allows us to widen out the memset to do a single larger store.
  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
                                              MSI->getValue())) {
      BBI = I->getIterator(); // Don't invalidate iterator.
      return true;
    }
  return false;
}

/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
                                         Value *cpySrc, uint64_t cpyLen,
                                         Align cpyAlign, CallInst *C) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
  //   memcpy(dest, src, ...)
  //
  // ->
  //
  //   memcpy(dest, src, ...)
  //   call @func(..., dest, ...)
  //
  // Since moving the memcpy is technically awkward, we additionally check that
  // src only holds uninitialized values at the moment of the call, meaning that
  // the memcpy can be discarded rather than moved.

  // Lifetime markers shouldn't be operated on.
  if (Function *F = C->getCalledFunction())
    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
      return false;

  // Require that src be an alloca. This simplifies the reasoning considerably.
  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
  if (!srcAlloca)
    return false;

  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
  if (!srcArraySize)
    return false;

  const DataLayout &DL = cpy->getModule()->getDataLayout();
  uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
                     srcArraySize->getZExtValue();

  if (cpyLen < srcSize)
    return false;

  // Check that accessing the first srcSize bytes of dest will not cause a
  // trap. Otherwise the transform is invalid since it might cause a trap
  // to occur earlier than it otherwise would.
  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
    // The destination is an alloca. Check that it is at least as large as
    // srcSize.
    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
    if (!destArraySize)
      return false;

    uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
                        destArraySize->getZExtValue();

    if (destSize < srcSize)
      return false;
  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
    // The store to dest may never happen if the call can throw.
    if (C->mayThrow())
      return false;

    if (A->getDereferenceableBytes() < srcSize) {
      // If the destination is an sret parameter then only accesses that are
      // outside of the returned struct type can trap.
      if (!A->hasStructRetAttr())
        return false;

      Type *StructTy = A->getParamStructRetType();
      if (!StructTy->isSized()) {
        // The call may never return and hence the copy-instruction may never
        // be executed, and therefore it's not safe to say "the destination
        // has at least <cpyLen> bytes, as implied by the copy-instruction".
        return false;
      }

      uint64_t destSize = DL.getTypeAllocSize(StructTy);
      if (destSize < srcSize)
        return false;
    }
  } else {
    return false;
  }

  // Check that dest points to memory that is at least as aligned as src.
  Align srcAlign = srcAlloca->getAlign();
  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
  // If dest is not aligned enough and we can't increase its alignment then
  // bail out.
  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
    return false;

  // Check that src is not accessed except via the call and the memcpy. This
  // guarantees that it holds only undefined values when passed in (so the final
  // memcpy can be dropped), that it is not read or written between the call and
  // the memcpy, and that writing beyond the end of it is undefined.
  SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
                                   srcAlloca->user_end());
  while (!srcUseList.empty()) {
    User *U = srcUseList.pop_back_val();

    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
      for (User *UU : U->users())
        srcUseList.push_back(UU);
      continue;
    }
    if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
      if (!G->hasAllZeroIndices())
        return false;

      for (User *UU : U->users())
        srcUseList.push_back(UU);
      continue;
    }
    if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
      if (IT->isLifetimeStartOrEnd())
        continue;

    if (U != C && U != cpy)
      return false;
  }

  // Check that src isn't captured by the called function since the
  // transformation can cause aliasing issues in that case.
  for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
    if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
      return false;

  // Since we're changing the parameter to the callsite, we need to make sure
  // that what would be the new parameter dominates the callsite.
  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
    if (!DT->dominates(cpyDestInst, C))
      return false;

  // In addition to knowing that the call does not access src in some
  // unexpected manner, for example via a global, which we deduce from
  // the use analysis, we also need to know that it does not sneakily
  // access dest. We rely on AA to figure this out for us.
  ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
  // If necessary, perform additional analysis.
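  // callCapturesBefore may refine the result when cpyDest is a function-local
  // object that has not been captured before the call: in that case the call
  // can only reach it through its own arguments.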
  if (isModOrRefSet(MR))
    MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
  if (isModOrRefSet(MR))
    return false;

  // We can't create address space casts here because we don't know if they're
  // safe for the target.
  if (cpySrc->getType()->getPointerAddressSpace() !=
      cpyDest->getType()->getPointerAddressSpace())
    return false;
  for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
    if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
        cpySrc->getType()->getPointerAddressSpace() !=
            C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
      return false;

  // All the checks have passed, so do the transformation.
  bool changedArgument = false;
  for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
    if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
      Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
          : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
                                        cpyDest->getName(), C);
      changedArgument = true;
      if (C->getArgOperand(ArgI)->getType() == Dest->getType())
        C->setArgOperand(ArgI, Dest);
      else
        C->setArgOperand(ArgI, CastInst::CreatePointerCast(
                                   Dest, C->getArgOperand(ArgI)->getType(),
                                   Dest->getName(), C));
    }

  if (!changedArgument)
    return false;

  // If the destination wasn't sufficiently aligned then increase its alignment.
  if (!isDestSufficientlyAligned) {
    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
  }

  // Drop any cached information about the call, because we may have changed
  // its dependence information by changing its parameter.
  MD->removeInstruction(C);

  // Update AA metadata.
  // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
  // handled here, but combineMetadata doesn't support them yet.
  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                         LLVMContext::MD_noalias,
                         LLVMContext::MD_invariant_group,
                         LLVMContext::MD_access_group};
  combineMetadata(C, cpy, KnownIDs, true);

  return true;
}

/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
                                                  MemCpyInst *MDep) {
  // We can only transform memcpys where the dest of one is the source of the
  // other.
  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
    return false;

  // If the dep instruction is reading from our current input, then it is a noop
  // transfer and substituting the input won't change this instruction. Just
  // ignore the input and let someone else zap MDep. This handles cases like:
  //   memcpy(a <- a)
  //   memcpy(b <- a)
  if (M->getSource() == MDep->getSource())
    return false;

  // Second, the lengths of the memcpys must be the same, or the preceding one
  // must be larger than the following one.
  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
    return false;

  // Verify that the copied-from memory doesn't change in between the two
  // transfers. For example, in:
  //   memcpy(a <- b)
  //   *b = 42;
  //   memcpy(c <- a)
  // It would be invalid to transform the second memcpy into memcpy(c <- b).
  //
  // TODO: If the code between M and MDep is transparent to the destination "c",
  // then we could still perform the xform by moving M up to the first memcpy.
  //
  // NOTE: This is conservative, it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep =
      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
                                   M->getIterator(), M->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  // If the dest of the second might alias the source of the first, then the
  // source and dest might overlap. We still want to eliminate the intermediate
  // value, but we have to generate a memmove instead of memcpy.
  bool UseMemMove = false;
  if (!AA->isNoAlias(MemoryLocation::getForDest(M),
                     MemoryLocation::getForSource(MDep)))
    UseMemMove = true;

  // If all checks passed, then we can transform M.
  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
                    << *MDep << '\n' << *M << '\n');

  // TODO: Is this worth it if we're creating a less aligned memcpy? For
  // example we could be moving from movaps -> movq on x86.
  IRBuilder<> Builder(M);
  Instruction *NewM;
  if (UseMemMove)
    NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
                                 MDep->getRawSource(), MDep->getSourceAlign(),
                                 M->getLength(), M->isVolatile());
  else
    NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
                                MDep->getRawSource(), MDep->getSourceAlign(),
                                M->getLength(), M->isVolatile());

  if (MSSAU) {
    assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
    auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
    auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
    MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
  }

  // Remove the instruction we're replacing.
  eraseInstruction(M);
  ++NumMemCpyInstr;
  return true;
}

/// We've found that the (upward scanning) memory dependence of \p MemCpy is
/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
/// weren't copied over by \p MemCpy.
///
/// In other words, transform:
/// \code
///   memset(dst, c, dst_size);
///   memcpy(dst, src, src_size);
/// \endcode
/// into:
/// \code
///   memcpy(dst, src, src_size);
///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
/// \endcode
bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
                                                  MemSetInst *MemSet) {
  // We can only transform memset/memcpy with the same destination.
  if (MemSet->getDest() != MemCpy->getDest())
    return false;

  // Check that there are no other dependencies on the memset destination.
  MemDepResult DstDepInfo =
      MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
                                   MemCpy->getIterator(), MemCpy->getParent());
  if (DstDepInfo.getInst() != MemSet)
    return false;

  // Use the same i8* dest as the memcpy, killing the memset dest if different.
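  // For example, given memset(dst, c, 100) followed by memcpy(dst, src, 60),
  // the memset becomes memset(dst + 60, c, 40); if the memcpy covers the whole
  // memset, the new memset length degenerates to zero.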
  Value *Dest = MemCpy->getRawDest();
  Value *DestSize = MemSet->getLength();
  Value *SrcSize = MemCpy->getLength();

  // By default, create an unaligned memset.
  unsigned Align = 1;
  // If Dest is aligned, and SrcSize is constant, use the minimum alignment
  // of the sum.
  const unsigned DestAlign =
      std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
  if (DestAlign > 1)
    if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
      Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);

  IRBuilder<> Builder(MemCpy);

  // If the sizes have different types, zext the smaller one.
  if (DestSize->getType() != SrcSize->getType()) {
    if (DestSize->getType()->getIntegerBitWidth() >
        SrcSize->getType()->getIntegerBitWidth())
      SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
    else
      DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
  }

  Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
  Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
  Value *MemsetLen = Builder.CreateSelect(
      Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
  Instruction *NewMemSet = Builder.CreateMemSet(
      Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
                        SrcSize),
      MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));

  if (MSSAU) {
    assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
           "MemCpy must be a MemoryDef");
    // The new memset is inserted after the memcpy, but it is known that its
    // defining access is the memset about to be removed, which immediately
    // precedes the memcpy.
    auto *LastDef =
        cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
    auto *NewAccess = MSSAU->createMemoryAccessBefore(
        NewMemSet, LastDef->getDefiningAccess(), LastDef);
    MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
  }

  eraseInstruction(MemSet);
  return true;
}

/// Determine whether the instruction has undefined content for the given Size,
/// either because it was freshly alloca'd or because its lifetime has just
/// started.
static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
  if (isa<AllocaInst>(I))
    return true;

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
      if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
        if (LTSize->getZExtValue() >= Size->getZExtValue())
          return true;

  return false;
}

/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
///   memset(dst1, c, dst1_size);
///   memcpy(dst2, dst1, dst2_size);
/// \endcode
/// into:
/// \code
///   memset(dst1, c, dst1_size);
///   memset(dst2, c, dst2_size);
/// \endcode
/// when dst2_size <= dst1_size.
///
/// The \p MemCpy must have a Constant length.
bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
                                               MemSetInst *MemSet) {
  // Make sure we have memcpy(..., memset(...), ...); that is, the memcpy
  // source must be exactly the address that was memset. Otherwise it is hard
  // to reason about.
  if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
    return false;

  // A known memset size is required.
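  // (With a variable-length memset we could not prove that the memcpy reads
  // only bytes that the memset initialized.)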
  ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
  if (!MemSetSize)
    return false;

  // Make sure the memcpy doesn't read any more than what the memset wrote.
  // Don't worry about sizes larger than i64.
  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
  if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
    // If the memcpy is larger than the memset, but the memory was undef prior
    // to the memset, we can just ignore the tail. Technically we're only
    // interested in the bytes from MemSetSize..CopySize here, but as we can't
    // easily represent this location, we use the full 0..CopySize range.
    MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
    MemDepResult DepInfo = MD->getPointerDependencyFrom(
        MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
    if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
      CopySize = MemSetSize;
    else
      return false;
  }

  IRBuilder<> Builder(MemCpy);
  Instruction *NewM =
      Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
                           CopySize, MaybeAlign(MemCpy->getDestAlignment()));
  if (MSSAU) {
    auto *LastDef =
        cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
    auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
    MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
  }

  return true;
}

/// Perform simplification of memcpys. If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances). This allows later passes to remove the first memcpy
/// altogether.
bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
  // We can only optimize non-volatile memcpys.
  if (M->isVolatile()) return false;

  // If the source and destination of the memcpy are the same, then zap it.
  if (M->getSource() == M->getDest()) {
    ++BBI;
    eraseInstruction(M);
    return true;
  }

  // If copying from a constant, try to turn the memcpy into a memset.
  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
    if (GV->isConstant() && GV->hasDefinitiveInitializer())
      if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
                                           M->getModule()->getDataLayout())) {
        IRBuilder<> Builder(M);
        Instruction *NewM =
            Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
                                 MaybeAlign(M->getDestAlignment()), false);
        if (MSSAU) {
          auto *LastDef =
              cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
          auto *NewAccess =
              MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
          MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
        }

        eraseInstruction(M);
        ++NumCpyToSet;
        return true;
      }

  MemDepResult DepInfo = MD->getDependency(M);

  // Try to turn a partially redundant memset + memcpy into
  // memcpy + smaller memset. We don't need the memcpy size for this.
  if (DepInfo.isClobber())
    if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
      if (processMemSetMemCpyDependence(M, MDep))
        return true;

  // The optimizations after this point require the memcpy size.
  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
  if (!CopySize) return false;

  // There are four possible optimizations we can do for memcpy:
  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
  //   b) call-memcpy xform for return slot optimization.
  //   c) memcpy from freshly alloca'd space or space that has just started
  //      its lifetime copies undefined data, and we can therefore eliminate
  //      the memcpy in favor of the data that was already at the destination.
  //   d) memcpy from a just-memset'd source can be turned into memset.
  if (DepInfo.isClobber()) {
    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      // FIXME: Can we pass in either of dest/src alignment here instead
      // of conservatively taking the minimum?
      Align Alignment = std::min(M->getDestAlign().valueOrOne(),
                                 M->getSourceAlign().valueOrOne());
      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
                               CopySize->getZExtValue(), Alignment, C)) {
        eraseInstruction(M);
        ++NumMemCpyInstr;
        return true;
      }
    }
  }

  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
      SrcLoc, true, M->getIterator(), M->getParent());

  if (SrcDepInfo.isClobber()) {
    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
      return processMemCpyMemCpyDependence(M, MDep);
  } else if (SrcDepInfo.isDef()) {
    if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
      eraseInstruction(M);
      ++NumMemCpyInstr;
      return true;
    }
  }

  if (SrcDepInfo.isClobber())
    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
      if (performMemCpyToMemSetOptzn(M, MDep)) {
        eraseInstruction(M);
        ++NumCpyToSet;
        return true;
      }

  return false;
}

/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
  if (!TLI->has(LibFunc_memmove))
    return false;

  // See if the pointers alias.
  if (!AA->isNoAlias(MemoryLocation::getForDest(M),
                     MemoryLocation::getForSource(M)))
    return false;

  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
                    << "\n");

  // If not, then we know we can transform this.
  Type *ArgTys[3] = { M->getRawDest()->getType(),
                      M->getRawSource()->getType(),
                      M->getLength()->getType() };
  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
                                                 Intrinsic::memcpy, ArgTys));

  // For MemorySSA nothing really changes (except that memcpy may imply stricter
  // aliasing guarantees).

  // MemDep may have overly conservative information about this instruction;
  // just conservatively flush it from the cache.
  MD->removeInstruction(M);

  ++NumMoveToCpy;
  return true;
}

/// This is called on every byval argument in call sites.
bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
  const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
  // Find out what feeds this byval argument.
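  // The pattern we are looking for is, illustratively:
  //   memcpy(%tmp, %src, size)
  //   call void @f(%T* byval(%T) %tmp)
  // where, if the checks below succeed, %src can be passed to @f directly.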
  Value *ByValArg = CB.getArgOperand(ArgNo);
  Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
  uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
  MemDepResult DepInfo = MD->getPointerDependencyFrom(
      MemoryLocation(ByValArg, LocationSize::precise(ByValSize)), true,
      CB.getIterator(), CB.getParent());
  if (!DepInfo.isClobber())
    return false;

  // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
  // a memcpy, see if we can byval from the source of the memcpy instead of the
  // result.
  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
  if (!MDep || MDep->isVolatile() ||
      ByValArg->stripPointerCasts() != MDep->getDest())
    return false;

  // The length of the memcpy must be larger than or equal to the size of the
  // byval.
  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
  if (!C1 || C1->getValue().getZExtValue() < ByValSize)
    return false;

  // Get the alignment of the byval. If the call doesn't specify the alignment,
  // then it is some target-specific value that we can't know.
  MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
  if (!ByValAlign) return false;

  // If it is greater than the memcpy, then we check to see if we can force the
  // source of the memcpy to the alignment we need. If we fail, we bail out.
  MaybeAlign MemDepAlign = MDep->getSourceAlign();
  if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
      getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
                                 DT) < *ByValAlign)
    return false;

  // The address space of the memcpy source must match the byval argument.
  if (MDep->getSource()->getType()->getPointerAddressSpace() !=
      ByValArg->getType()->getPointerAddressSpace())
    return false;

  // Verify that the copied-from memory doesn't change in between the memcpy and
  // the byval call.
  //   memcpy(a <- b)
  //   *b = 42;
  //   foo(*a)
  // It would be invalid to transform the call into foo(*b).
  //
  // NOTE: This is conservative, it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep = MD->getPointerDependencyFrom(
      MemoryLocation::getForSource(MDep), false,
      CB.getIterator(), MDep->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  Value *TmpCast = MDep->getSource();
  if (MDep->getSource()->getType() != ByValArg->getType()) {
    BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(),
                                              ByValArg->getType(),
                                              "tmpcast", &CB);
    // Set the tmpcast's DebugLoc to MDep's.
    TmpBitCast->setDebugLoc(MDep->getDebugLoc());
    TmpCast = TmpBitCast;
  }

  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
                    << "  " << *MDep << "\n"
                    << "  " << CB << "\n");

  // Otherwise we're good! Update the byval argument.
  CB.setArgOperand(ArgNo, TmpCast);
  ++NumMemCpyInstr;
  return true;
}

/// Executes one iteration of MemCpyOptPass.
bool MemCpyOptPass::iterateOnFunction(Function &F) {
  bool MadeChange = false;

  // Walk all instructions in the function.
  for (BasicBlock &BB : F) {
    // Skip unreachable blocks. For example, processStore assumes that an
    // instruction in a BB can't be dominated by a later instruction in the
    // same BB (which is a scenario that can happen for an unreachable BB that
    // has itself as a predecessor).
    if (!DT->isReachableFromEntry(&BB))
      continue;

    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
      // Avoid invalidating the iterator.
      Instruction *I = &*BI++;

      bool RepeatInstruction = false;

      if (StoreInst *SI = dyn_cast<StoreInst>(I))
        MadeChange |= processStore(SI, BI);
      else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
        RepeatInstruction = processMemSet(M, BI);
      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
        RepeatInstruction = processMemCpy(M, BI);
      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M);
      else if (auto *CB = dyn_cast<CallBase>(I)) {
        for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
          if (CB->isByValArgument(i))
            MadeChange |= processByValArgument(*CB, i);
      }

      // Reprocess the instruction if desired.
      if (RepeatInstruction) {
        if (BI != BB.begin())
          --BI;
        MadeChange = true;
      }
    }
  }

  return MadeChange;
}

PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);

  bool MadeChange =
      runImpl(F, &MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
  if (!MadeChange)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<GlobalsAA>();
  PA.preserve<MemoryDependenceAnalysis>();
  if (MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
                            TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                            AssumptionCache *AC_, DominatorTree *DT_,
                            MemorySSA *MSSA_) {
  bool MadeChange = false;
  MD = MD_;
  TLI = TLI_;
  AA = AA_;
  AC = AC_;
  DT = DT_;
  MemorySSAUpdater MSSAU_(MSSA_);
  MSSAU = MSSA_ ? &MSSAU_ : nullptr;
  // If we don't have at least memset and memcpy, there is little point in doing
  // anything here. These are required by a freestanding implementation, so if
  // even they are disabled, there is no point in trying hard.
  if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
    return false;

  while (true) {
    if (!iterateOnFunction(F))
      break;
    MadeChange = true;
  }

  if (MSSA_ && VerifyMemorySSA)
    MSSA_->verifyMemorySSA();

  MD = nullptr;
  return MadeChange;
}

/// This is the main transformation entry point for a function.
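/// It collects the analyses registered with the legacy pass manager and
/// delegates to MemCpyOptPass::runImpl.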
bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();

  return Impl.runImpl(F, MD, TLI, AA, AC, DT,
                      MSSAWP ? &MSSAWP->getMSSA() : nullptr);
}