1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// \file 10 /// This transformation implements the well known scalar replacement of 11 /// aggregates transformation. It tries to identify promotable elements of an 12 /// aggregate alloca, and promote them to registers. It will also try to 13 /// convert uses of an element (or set of elements) of an alloca into a vector 14 /// or bitfield-style integer scalar if appropriate. 15 /// 16 /// It works to do this with minimal slicing of the alloca so that regions 17 /// which are merely transferred in and out of external memory remain unchanged 18 /// and are not decomposed to scalar code. 19 /// 20 /// Because this also performs alloca promotion, it can be thought of as also 21 /// serving the purpose of SSA formation. The algorithm iterates on the 22 /// function until all opportunities for promotion have been realized. 23 /// 24 //===----------------------------------------------------------------------===// 25 26 #define DEBUG_TYPE "sroa" 27 #include "llvm/Transforms/Scalar.h" 28 #include "llvm/ADT/STLExtras.h" 29 #include "llvm/ADT/SetVector.h" 30 #include "llvm/ADT/SmallVector.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/Analysis/Dominators.h" 33 #include "llvm/Analysis/Loads.h" 34 #include "llvm/Analysis/PtrUseVisitor.h" 35 #include "llvm/Analysis/ValueTracking.h" 36 #include "llvm/DIBuilder.h" 37 #include "llvm/DebugInfo.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/DataLayout.h" 40 #include "llvm/IR/DerivedTypes.h" 41 #include "llvm/IR/Function.h" 42 #include "llvm/IR/IRBuilder.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/IntrinsicInst.h" 45 #include "llvm/IR/LLVMContext.h" 46 #include "llvm/IR/Operator.h" 47 #include "llvm/InstVisitor.h" 48 #include "llvm/Pass.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/Debug.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Support/MathExtras.h" 53 #include "llvm/Support/raw_ostream.h" 54 #include "llvm/Transforms/Utils/Local.h" 55 #include "llvm/Transforms/Utils/PromoteMemToReg.h" 56 #include "llvm/Transforms/Utils/SSAUpdater.h" 57 using namespace llvm; 58 59 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 60 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); 61 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions"); 62 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found"); 63 STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses"); 64 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 65 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 66 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 67 STATISTIC(NumDeleted, "Number of instructions deleted"); 68 STATISTIC(NumVectorized, "Number of vectorized aggregates"); 69 70 /// Hidden option to force the pass to not use DomTree and mem2reg, instead 71 /// forming SSA values through the SSAUpdater infrastructure. 72 static cl::opt<bool> 73 ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 74 75 namespace { 76 /// \brief Provide a typedef for IRBuilder that drops names in release builds. 77 #ifndef NDEBUG 78 typedef llvm::IRBuilder<> IRBuilderTy; 79 #else 80 typedef llvm::IRBuilder<false> IRBuilderTy; 81 #endif 82 } 83 84 namespace { 85 /// \brief A common base class for representing a half-open byte range. 86 struct ByteRange { 87 /// \brief The beginning offset of the range. 88 uint64_t BeginOffset; 89 90 /// \brief The ending offset, not included in the range. 91 uint64_t EndOffset; 92 93 ByteRange() : BeginOffset(), EndOffset() {} 94 ByteRange(uint64_t BeginOffset, uint64_t EndOffset) 95 : BeginOffset(BeginOffset), EndOffset(EndOffset) {} 96 97 /// \brief Support for ordering ranges. 98 /// 99 /// This provides an ordering over ranges such that start offsets are 100 /// always increasing, and within equal start offsets, the end offsets are 101 /// decreasing. Thus the spanning range comes first in a cluster with the 102 /// same start position. 103 bool operator<(const ByteRange &RHS) const { 104 if (BeginOffset < RHS.BeginOffset) return true; 105 if (BeginOffset > RHS.BeginOffset) return false; 106 if (EndOffset > RHS.EndOffset) return true; 107 return false; 108 } 109 110 /// \brief Support comparison with a single offset to allow binary searches. 111 friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { 112 return LHS.BeginOffset < RHSOffset; 113 } 114 115 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 116 const ByteRange &RHS) { 117 return LHSOffset < RHS.BeginOffset; 118 } 119 120 bool operator==(const ByteRange &RHS) const { 121 return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; 122 } 123 bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } 124 }; 125 126 /// \brief A partition of an alloca. 127 /// 128 /// This structure represents a contiguous partition of the alloca. These are 129 /// formed by examining the uses of the alloca. During formation, they may 130 /// overlap but once an AllocaPartitioning is built, the Partitions within it 131 /// are all disjoint. 132 struct Partition : public ByteRange { 133 /// \brief Whether this partition is splittable into smaller partitions. 134 /// 135 /// We flag partitions as splittable when they are formed entirely due to 136 /// accesses by trivially splittable operations such as memset and memcpy. 137 bool IsSplittable; 138 139 /// \brief Test whether a partition has been marked as dead. 140 bool isDead() const { 141 if (BeginOffset == UINT64_MAX) { 142 assert(EndOffset == UINT64_MAX); 143 return true; 144 } 145 return false; 146 } 147 148 /// \brief Kill a partition. 149 /// This is accomplished by setting both its beginning and end offset to 150 /// the maximum possible value. 151 void kill() { 152 assert(!isDead() && "He's Dead, Jim!"); 153 BeginOffset = EndOffset = UINT64_MAX; 154 } 155 156 Partition() : ByteRange(), IsSplittable() {} 157 Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) 158 : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} 159 }; 160 161 /// \brief A particular use of a partition of the alloca. 162 /// 163 /// This structure is used to associate uses of a partition with it. They 164 /// mark the range of bytes which are referenced by a particular instruction, 165 /// and includes a handle to the user itself and the pointer value in use. 166 /// The bounds of these uses are determined by intersecting the bounds of the 167 /// memory use itself with a particular partition. As a consequence there is 168 /// intentionally overlap between various uses of the same partition. 169 class PartitionUse : public ByteRange { 170 /// \brief Combined storage for both the Use* and split state. 171 PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit; 172 173 public: 174 PartitionUse() : ByteRange(), UsePtrAndIsSplit() {} 175 PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U, 176 bool IsSplit) 177 : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {} 178 179 /// \brief The use in question. Provides access to both user and used value. 180 /// 181 /// Note that this may be null if the partition use is *dead*, that is, it 182 /// should be ignored. 183 Use *getUse() const { return UsePtrAndIsSplit.getPointer(); } 184 185 /// \brief Set the use for this partition use range. 186 void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); } 187 188 /// \brief Whether this use is split across multiple partitions. 189 bool isSplit() const { return UsePtrAndIsSplit.getInt(); } 190 }; 191 } 192 193 namespace llvm { 194 template <> struct isPodLike<Partition> : llvm::true_type {}; 195 template <> struct isPodLike<PartitionUse> : llvm::true_type {}; 196 } 197 198 namespace { 199 /// \brief Alloca partitioning representation. 200 /// 201 /// This class represents a partitioning of an alloca into slices, and 202 /// information about the nature of uses of each slice of the alloca. The goal 203 /// is that this information is sufficient to decide if and how to split the 204 /// alloca apart and replace slices with scalars. It is also intended that this 205 /// structure can capture the relevant information needed both to decide about 206 /// and to enact these transformations. 207 class AllocaPartitioning { 208 public: 209 /// \brief Construct a partitioning of a particular alloca. 210 /// 211 /// Construction does most of the work for partitioning the alloca. This 212 /// performs the necessary walks of users and builds a partitioning from it. 213 AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); 214 215 /// \brief Test whether a pointer to the allocation escapes our analysis. 216 /// 217 /// If this is true, the partitioning is never fully built and should be 218 /// ignored. 219 bool isEscaped() const { return PointerEscapingInstr; } 220 221 /// \brief Support for iterating over the partitions. 222 /// @{ 223 typedef SmallVectorImpl<Partition>::iterator iterator; 224 iterator begin() { return Partitions.begin(); } 225 iterator end() { return Partitions.end(); } 226 227 typedef SmallVectorImpl<Partition>::const_iterator const_iterator; 228 const_iterator begin() const { return Partitions.begin(); } 229 const_iterator end() const { return Partitions.end(); } 230 /// @} 231 232 /// \brief Support for iterating over and manipulating a particular 233 /// partition's uses. 234 /// 235 /// The iteration support provided for uses is more limited, but also 236 /// includes some manipulation routines to support rewriting the uses of 237 /// partitions during SROA. 238 /// @{ 239 typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; 240 use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } 241 use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } 242 use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } 243 use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } 244 245 typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; 246 const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } 247 const_use_iterator use_begin(const_iterator I) const { 248 return Uses[I - begin()].begin(); 249 } 250 const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } 251 const_use_iterator use_end(const_iterator I) const { 252 return Uses[I - begin()].end(); 253 } 254 255 unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } 256 unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } 257 const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { 258 return Uses[PIdx][UIdx]; 259 } 260 const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { 261 return Uses[I - begin()][UIdx]; 262 } 263 264 void use_push_back(unsigned Idx, const PartitionUse &PU) { 265 Uses[Idx].push_back(PU); 266 } 267 void use_push_back(const_iterator I, const PartitionUse &PU) { 268 Uses[I - begin()].push_back(PU); 269 } 270 /// @} 271 272 /// \brief Allow iterating the dead users for this alloca. 273 /// 274 /// These are instructions which will never actually use the alloca as they 275 /// are outside the allocated range. They are safe to replace with undef and 276 /// delete. 277 /// @{ 278 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 279 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 280 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 281 /// @} 282 283 /// \brief Allow iterating the dead expressions referring to this alloca. 284 /// 285 /// These are operands which have cannot actually be used to refer to the 286 /// alloca as they are outside its range and the user doesn't correct for 287 /// that. These mostly consist of PHI node inputs and the like which we just 288 /// need to replace with undef. 289 /// @{ 290 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 291 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 292 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 293 /// @} 294 295 /// \brief MemTransferInst auxiliary data. 296 /// This struct provides some auxiliary data about memory transfer 297 /// intrinsics such as memcpy and memmove. These intrinsics can use two 298 /// different ranges within the same alloca, and provide other challenges to 299 /// correctly represent. We stash extra data to help us untangle this 300 /// after the partitioning is complete. 301 struct MemTransferOffsets { 302 /// The destination begin and end offsets when the destination is within 303 /// this alloca. If the end offset is zero the destination is not within 304 /// this alloca. 305 uint64_t DestBegin, DestEnd; 306 307 /// The source begin and end offsets when the source is within this alloca. 308 /// If the end offset is zero, the source is not within this alloca. 309 uint64_t SourceBegin, SourceEnd; 310 311 /// Flag for whether an alloca is splittable. 312 bool IsSplittable; 313 }; 314 MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { 315 return MemTransferInstData.lookup(&II); 316 } 317 318 /// \brief Map from a PHI or select operand back to a partition. 319 /// 320 /// When manipulating PHI nodes or selects, they can use more than one 321 /// partition of an alloca. We store a special mapping to allow finding the 322 /// partition referenced by each of these operands, if any. 323 iterator findPartitionForPHIOrSelectOperand(Use *U) { 324 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 325 = PHIOrSelectOpMap.find(U); 326 if (MapIt == PHIOrSelectOpMap.end()) 327 return end(); 328 329 return begin() + MapIt->second.first; 330 } 331 332 /// \brief Map from a PHI or select operand back to the specific use of 333 /// a partition. 334 /// 335 /// Similar to mapping these operands back to the partitions, this maps 336 /// directly to the use structure of that partition. 337 use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { 338 SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt 339 = PHIOrSelectOpMap.find(U); 340 assert(MapIt != PHIOrSelectOpMap.end()); 341 return Uses[MapIt->second.first].begin() + MapIt->second.second; 342 } 343 344 /// \brief Compute a common type among the uses of a particular partition. 345 /// 346 /// This routines walks all of the uses of a particular partition and tries 347 /// to find a common type between them. Untyped operations such as memset and 348 /// memcpy are ignored. 349 Type *getCommonType(iterator I) const; 350 351 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 352 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 353 void printUsers(raw_ostream &OS, const_iterator I, 354 StringRef Indent = " ") const; 355 void print(raw_ostream &OS) const; 356 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; 357 void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; 358 #endif 359 360 private: 361 template <typename DerivedT, typename RetT = void> class BuilderBase; 362 class PartitionBuilder; 363 friend class AllocaPartitioning::PartitionBuilder; 364 class UseBuilder; 365 friend class AllocaPartitioning::UseBuilder; 366 367 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 368 /// \brief Handle to alloca instruction to simplify method interfaces. 369 AllocaInst &AI; 370 #endif 371 372 /// \brief The instruction responsible for this alloca having no partitioning. 373 /// 374 /// When an instruction (potentially) escapes the pointer to the alloca, we 375 /// store a pointer to that here and abort trying to partition the alloca. 376 /// This will be null if the alloca is partitioned successfully. 377 Instruction *PointerEscapingInstr; 378 379 /// \brief The partitions of the alloca. 380 /// 381 /// We store a vector of the partitions over the alloca here. This vector is 382 /// sorted by increasing begin offset, and then by decreasing end offset. See 383 /// the Partition inner class for more details. Initially (during 384 /// construction) there are overlaps, but we form a disjoint sequence of 385 /// partitions while finishing construction and a fully constructed object is 386 /// expected to always have this as a disjoint space. 387 SmallVector<Partition, 8> Partitions; 388 389 /// \brief The uses of the partitions. 390 /// 391 /// This is essentially a mapping from each partition to a list of uses of 392 /// that partition. The mapping is done with a Uses vector that has the exact 393 /// same number of entries as the partition vector. Each entry is itself 394 /// a vector of the uses. 395 SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; 396 397 /// \brief Instructions which will become dead if we rewrite the alloca. 398 /// 399 /// Note that these are not separated by partition. This is because we expect 400 /// a partitioned alloca to be completely rewritten or not rewritten at all. 401 /// If rewritten, all these instructions can simply be removed and replaced 402 /// with undef as they come from outside of the allocated space. 403 SmallVector<Instruction *, 8> DeadUsers; 404 405 /// \brief Operands which will become dead if we rewrite the alloca. 406 /// 407 /// These are operands that in their particular use can be replaced with 408 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 409 /// to PHI nodes and the like. They aren't entirely dead (there might be 410 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 411 /// want to swap this particular input for undef to simplify the use lists of 412 /// the alloca. 413 SmallVector<Use *, 8> DeadOperands; 414 415 /// \brief The underlying storage for auxiliary memcpy and memset info. 416 SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; 417 418 /// \brief A side datastructure used when building up the partitions and uses. 419 /// 420 /// This mapping is only really used during the initial building of the 421 /// partitioning so that we can retain information about PHI and select nodes 422 /// processed. 423 SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; 424 425 /// \brief Auxiliary information for particular PHI or select operands. 426 SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; 427 428 /// \brief A utility routine called from the constructor. 429 /// 430 /// This does what it says on the tin. It is the key of the alloca partition 431 /// splitting and merging. After it is called we have the desired disjoint 432 /// collection of partitions. 433 void splitAndMergePartitions(); 434 }; 435 } 436 437 static Value *foldSelectInst(SelectInst &SI) { 438 // If the condition being selected on is a constant or the same value is 439 // being selected between, fold the select. Yes this does (rarely) happen 440 // early on. 441 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 442 return SI.getOperand(1+CI->isZero()); 443 if (SI.getOperand(1) == SI.getOperand(2)) 444 return SI.getOperand(1); 445 446 return 0; 447 } 448 449 /// \brief Builder for the alloca partitioning. 450 /// 451 /// This class builds an alloca partitioning by recursively visiting the uses 452 /// of an alloca and splitting the partitions for each load and store at each 453 /// offset. 454 class AllocaPartitioning::PartitionBuilder 455 : public PtrUseVisitor<PartitionBuilder> { 456 friend class PtrUseVisitor<PartitionBuilder>; 457 friend class InstVisitor<PartitionBuilder>; 458 typedef PtrUseVisitor<PartitionBuilder> Base; 459 460 const uint64_t AllocSize; 461 AllocaPartitioning &P; 462 463 SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; 464 465 public: 466 PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) 467 : PtrUseVisitor<PartitionBuilder>(DL), 468 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), 469 P(P) {} 470 471 private: 472 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 473 bool IsSplittable = false) { 474 // Completely skip uses which have a zero size or start either before or 475 // past the end of the allocation. 476 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { 477 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 478 << " which has zero size or starts outside of the " 479 << AllocSize << " byte alloca:\n" 480 << " alloca: " << P.AI << "\n" 481 << " use: " << I << "\n"); 482 return; 483 } 484 485 uint64_t BeginOffset = Offset.getZExtValue(); 486 uint64_t EndOffset = BeginOffset + Size; 487 488 // Clamp the end offset to the end of the allocation. Note that this is 489 // formulated to handle even the case where "BeginOffset + Size" overflows. 490 // This may appear superficially to be something we could ignore entirely, 491 // but that is not so! There may be widened loads or PHI-node uses where 492 // some instructions are dead but not others. We can't completely ignore 493 // them, and so have to record at least the information here. 494 assert(AllocSize >= BeginOffset); // Established above. 495 if (Size > AllocSize - BeginOffset) { 496 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 497 << " to remain within the " << AllocSize << " byte alloca:\n" 498 << " alloca: " << P.AI << "\n" 499 << " use: " << I << "\n"); 500 EndOffset = AllocSize; 501 } 502 503 Partition New(BeginOffset, EndOffset, IsSplittable); 504 P.Partitions.push_back(New); 505 } 506 507 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, 508 uint64_t Size, bool IsVolatile) { 509 // We allow splitting of loads and stores where the type is an integer type 510 // and cover the entire alloca. This prevents us from splitting over 511 // eagerly. 512 // FIXME: In the great blue eventually, we should eagerly split all integer 513 // loads and stores, and then have a separate step that merges adjacent 514 // alloca partitions into a single partition suitable for integer widening. 515 // Or we should skip the merge step and rely on GVN and other passes to 516 // merge adjacent loads and stores that survive mem2reg. 517 bool IsSplittable = 518 Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; 519 520 insertUse(I, Offset, Size, IsSplittable); 521 } 522 523 void visitLoadInst(LoadInst &LI) { 524 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 525 "All simple FCA loads should have been pre-split"); 526 527 if (!IsOffsetKnown) 528 return PI.setAborted(&LI); 529 530 uint64_t Size = DL.getTypeStoreSize(LI.getType()); 531 return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); 532 } 533 534 void visitStoreInst(StoreInst &SI) { 535 Value *ValOp = SI.getValueOperand(); 536 if (ValOp == *U) 537 return PI.setEscapedAndAborted(&SI); 538 if (!IsOffsetKnown) 539 return PI.setAborted(&SI); 540 541 uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); 542 543 // If this memory access can be shown to *statically* extend outside the 544 // bounds of of the allocation, it's behavior is undefined, so simply 545 // ignore it. Note that this is more strict than the generic clamping 546 // behavior of insertUse. We also try to handle cases which might run the 547 // risk of overflow. 548 // FIXME: We should instead consider the pointer to have escaped if this 549 // function is being instrumented for addressing bugs or race conditions. 550 if (Offset.isNegative() || Size > AllocSize || 551 Offset.ugt(AllocSize - Size)) { 552 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset 553 << " which extends past the end of the " << AllocSize 554 << " byte alloca:\n" 555 << " alloca: " << P.AI << "\n" 556 << " use: " << SI << "\n"); 557 return; 558 } 559 560 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 561 "All simple FCA stores should have been pre-split"); 562 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); 563 } 564 565 566 void visitMemSetInst(MemSetInst &II) { 567 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 568 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 569 if ((Length && Length->getValue() == 0) || 570 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 571 // Zero-length mem transfer intrinsics can be ignored entirely. 572 return; 573 574 if (!IsOffsetKnown) 575 return PI.setAborted(&II); 576 577 insertUse(II, Offset, 578 Length ? Length->getLimitedValue() 579 : AllocSize - Offset.getLimitedValue(), 580 (bool)Length); 581 } 582 583 void visitMemTransferInst(MemTransferInst &II) { 584 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 585 if ((Length && Length->getValue() == 0) || 586 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 587 // Zero-length mem transfer intrinsics can be ignored entirely. 588 return; 589 590 if (!IsOffsetKnown) 591 return PI.setAborted(&II); 592 593 uint64_t RawOffset = Offset.getLimitedValue(); 594 uint64_t Size = Length ? Length->getLimitedValue() 595 : AllocSize - RawOffset; 596 597 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 598 599 // Only intrinsics with a constant length can be split. 600 Offsets.IsSplittable = Length; 601 602 if (*U == II.getRawDest()) { 603 Offsets.DestBegin = RawOffset; 604 Offsets.DestEnd = RawOffset + Size; 605 } 606 if (*U == II.getRawSource()) { 607 Offsets.SourceBegin = RawOffset; 608 Offsets.SourceEnd = RawOffset + Size; 609 } 610 611 // If we have set up end offsets for both the source and the destination, 612 // we have found both sides of this transfer pointing at the same alloca. 613 bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; 614 if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { 615 unsigned PrevIdx = MemTransferPartitionMap[&II]; 616 617 // Check if the begin offsets match and this is a non-volatile transfer. 618 // In that case, we can completely elide the transfer. 619 if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { 620 P.Partitions[PrevIdx].kill(); 621 return; 622 } 623 624 // Otherwise we have an offset transfer within the same alloca. We can't 625 // split those. 626 P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; 627 } else if (SeenBothEnds) { 628 // Handle the case where this exact use provides both ends of the 629 // operation. 630 assert(II.getRawDest() == II.getRawSource()); 631 632 // For non-volatile transfers this is a no-op. 633 if (!II.isVolatile()) 634 return; 635 636 // Otherwise just suppress splitting. 637 Offsets.IsSplittable = false; 638 } 639 640 641 // Insert the use now that we've fixed up the splittable nature. 642 insertUse(II, Offset, Size, Offsets.IsSplittable); 643 644 // Setup the mapping from intrinsic to partition of we've not seen both 645 // ends of this transfer. 646 if (!SeenBothEnds) { 647 unsigned NewIdx = P.Partitions.size() - 1; 648 bool Inserted 649 = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; 650 assert(Inserted && 651 "Already have intrinsic in map but haven't seen both ends"); 652 (void)Inserted; 653 } 654 } 655 656 // Disable SRoA for any intrinsics except for lifetime invariants. 657 // FIXME: What about debug intrinsics? This matches old behavior, but 658 // doesn't make sense. 659 void visitIntrinsicInst(IntrinsicInst &II) { 660 if (!IsOffsetKnown) 661 return PI.setAborted(&II); 662 663 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 664 II.getIntrinsicID() == Intrinsic::lifetime_end) { 665 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 666 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 667 Length->getLimitedValue()); 668 insertUse(II, Offset, Size, true); 669 return; 670 } 671 672 Base::visitIntrinsicInst(II); 673 } 674 675 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 676 // We consider any PHI or select that results in a direct load or store of 677 // the same offset to be a viable use for partitioning purposes. These uses 678 // are considered unsplittable and the size is the maximum loaded or stored 679 // size. 680 SmallPtrSet<Instruction *, 4> Visited; 681 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 682 Visited.insert(Root); 683 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 684 // If there are no loads or stores, the access is dead. We mark that as 685 // a size zero access. 686 Size = 0; 687 do { 688 Instruction *I, *UsedI; 689 llvm::tie(UsedI, I) = Uses.pop_back_val(); 690 691 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 692 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 693 continue; 694 } 695 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 696 Value *Op = SI->getOperand(0); 697 if (Op == UsedI) 698 return SI; 699 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 700 continue; 701 } 702 703 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 704 if (!GEP->hasAllZeroIndices()) 705 return GEP; 706 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 707 !isa<SelectInst>(I)) { 708 return I; 709 } 710 711 for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; 712 ++UI) 713 if (Visited.insert(cast<Instruction>(*UI))) 714 Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); 715 } while (!Uses.empty()); 716 717 return 0; 718 } 719 720 void visitPHINode(PHINode &PN) { 721 if (PN.use_empty()) 722 return; 723 if (!IsOffsetKnown) 724 return PI.setAborted(&PN); 725 726 // See if we already have computed info on this node. 727 std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; 728 if (PHIInfo.first) { 729 PHIInfo.second = true; 730 insertUse(PN, Offset, PHIInfo.first); 731 return; 732 } 733 734 // Check for an unsafe use of the PHI node. 735 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) 736 return PI.setAborted(UnsafeI); 737 738 insertUse(PN, Offset, PHIInfo.first); 739 } 740 741 void visitSelectInst(SelectInst &SI) { 742 if (SI.use_empty()) 743 return; 744 if (Value *Result = foldSelectInst(SI)) { 745 if (Result == *U) 746 // If the result of the constant fold will be the pointer, recurse 747 // through the select as if we had RAUW'ed it. 748 enqueueUsers(SI); 749 750 return; 751 } 752 if (!IsOffsetKnown) 753 return PI.setAborted(&SI); 754 755 // See if we already have computed info on this node. 756 std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; 757 if (SelectInfo.first) { 758 SelectInfo.second = true; 759 insertUse(SI, Offset, SelectInfo.first); 760 return; 761 } 762 763 // Check for an unsafe use of the PHI node. 764 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) 765 return PI.setAborted(UnsafeI); 766 767 insertUse(SI, Offset, SelectInfo.first); 768 } 769 770 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 771 void visitInstruction(Instruction &I) { 772 PI.setAborted(&I); 773 } 774 }; 775 776 /// \brief Use adder for the alloca partitioning. 777 /// 778 /// This class adds the uses of an alloca to all of the partitions which they 779 /// use. For splittable partitions, this can end up doing essentially a linear 780 /// walk of the partitions, but the number of steps remains bounded by the 781 /// total result instruction size: 782 /// - The number of partitions is a result of the number unsplittable 783 /// instructions using the alloca. 784 /// - The number of users of each partition is at worst the total number of 785 /// splittable instructions using the alloca. 786 /// Thus we will produce N * M instructions in the end, where N are the number 787 /// of unsplittable uses and M are the number of splittable. This visitor does 788 /// the exact same number of updates to the partitioning. 789 /// 790 /// In the more common case, this visitor will leverage the fact that the 791 /// partition space is pre-sorted, and do a logarithmic search for the 792 /// partition needed, making the total visit a classical ((N + M) * log(N)) 793 /// complexity operation. 794 class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { 795 friend class PtrUseVisitor<UseBuilder>; 796 friend class InstVisitor<UseBuilder>; 797 typedef PtrUseVisitor<UseBuilder> Base; 798 799 const uint64_t AllocSize; 800 AllocaPartitioning &P; 801 802 /// \brief Set to de-duplicate dead instructions found in the use walk. 803 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 804 805 public: 806 UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) 807 : PtrUseVisitor<UseBuilder>(TD), 808 AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), 809 P(P) {} 810 811 private: 812 void markAsDead(Instruction &I) { 813 if (VisitedDeadInsts.insert(&I)) 814 P.DeadUsers.push_back(&I); 815 } 816 817 void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { 818 // If the use has a zero size or extends outside of the allocation, record 819 // it as a dead use for elimination later. 820 if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) 821 return markAsDead(User); 822 823 uint64_t BeginOffset = Offset.getZExtValue(); 824 uint64_t EndOffset = BeginOffset + Size; 825 826 // Clamp the end offset to the end of the allocation. Note that this is 827 // formulated to handle even the case where "BeginOffset + Size" overflows. 828 assert(AllocSize >= BeginOffset); // Established above. 829 if (Size > AllocSize - BeginOffset) 830 EndOffset = AllocSize; 831 832 // NB: This only works if we have zero overlapping partitions. 833 iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset); 834 if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset) 835 I = llvm::prior(I); 836 iterator E = P.end(); 837 bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset; 838 for (; I != E && I->BeginOffset < EndOffset; ++I) { 839 PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), 840 std::min(I->EndOffset, EndOffset), U, IsSplit); 841 P.use_push_back(I, NewPU); 842 if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) 843 P.PHIOrSelectOpMap[U] 844 = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); 845 } 846 } 847 848 void visitBitCastInst(BitCastInst &BC) { 849 if (BC.use_empty()) 850 return markAsDead(BC); 851 852 return Base::visitBitCastInst(BC); 853 } 854 855 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 856 if (GEPI.use_empty()) 857 return markAsDead(GEPI); 858 859 return Base::visitGetElementPtrInst(GEPI); 860 } 861 862 void visitLoadInst(LoadInst &LI) { 863 assert(IsOffsetKnown); 864 uint64_t Size = DL.getTypeStoreSize(LI.getType()); 865 insertUse(LI, Offset, Size); 866 } 867 868 void visitStoreInst(StoreInst &SI) { 869 assert(IsOffsetKnown); 870 uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType()); 871 872 // If this memory access can be shown to *statically* extend outside the 873 // bounds of of the allocation, it's behavior is undefined, so simply 874 // ignore it. Note that this is more strict than the generic clamping 875 // behavior of insertUse. 876 if (Offset.isNegative() || Size > AllocSize || 877 Offset.ugt(AllocSize - Size)) 878 return markAsDead(SI); 879 880 insertUse(SI, Offset, Size); 881 } 882 883 void visitMemSetInst(MemSetInst &II) { 884 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 885 if ((Length && Length->getValue() == 0) || 886 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 887 return markAsDead(II); 888 889 assert(IsOffsetKnown); 890 insertUse(II, Offset, Length ? Length->getLimitedValue() 891 : AllocSize - Offset.getLimitedValue()); 892 } 893 894 void visitMemTransferInst(MemTransferInst &II) { 895 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 896 if ((Length && Length->getValue() == 0) || 897 (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) 898 return markAsDead(II); 899 900 assert(IsOffsetKnown); 901 uint64_t Size = Length ? Length->getLimitedValue() 902 : AllocSize - Offset.getLimitedValue(); 903 904 MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; 905 if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && 906 Offsets.DestBegin == Offsets.SourceBegin) 907 return markAsDead(II); // Skip identity transfers without side-effects. 908 909 insertUse(II, Offset, Size); 910 } 911 912 void visitIntrinsicInst(IntrinsicInst &II) { 913 assert(IsOffsetKnown); 914 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 915 II.getIntrinsicID() == Intrinsic::lifetime_end); 916 917 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 918 insertUse(II, Offset, std::min(Length->getLimitedValue(), 919 AllocSize - Offset.getLimitedValue())); 920 } 921 922 void insertPHIOrSelect(Instruction &User, const APInt &Offset) { 923 uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; 924 925 // For PHI and select operands outside the alloca, we can't nuke the entire 926 // phi or select -- the other side might still be relevant, so we special 927 // case them here and use a separate structure to track the operands 928 // themselves which should be replaced with undef. 929 if ((Offset.isNegative() && Offset.uge(Size)) || 930 (!Offset.isNegative() && Offset.uge(AllocSize))) { 931 P.DeadOperands.push_back(U); 932 return; 933 } 934 935 insertUse(User, Offset, Size); 936 } 937 938 void visitPHINode(PHINode &PN) { 939 if (PN.use_empty()) 940 return markAsDead(PN); 941 942 assert(IsOffsetKnown); 943 insertPHIOrSelect(PN, Offset); 944 } 945 946 void visitSelectInst(SelectInst &SI) { 947 if (SI.use_empty()) 948 return markAsDead(SI); 949 950 if (Value *Result = foldSelectInst(SI)) { 951 if (Result == *U) 952 // If the result of the constant fold will be the pointer, recurse 953 // through the select as if we had RAUW'ed it. 954 enqueueUsers(SI); 955 else 956 // Otherwise the operand to the select is dead, and we can replace it 957 // with undef. 958 P.DeadOperands.push_back(U); 959 960 return; 961 } 962 963 assert(IsOffsetKnown); 964 insertPHIOrSelect(SI, Offset); 965 } 966 967 /// \brief Unreachable, we've already visited the alloca once. 968 void visitInstruction(Instruction &I) { 969 llvm_unreachable("Unhandled instruction in use builder."); 970 } 971 }; 972 973 void AllocaPartitioning::splitAndMergePartitions() { 974 size_t NumDeadPartitions = 0; 975 976 // Track the range of splittable partitions that we pass when accumulating 977 // overlapping unsplittable partitions. 978 uint64_t SplitEndOffset = 0ull; 979 980 Partition New(0ull, 0ull, false); 981 982 for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { 983 ++j; 984 985 if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { 986 assert(New.BeginOffset == New.EndOffset); 987 New = Partitions[i]; 988 } else { 989 assert(New.IsSplittable); 990 New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); 991 } 992 assert(New.BeginOffset != New.EndOffset); 993 994 // Scan the overlapping partitions. 995 while (j != e && New.EndOffset > Partitions[j].BeginOffset) { 996 // If the new partition we are forming is splittable, stop at the first 997 // unsplittable partition. 998 if (New.IsSplittable && !Partitions[j].IsSplittable) 999 break; 1000 1001 // Grow the new partition to include any equally splittable range. 'j' is 1002 // always equally splittable when New is splittable, but when New is not 1003 // splittable, we may subsume some (or part of some) splitable partition 1004 // without growing the new one. 1005 if (New.IsSplittable == Partitions[j].IsSplittable) { 1006 New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); 1007 } else { 1008 assert(!New.IsSplittable); 1009 assert(Partitions[j].IsSplittable); 1010 SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); 1011 } 1012 1013 Partitions[j].kill(); 1014 ++NumDeadPartitions; 1015 ++j; 1016 } 1017 1018 // If the new partition is splittable, chop off the end as soon as the 1019 // unsplittable subsequent partition starts and ensure we eventually cover 1020 // the splittable area. 1021 if (j != e && New.IsSplittable) { 1022 SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); 1023 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1024 } 1025 1026 // Add the new partition if it differs from the original one and is 1027 // non-empty. We can end up with an empty partition here if it was 1028 // splittable but there is an unsplittable one that starts at the same 1029 // offset. 1030 if (New != Partitions[i]) { 1031 if (New.BeginOffset != New.EndOffset) 1032 Partitions.push_back(New); 1033 // Mark the old one for removal. 1034 Partitions[i].kill(); 1035 ++NumDeadPartitions; 1036 } 1037 1038 New.BeginOffset = New.EndOffset; 1039 if (!New.IsSplittable) { 1040 New.EndOffset = std::max(New.EndOffset, SplitEndOffset); 1041 if (j != e && !Partitions[j].IsSplittable) 1042 New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); 1043 New.IsSplittable = true; 1044 // If there is a trailing splittable partition which won't be fused into 1045 // the next splittable partition go ahead and add it onto the partitions 1046 // list. 1047 if (New.BeginOffset < New.EndOffset && 1048 (j == e || !Partitions[j].IsSplittable || 1049 New.EndOffset < Partitions[j].BeginOffset)) { 1050 Partitions.push_back(New); 1051 New.BeginOffset = New.EndOffset = 0ull; 1052 } 1053 } 1054 } 1055 1056 // Re-sort the partitions now that they have been split and merged into 1057 // disjoint set of partitions. Also remove any of the dead partitions we've 1058 // replaced in the process. 1059 std::sort(Partitions.begin(), Partitions.end()); 1060 if (NumDeadPartitions) { 1061 assert(Partitions.back().isDead()); 1062 assert((ptrdiff_t)NumDeadPartitions == 1063 std::count(Partitions.begin(), Partitions.end(), Partitions.back())); 1064 } 1065 Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); 1066 } 1067 1068 AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) 1069 : 1070 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1071 AI(AI), 1072 #endif 1073 PointerEscapingInstr(0) { 1074 PartitionBuilder PB(TD, AI, *this); 1075 PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1076 if (PtrI.isEscaped() || PtrI.isAborted()) { 1077 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1078 // possibly by just storing the PtrInfo in the AllocaPartitioning. 1079 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1080 : PtrI.getAbortingInst(); 1081 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1082 return; 1083 } 1084 1085 // Sort the uses. This arranges for the offsets to be in ascending order, 1086 // and the sizes to be in descending order. 1087 std::sort(Partitions.begin(), Partitions.end()); 1088 1089 // Remove any partitions from the back which are marked as dead. 1090 while (!Partitions.empty() && Partitions.back().isDead()) 1091 Partitions.pop_back(); 1092 1093 if (Partitions.size() > 1) { 1094 // Intersect splittability for all partitions with equal offsets and sizes. 1095 // Then remove all but the first so that we have a sequence of non-equal but 1096 // potentially overlapping partitions. 1097 for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; 1098 I = J) { 1099 ++J; 1100 while (J != E && *I == *J) { 1101 I->IsSplittable &= J->IsSplittable; 1102 ++J; 1103 } 1104 } 1105 Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), 1106 Partitions.end()); 1107 1108 // Split splittable and merge unsplittable partitions into a disjoint set 1109 // of partitions over the used space of the allocation. 1110 splitAndMergePartitions(); 1111 } 1112 1113 // Record how many partitions we end up with. 1114 NumAllocaPartitions += Partitions.size(); 1115 MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca); 1116 1117 // Now build up the user lists for each of these disjoint partitions by 1118 // re-walking the recursive users of the alloca. 1119 Uses.resize(Partitions.size()); 1120 UseBuilder UB(TD, AI, *this); 1121 PtrI = UB.visitPtr(AI); 1122 assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); 1123 assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); 1124 1125 unsigned NumUses = 0; 1126 #if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) 1127 for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx) 1128 NumUses += Uses[Idx].size(); 1129 #endif 1130 NumAllocaPartitionUses += NumUses; 1131 MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca); 1132 } 1133 1134 Type *AllocaPartitioning::getCommonType(iterator I) const { 1135 Type *Ty = 0; 1136 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1137 Use *U = UI->getUse(); 1138 if (!U) 1139 continue; // Skip dead uses. 1140 if (isa<IntrinsicInst>(*U->getUser())) 1141 continue; 1142 if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) 1143 continue; 1144 1145 Type *UserTy = 0; 1146 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) 1147 UserTy = LI->getType(); 1148 else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) 1149 UserTy = SI->getValueOperand()->getType(); 1150 else 1151 return 0; // Bail if we have weird uses. 1152 1153 if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { 1154 // If the type is larger than the partition, skip it. We only encounter 1155 // this for split integer operations where we want to use the type of the 1156 // entity causing the split. 1157 if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) 1158 continue; 1159 1160 // If we have found an integer type use covering the alloca, use that 1161 // regardless of the other types, as integers are often used for a "bucket 1162 // of bits" type. 1163 return ITy; 1164 } 1165 1166 if (Ty && Ty != UserTy) 1167 return 0; 1168 1169 Ty = UserTy; 1170 } 1171 return Ty; 1172 } 1173 1174 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1175 1176 void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, 1177 StringRef Indent) const { 1178 OS << Indent << "partition #" << (I - begin()) 1179 << " [" << I->BeginOffset << "," << I->EndOffset << ")" 1180 << (I->IsSplittable ? " (splittable)" : "") 1181 << (Uses[I - begin()].empty() ? " (zero uses)" : "") 1182 << "\n"; 1183 } 1184 1185 void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, 1186 StringRef Indent) const { 1187 for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { 1188 if (!UI->getUse()) 1189 continue; // Skip dead uses. 1190 OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " 1191 << "used by: " << *UI->getUse()->getUser() << "\n"; 1192 if (MemTransferInst *II = 1193 dyn_cast<MemTransferInst>(UI->getUse()->getUser())) { 1194 const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); 1195 bool IsDest; 1196 if (!MTO.IsSplittable) 1197 IsDest = UI->BeginOffset == MTO.DestBegin; 1198 else 1199 IsDest = MTO.DestBegin != 0u; 1200 OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " 1201 << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) 1202 << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; 1203 } 1204 } 1205 } 1206 1207 void AllocaPartitioning::print(raw_ostream &OS) const { 1208 if (PointerEscapingInstr) { 1209 OS << "No partitioning for alloca: " << AI << "\n" 1210 << " A pointer to this alloca escaped by:\n" 1211 << " " << *PointerEscapingInstr << "\n"; 1212 return; 1213 } 1214 1215 OS << "Partitioning of alloca: " << AI << "\n"; 1216 for (const_iterator I = begin(), E = end(); I != E; ++I) { 1217 print(OS, I); 1218 printUsers(OS, I); 1219 } 1220 } 1221 1222 void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } 1223 void AllocaPartitioning::dump() const { print(dbgs()); } 1224 1225 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1226 1227 1228 namespace { 1229 /// \brief Implementation of LoadAndStorePromoter for promoting allocas. 1230 /// 1231 /// This subclass of LoadAndStorePromoter adds overrides to handle promoting 1232 /// the loads and stores of an alloca instruction, as well as updating its 1233 /// debug information. This is used when a domtree is unavailable and thus 1234 /// mem2reg in its full form can't be used to handle promotion of allocas to 1235 /// scalar values. 1236 class AllocaPromoter : public LoadAndStorePromoter { 1237 AllocaInst &AI; 1238 DIBuilder &DIB; 1239 1240 SmallVector<DbgDeclareInst *, 4> DDIs; 1241 SmallVector<DbgValueInst *, 4> DVIs; 1242 1243 public: 1244 AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, 1245 AllocaInst &AI, DIBuilder &DIB) 1246 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 1247 1248 void run(const SmallVectorImpl<Instruction*> &Insts) { 1249 // Remember which alloca we're promoting (for isInstInList). 1250 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 1251 for (Value::use_iterator UI = DebugNode->use_begin(), 1252 UE = DebugNode->use_end(); 1253 UI != UE; ++UI) 1254 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) 1255 DDIs.push_back(DDI); 1256 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) 1257 DVIs.push_back(DVI); 1258 } 1259 1260 LoadAndStorePromoter::run(Insts); 1261 AI.eraseFromParent(); 1262 while (!DDIs.empty()) 1263 DDIs.pop_back_val()->eraseFromParent(); 1264 while (!DVIs.empty()) 1265 DVIs.pop_back_val()->eraseFromParent(); 1266 } 1267 1268 virtual bool isInstInList(Instruction *I, 1269 const SmallVectorImpl<Instruction*> &Insts) const { 1270 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 1271 return LI->getOperand(0) == &AI; 1272 return cast<StoreInst>(I)->getPointerOperand() == &AI; 1273 } 1274 1275 virtual void updateDebugInfo(Instruction *Inst) const { 1276 for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 1277 E = DDIs.end(); I != E; ++I) { 1278 DbgDeclareInst *DDI = *I; 1279 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 1280 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 1281 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 1282 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 1283 } 1284 for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 1285 E = DVIs.end(); I != E; ++I) { 1286 DbgValueInst *DVI = *I; 1287 Value *Arg = 0; 1288 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 1289 // If an argument is zero extended then use argument directly. The ZExt 1290 // may be zapped by an optimization pass in future. 1291 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 1292 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 1293 if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 1294 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 1295 if (!Arg) 1296 Arg = SI->getOperand(0); 1297 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 1298 Arg = LI->getOperand(0); 1299 } else { 1300 continue; 1301 } 1302 Instruction *DbgVal = 1303 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 1304 Inst); 1305 DbgVal->setDebugLoc(DVI->getDebugLoc()); 1306 } 1307 } 1308 }; 1309 } // end anon namespace 1310 1311 1312 namespace { 1313 /// \brief An optimization pass providing Scalar Replacement of Aggregates. 1314 /// 1315 /// This pass takes allocations which can be completely analyzed (that is, they 1316 /// don't escape) and tries to turn them into scalar SSA values. There are 1317 /// a few steps to this process. 1318 /// 1319 /// 1) It takes allocations of aggregates and analyzes the ways in which they 1320 /// are used to try to split them into smaller allocations, ideally of 1321 /// a single scalar data type. It will split up memcpy and memset accesses 1322 /// as necessary and try to isolate individual scalar accesses. 1323 /// 2) It will transform accesses into forms which are suitable for SSA value 1324 /// promotion. This can be replacing a memset with a scalar store of an 1325 /// integer value, or it can involve speculating operations on a PHI or 1326 /// select to be a PHI or select of the results. 1327 /// 3) Finally, this will try to detect a pattern of accesses which map cleanly 1328 /// onto insert and extract operations on a vector value, and convert them to 1329 /// this form. By doing so, it will enable promotion of vector aggregates to 1330 /// SSA vector values. 1331 class SROA : public FunctionPass { 1332 const bool RequiresDomTree; 1333 1334 LLVMContext *C; 1335 const DataLayout *TD; 1336 DominatorTree *DT; 1337 1338 /// \brief Worklist of alloca instructions to simplify. 1339 /// 1340 /// Each alloca in the function is added to this. Each new alloca formed gets 1341 /// added to it as well to recursively simplify unless that alloca can be 1342 /// directly promoted. Finally, each time we rewrite a use of an alloca other 1343 /// the one being actively rewritten, we add it back onto the list if not 1344 /// already present to ensure it is re-visited. 1345 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 1346 1347 /// \brief A collection of instructions to delete. 1348 /// We try to batch deletions to simplify code and make things a bit more 1349 /// efficient. 1350 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 1351 1352 /// \brief Post-promotion worklist. 1353 /// 1354 /// Sometimes we discover an alloca which has a high probability of becoming 1355 /// viable for SROA after a round of promotion takes place. In those cases, 1356 /// the alloca is enqueued here for re-processing. 1357 /// 1358 /// Note that we have to be very careful to clear allocas out of this list in 1359 /// the event they are deleted. 1360 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 1361 1362 /// \brief A collection of alloca instructions we can directly promote. 1363 std::vector<AllocaInst *> PromotableAllocas; 1364 1365 public: 1366 SROA(bool RequiresDomTree = true) 1367 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 1368 C(0), TD(0), DT(0) { 1369 initializeSROAPass(*PassRegistry::getPassRegistry()); 1370 } 1371 bool runOnFunction(Function &F); 1372 void getAnalysisUsage(AnalysisUsage &AU) const; 1373 1374 const char *getPassName() const { return "SROA"; } 1375 static char ID; 1376 1377 private: 1378 friend class PHIOrSelectSpeculator; 1379 friend class AllocaPartitionRewriter; 1380 friend class AllocaPartitionVectorRewriter; 1381 1382 bool rewriteAllocaPartition(AllocaInst &AI, 1383 AllocaPartitioning &P, 1384 AllocaPartitioning::iterator PI); 1385 bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); 1386 bool runOnAlloca(AllocaInst &AI); 1387 void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); 1388 bool promoteAllocas(Function &F); 1389 }; 1390 } 1391 1392 char SROA::ID = 0; 1393 1394 FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 1395 return new SROA(RequiresDomTree); 1396 } 1397 1398 INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 1399 false, false) 1400 INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1401 INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 1402 false, false) 1403 1404 namespace { 1405 /// \brief Visitor to speculate PHIs and Selects where possible. 1406 class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { 1407 // Befriend the base class so it can delegate to private visit methods. 1408 friend class llvm::InstVisitor<PHIOrSelectSpeculator>; 1409 1410 const DataLayout &TD; 1411 AllocaPartitioning &P; 1412 SROA &Pass; 1413 1414 public: 1415 PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) 1416 : TD(TD), P(P), Pass(Pass) {} 1417 1418 /// \brief Visit the users of an alloca partition and rewrite them. 1419 void visitUsers(AllocaPartitioning::const_iterator PI) { 1420 // Note that we need to use an index here as the underlying vector of uses 1421 // may be grown during speculation. However, we never need to re-visit the 1422 // new uses, and so we can use the initial size bound. 1423 for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { 1424 const PartitionUse &PU = P.getUse(PI, Idx); 1425 if (!PU.getUse()) 1426 continue; // Skip dead use. 1427 1428 visit(cast<Instruction>(PU.getUse()->getUser())); 1429 } 1430 } 1431 1432 private: 1433 // By default, skip this instruction. 1434 void visitInstruction(Instruction &I) {} 1435 1436 /// PHI instructions that use an alloca and are subsequently loaded can be 1437 /// rewritten to load both input pointers in the pred blocks and then PHI the 1438 /// results, allowing the load of the alloca to be promoted. 1439 /// From this: 1440 /// %P2 = phi [i32* %Alloca, i32* %Other] 1441 /// %V = load i32* %P2 1442 /// to: 1443 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1444 /// ... 1445 /// %V2 = load i32* %Other 1446 /// ... 1447 /// %V = phi [i32 %V1, i32 %V2] 1448 /// 1449 /// We can do this to a select if its only uses are loads and if the operands 1450 /// to the select can be loaded unconditionally. 1451 /// 1452 /// FIXME: This should be hoisted into a generic utility, likely in 1453 /// Transforms/Util/Local.h 1454 bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { 1455 // For now, we can only do this promotion if the load is in the same block 1456 // as the PHI, and if there are no stores between the phi and load. 1457 // TODO: Allow recursive phi users. 1458 // TODO: Allow stores. 1459 BasicBlock *BB = PN.getParent(); 1460 unsigned MaxAlign = 0; 1461 for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); 1462 UI != UE; ++UI) { 1463 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1464 if (LI == 0 || !LI->isSimple()) return false; 1465 1466 // For now we only allow loads in the same block as the PHI. This is 1467 // a common case that happens when instcombine merges two loads through 1468 // a PHI. 1469 if (LI->getParent() != BB) return false; 1470 1471 // Ensure that there are no instructions between the PHI and the load that 1472 // could store. 1473 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 1474 if (BBI->mayWriteToMemory()) 1475 return false; 1476 1477 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 1478 Loads.push_back(LI); 1479 } 1480 1481 // We can only transform this if it is safe to push the loads into the 1482 // predecessor blocks. The only thing to watch out for is that we can't put 1483 // a possibly trapping load in the predecessor if it is a critical edge. 1484 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1485 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1486 Value *InVal = PN.getIncomingValue(Idx); 1487 1488 // If the value is produced by the terminator of the predecessor (an 1489 // invoke) or it has side-effects, there is no valid place to put a load 1490 // in the predecessor. 1491 if (TI == InVal || TI->mayHaveSideEffects()) 1492 return false; 1493 1494 // If the predecessor has a single successor, then the edge isn't 1495 // critical. 1496 if (TI->getNumSuccessors() == 1) 1497 continue; 1498 1499 // If this pointer is always safe to load, or if we can prove that there 1500 // is already a load in the block, then we can move the load to the pred 1501 // block. 1502 if (InVal->isDereferenceablePointer() || 1503 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) 1504 continue; 1505 1506 return false; 1507 } 1508 1509 return true; 1510 } 1511 1512 void visitPHINode(PHINode &PN) { 1513 DEBUG(dbgs() << " original: " << PN << "\n"); 1514 1515 SmallVector<LoadInst *, 4> Loads; 1516 if (!isSafePHIToSpeculate(PN, Loads)) 1517 return; 1518 1519 assert(!Loads.empty()); 1520 1521 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 1522 IRBuilderTy PHIBuilder(&PN); 1523 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1524 PN.getName() + ".sroa.speculated"); 1525 1526 // Get the TBAA tag and alignment to use from one of the loads. It doesn't 1527 // matter which one we get and if any differ. 1528 LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); 1529 MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); 1530 unsigned Align = SomeLoad->getAlignment(); 1531 1532 // Rewrite all loads of the PN to use the new PHI. 1533 do { 1534 LoadInst *LI = Loads.pop_back_val(); 1535 LI->replaceAllUsesWith(NewPN); 1536 Pass.DeadInsts.insert(LI); 1537 } while (!Loads.empty()); 1538 1539 // Inject loads into all of the pred blocks. 1540 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1541 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1542 TerminatorInst *TI = Pred->getTerminator(); 1543 Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); 1544 Value *InVal = PN.getIncomingValue(Idx); 1545 IRBuilderTy PredBuilder(TI); 1546 1547 LoadInst *Load 1548 = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + 1549 Pred->getName())); 1550 ++NumLoadsSpeculated; 1551 Load->setAlignment(Align); 1552 if (TBAATag) 1553 Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); 1554 NewPN->addIncoming(Load, Pred); 1555 1556 Instruction *Ptr = dyn_cast<Instruction>(InVal); 1557 if (!Ptr) 1558 // No uses to rewrite. 1559 continue; 1560 1561 // Try to lookup and rewrite any partition uses corresponding to this phi 1562 // input. 1563 AllocaPartitioning::iterator PI 1564 = P.findPartitionForPHIOrSelectOperand(InUse); 1565 if (PI == P.end()) 1566 continue; 1567 1568 // Replace the Use in the PartitionUse for this operand with the Use 1569 // inside the load. 1570 AllocaPartitioning::use_iterator UI 1571 = P.findPartitionUseForPHIOrSelectOperand(InUse); 1572 assert(isa<PHINode>(*UI->getUse()->getUser())); 1573 UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex())); 1574 } 1575 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1576 } 1577 1578 /// Select instructions that use an alloca and are subsequently loaded can be 1579 /// rewritten to load both input pointers and then select between the result, 1580 /// allowing the load of the alloca to be promoted. 1581 /// From this: 1582 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 1583 /// %V = load i32* %P2 1584 /// to: 1585 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1586 /// %V2 = load i32* %Other 1587 /// %V = select i1 %cond, i32 %V1, i32 %V2 1588 /// 1589 /// We can do this to a select if its only uses are loads and if the operand 1590 /// to the select can be loaded unconditionally. 1591 bool isSafeSelectToSpeculate(SelectInst &SI, 1592 SmallVectorImpl<LoadInst *> &Loads) { 1593 Value *TValue = SI.getTrueValue(); 1594 Value *FValue = SI.getFalseValue(); 1595 bool TDerefable = TValue->isDereferenceablePointer(); 1596 bool FDerefable = FValue->isDereferenceablePointer(); 1597 1598 for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); 1599 UI != UE; ++UI) { 1600 LoadInst *LI = dyn_cast<LoadInst>(*UI); 1601 if (LI == 0 || !LI->isSimple()) return false; 1602 1603 // Both operands to the select need to be dereferencable, either 1604 // absolutely (e.g. allocas) or at this point because we can see other 1605 // accesses to it. 1606 if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, 1607 LI->getAlignment(), &TD)) 1608 return false; 1609 if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, 1610 LI->getAlignment(), &TD)) 1611 return false; 1612 Loads.push_back(LI); 1613 } 1614 1615 return true; 1616 } 1617 1618 void visitSelectInst(SelectInst &SI) { 1619 DEBUG(dbgs() << " original: " << SI << "\n"); 1620 1621 // If the select isn't safe to speculate, just use simple logic to emit it. 1622 SmallVector<LoadInst *, 4> Loads; 1623 if (!isSafeSelectToSpeculate(SI, Loads)) 1624 return; 1625 1626 IRBuilderTy IRB(&SI); 1627 Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; 1628 AllocaPartitioning::iterator PIs[2]; 1629 PartitionUse PUs[2]; 1630 for (unsigned i = 0, e = 2; i != e; ++i) { 1631 PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); 1632 if (PIs[i] != P.end()) { 1633 // If the pointer is within the partitioning, remove the select from 1634 // its uses. We'll add in the new loads below. 1635 AllocaPartitioning::use_iterator UI 1636 = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); 1637 PUs[i] = *UI; 1638 // Clear out the use here so that the offsets into the use list remain 1639 // stable but this use is ignored when rewriting. 1640 UI->setUse(0); 1641 } 1642 } 1643 1644 Value *TV = SI.getTrueValue(); 1645 Value *FV = SI.getFalseValue(); 1646 // Replace the loads of the select with a select of two loads. 1647 while (!Loads.empty()) { 1648 LoadInst *LI = Loads.pop_back_val(); 1649 1650 IRB.SetInsertPoint(LI); 1651 LoadInst *TL = 1652 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 1653 LoadInst *FL = 1654 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 1655 NumLoadsSpeculated += 2; 1656 1657 // Transfer alignment and TBAA info if present. 1658 TL->setAlignment(LI->getAlignment()); 1659 FL->setAlignment(LI->getAlignment()); 1660 if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { 1661 TL->setMetadata(LLVMContext::MD_tbaa, Tag); 1662 FL->setMetadata(LLVMContext::MD_tbaa, Tag); 1663 } 1664 1665 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1666 LI->getName() + ".sroa.speculated"); 1667 1668 LoadInst *Loads[2] = { TL, FL }; 1669 for (unsigned i = 0, e = 2; i != e; ++i) { 1670 if (PIs[i] != P.end()) { 1671 Use *LoadUse = &Loads[i]->getOperandUse(0); 1672 assert(PUs[i].getUse()->get() == LoadUse->get()); 1673 PUs[i].setUse(LoadUse); 1674 P.use_push_back(PIs[i], PUs[i]); 1675 } 1676 } 1677 1678 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1679 LI->replaceAllUsesWith(V); 1680 Pass.DeadInsts.insert(LI); 1681 } 1682 } 1683 }; 1684 } 1685 1686 /// \brief Build a GEP out of a base pointer and indices. 1687 /// 1688 /// This will return the BasePtr if that is valid, or build a new GEP 1689 /// instruction using the IRBuilder if GEP-ing is needed. 1690 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, 1691 SmallVectorImpl<Value *> &Indices, 1692 const Twine &Prefix) { 1693 if (Indices.empty()) 1694 return BasePtr; 1695 1696 // A single zero index is a no-op, so check for this and avoid building a GEP 1697 // in that case. 1698 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1699 return BasePtr; 1700 1701 return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); 1702 } 1703 1704 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward 1705 /// TargetTy without changing the offset of the pointer. 1706 /// 1707 /// This routine assumes we've already established a properly offset GEP with 1708 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1709 /// zero-indices down through type layers until we find one the same as 1710 /// TargetTy. If we can't find one with the same type, we at least try to use 1711 /// one with the same size. If none of that works, we just produce the GEP as 1712 /// indicated by Indices to have the correct offset. 1713 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, 1714 Value *BasePtr, Type *Ty, Type *TargetTy, 1715 SmallVectorImpl<Value *> &Indices, 1716 const Twine &Prefix) { 1717 if (Ty == TargetTy) 1718 return buildGEP(IRB, BasePtr, Indices, Prefix); 1719 1720 // See if we can descend into a struct and locate a field with the correct 1721 // type. 1722 unsigned NumLayers = 0; 1723 Type *ElementTy = Ty; 1724 do { 1725 if (ElementTy->isPointerTy()) 1726 break; 1727 if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { 1728 ElementTy = SeqTy->getElementType(); 1729 // Note that we use the default address space as this index is over an 1730 // array or a vector, not a pointer. 1731 Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); 1732 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1733 if (STy->element_begin() == STy->element_end()) 1734 break; // Nothing left to descend into. 1735 ElementTy = *STy->element_begin(); 1736 Indices.push_back(IRB.getInt32(0)); 1737 } else { 1738 break; 1739 } 1740 ++NumLayers; 1741 } while (ElementTy != TargetTy); 1742 if (ElementTy != TargetTy) 1743 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1744 1745 return buildGEP(IRB, BasePtr, Indices, Prefix); 1746 } 1747 1748 /// \brief Recursively compute indices for a natural GEP. 1749 /// 1750 /// This is the recursive step for getNaturalGEPWithOffset that walks down the 1751 /// element types adding appropriate indices for the GEP. 1752 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, 1753 Value *Ptr, Type *Ty, APInt &Offset, 1754 Type *TargetTy, 1755 SmallVectorImpl<Value *> &Indices, 1756 const Twine &Prefix) { 1757 if (Offset == 0) 1758 return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); 1759 1760 // We can't recurse through pointer types. 1761 if (Ty->isPointerTy()) 1762 return 0; 1763 1764 // We try to analyze GEPs over vectors here, but note that these GEPs are 1765 // extremely poorly defined currently. The long-term goal is to remove GEPing 1766 // over a vector from the IR completely. 1767 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 1768 unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); 1769 if (ElementSizeInBits % 8) 1770 return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 1771 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 1772 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1773 if (NumSkippedElements.ugt(VecTy->getNumElements())) 1774 return 0; 1775 Offset -= NumSkippedElements * ElementSize; 1776 Indices.push_back(IRB.getInt(NumSkippedElements)); 1777 return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), 1778 Offset, TargetTy, Indices, Prefix); 1779 } 1780 1781 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 1782 Type *ElementTy = ArrTy->getElementType(); 1783 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1784 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1785 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 1786 return 0; 1787 1788 Offset -= NumSkippedElements * ElementSize; 1789 Indices.push_back(IRB.getInt(NumSkippedElements)); 1790 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1791 Indices, Prefix); 1792 } 1793 1794 StructType *STy = dyn_cast<StructType>(Ty); 1795 if (!STy) 1796 return 0; 1797 1798 const StructLayout *SL = TD.getStructLayout(STy); 1799 uint64_t StructOffset = Offset.getZExtValue(); 1800 if (StructOffset >= SL->getSizeInBytes()) 1801 return 0; 1802 unsigned Index = SL->getElementContainingOffset(StructOffset); 1803 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 1804 Type *ElementTy = STy->getElementType(Index); 1805 if (Offset.uge(TD.getTypeAllocSize(ElementTy))) 1806 return 0; // The offset points into alignment padding. 1807 1808 Indices.push_back(IRB.getInt32(Index)); 1809 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1810 Indices, Prefix); 1811 } 1812 1813 /// \brief Get a natural GEP from a base pointer to a particular offset and 1814 /// resulting in a particular type. 1815 /// 1816 /// The goal is to produce a "natural" looking GEP that works with the existing 1817 /// composite types to arrive at the appropriate offset and element type for 1818 /// a pointer. TargetTy is the element type the returned GEP should point-to if 1819 /// possible. We recurse by decreasing Offset, adding the appropriate index to 1820 /// Indices, and setting Ty to the result subtype. 1821 /// 1822 /// If no natural GEP can be constructed, this function returns null. 1823 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, 1824 Value *Ptr, APInt Offset, Type *TargetTy, 1825 SmallVectorImpl<Value *> &Indices, 1826 const Twine &Prefix) { 1827 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1828 1829 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1830 // an i8. 1831 if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) 1832 return 0; 1833 1834 Type *ElementTy = Ty->getElementType(); 1835 if (!ElementTy->isSized()) 1836 return 0; // We can't GEP through an unsized element. 1837 APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); 1838 if (ElementSize == 0) 1839 return 0; // Zero-length arrays can't help us build a natural GEP. 1840 APInt NumSkippedElements = Offset.sdiv(ElementSize); 1841 1842 Offset -= NumSkippedElements * ElementSize; 1843 Indices.push_back(IRB.getInt(NumSkippedElements)); 1844 return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, 1845 Indices, Prefix); 1846 } 1847 1848 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 1849 /// resulting pointer has PointerTy. 1850 /// 1851 /// This tries very hard to compute a "natural" GEP which arrives at the offset 1852 /// and produces the pointer type desired. Where it cannot, it will try to use 1853 /// the natural GEP to arrive at the offset and bitcast to the type. Where that 1854 /// fails, it will try to use an existing i8* and GEP to the byte offset and 1855 /// bitcast to the type. 1856 /// 1857 /// The strategy for finding the more natural GEPs is to peel off layers of the 1858 /// pointer, walking back through bit casts and GEPs, searching for a base 1859 /// pointer from which we can compute a natural GEP with the desired 1860 /// properties. The algorithm tries to fold as many constant indices into 1861 /// a single GEP as possible, thus making each GEP more independent of the 1862 /// surrounding code. 1863 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, 1864 Value *Ptr, APInt Offset, Type *PointerTy, 1865 const Twine &Prefix) { 1866 // Even though we don't look through PHI nodes, we could be called on an 1867 // instruction in an unreachable block, which may be on a cycle. 1868 SmallPtrSet<Value *, 4> Visited; 1869 Visited.insert(Ptr); 1870 SmallVector<Value *, 4> Indices; 1871 1872 // We may end up computing an offset pointer that has the wrong type. If we 1873 // never are able to compute one directly that has the correct type, we'll 1874 // fall back to it, so keep it around here. 1875 Value *OffsetPtr = 0; 1876 1877 // Remember any i8 pointer we come across to re-use if we need to do a raw 1878 // byte offset. 1879 Value *Int8Ptr = 0; 1880 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 1881 1882 Type *TargetTy = PointerTy->getPointerElementType(); 1883 1884 do { 1885 // First fold any existing GEPs into the offset. 1886 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1887 APInt GEPOffset(Offset.getBitWidth(), 0); 1888 if (!GEP->accumulateConstantOffset(TD, GEPOffset)) 1889 break; 1890 Offset += GEPOffset; 1891 Ptr = GEP->getPointerOperand(); 1892 if (!Visited.insert(Ptr)) 1893 break; 1894 } 1895 1896 // See if we can perform a natural GEP here. 1897 Indices.clear(); 1898 if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, 1899 Indices, Prefix)) { 1900 if (P->getType() == PointerTy) { 1901 // Zap any offset pointer that we ended up computing in previous rounds. 1902 if (OffsetPtr && OffsetPtr->use_empty()) 1903 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 1904 I->eraseFromParent(); 1905 return P; 1906 } 1907 if (!OffsetPtr) { 1908 OffsetPtr = P; 1909 } 1910 } 1911 1912 // Stash this pointer if we've found an i8*. 1913 if (Ptr->getType()->isIntegerTy(8)) { 1914 Int8Ptr = Ptr; 1915 Int8PtrOffset = Offset; 1916 } 1917 1918 // Peel off a layer of the pointer and update the offset appropriately. 1919 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1920 Ptr = cast<Operator>(Ptr)->getOperand(0); 1921 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1922 if (GA->mayBeOverridden()) 1923 break; 1924 Ptr = GA->getAliasee(); 1925 } else { 1926 break; 1927 } 1928 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1929 } while (Visited.insert(Ptr)); 1930 1931 if (!OffsetPtr) { 1932 if (!Int8Ptr) { 1933 Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), 1934 Prefix + ".raw_cast"); 1935 Int8PtrOffset = Offset; 1936 } 1937 1938 OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : 1939 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 1940 Prefix + ".raw_idx"); 1941 } 1942 Ptr = OffsetPtr; 1943 1944 // On the off chance we were targeting i8*, guard the bitcast here. 1945 if (Ptr->getType() != PointerTy) 1946 Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); 1947 1948 return Ptr; 1949 } 1950 1951 /// \brief Test whether we can convert a value from the old to the new type. 1952 /// 1953 /// This predicate should be used to guard calls to convertValue in order to 1954 /// ensure that we only try to convert viable values. The strategy is that we 1955 /// will peel off single element struct and array wrappings to get to an 1956 /// underlying value, and convert that value. 1957 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1958 if (OldTy == NewTy) 1959 return true; 1960 if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) 1961 if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) 1962 if (NewITy->getBitWidth() >= OldITy->getBitWidth()) 1963 return true; 1964 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 1965 return false; 1966 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1967 return false; 1968 1969 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1970 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 1971 return true; 1972 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 1973 return true; 1974 return false; 1975 } 1976 1977 return true; 1978 } 1979 1980 /// \brief Generic routine to convert an SSA value to a value of a different 1981 /// type. 1982 /// 1983 /// This will try various different casting techniques, such as bitcasts, 1984 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1985 /// two types for viability with this routine. 1986 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 1987 Type *Ty) { 1988 assert(canConvertValue(DL, V->getType(), Ty) && 1989 "Value not convertable to type"); 1990 if (V->getType() == Ty) 1991 return V; 1992 if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType())) 1993 if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty)) 1994 if (NewITy->getBitWidth() > OldITy->getBitWidth()) 1995 return IRB.CreateZExt(V, NewITy); 1996 if (V->getType()->isIntegerTy() && Ty->isPointerTy()) 1997 return IRB.CreateIntToPtr(V, Ty); 1998 if (V->getType()->isPointerTy() && Ty->isIntegerTy()) 1999 return IRB.CreatePtrToInt(V, Ty); 2000 2001 return IRB.CreateBitCast(V, Ty); 2002 } 2003 2004 /// \brief Test whether the given alloca partition can be promoted to a vector. 2005 /// 2006 /// This is a quick test to check whether we can rewrite a particular alloca 2007 /// partition (and its newly formed alloca) into a vector alloca with only 2008 /// whole-vector loads and stores such that it could be promoted to a vector 2009 /// SSA value. We only can ensure this for a limited set of operations, and we 2010 /// don't want to do the rewrites unless we are confident that the result will 2011 /// be promotable, so we have an early test here. 2012 static bool isVectorPromotionViable(const DataLayout &TD, 2013 Type *AllocaTy, 2014 AllocaPartitioning &P, 2015 uint64_t PartitionBeginOffset, 2016 uint64_t PartitionEndOffset, 2017 AllocaPartitioning::const_use_iterator I, 2018 AllocaPartitioning::const_use_iterator E) { 2019 VectorType *Ty = dyn_cast<VectorType>(AllocaTy); 2020 if (!Ty) 2021 return false; 2022 2023 uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); 2024 2025 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2026 // that aren't byte sized. 2027 if (ElementSize % 8) 2028 return false; 2029 assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && 2030 "vector size not a multiple of element size?"); 2031 ElementSize /= 8; 2032 2033 for (; I != E; ++I) { 2034 Use *U = I->getUse(); 2035 if (!U) 2036 continue; // Skip dead use. 2037 2038 uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; 2039 uint64_t BeginIndex = BeginOffset / ElementSize; 2040 if (BeginIndex * ElementSize != BeginOffset || 2041 BeginIndex >= Ty->getNumElements()) 2042 return false; 2043 uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; 2044 uint64_t EndIndex = EndOffset / ElementSize; 2045 if (EndIndex * ElementSize != EndOffset || 2046 EndIndex > Ty->getNumElements()) 2047 return false; 2048 2049 assert(EndIndex > BeginIndex && "Empty vector!"); 2050 uint64_t NumElements = EndIndex - BeginIndex; 2051 Type *PartitionTy 2052 = (NumElements == 1) ? Ty->getElementType() 2053 : VectorType::get(Ty->getElementType(), NumElements); 2054 2055 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2056 if (MI->isVolatile()) 2057 return false; 2058 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { 2059 const AllocaPartitioning::MemTransferOffsets &MTO 2060 = P.getMemTransferOffsets(*MTI); 2061 if (!MTO.IsSplittable) 2062 return false; 2063 } 2064 } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { 2065 // Disable vector promotion when there are loads or stores of an FCA. 2066 return false; 2067 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2068 if (LI->isVolatile()) 2069 return false; 2070 if (!canConvertValue(TD, PartitionTy, LI->getType())) 2071 return false; 2072 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2073 if (SI->isVolatile()) 2074 return false; 2075 if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) 2076 return false; 2077 } else { 2078 return false; 2079 } 2080 } 2081 return true; 2082 } 2083 2084 /// \brief Test whether the given alloca partition's integer operations can be 2085 /// widened to promotable ones. 2086 /// 2087 /// This is a quick test to check whether we can rewrite the integer loads and 2088 /// stores to a particular alloca into wider loads and stores and be able to 2089 /// promote the resulting alloca. 2090 static bool isIntegerWideningViable(const DataLayout &TD, 2091 Type *AllocaTy, 2092 uint64_t AllocBeginOffset, 2093 AllocaPartitioning &P, 2094 AllocaPartitioning::const_use_iterator I, 2095 AllocaPartitioning::const_use_iterator E) { 2096 uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); 2097 // Don't create integer types larger than the maximum bitwidth. 2098 if (SizeInBits > IntegerType::MAX_INT_BITS) 2099 return false; 2100 2101 // Don't try to handle allocas with bit-padding. 2102 if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) 2103 return false; 2104 2105 // We need to ensure that an integer type with the appropriate bitwidth can 2106 // be converted to the alloca type, whatever that is. We don't want to force 2107 // the alloca itself to have an integer type if there is a more suitable one. 2108 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2109 if (!canConvertValue(TD, AllocaTy, IntTy) || 2110 !canConvertValue(TD, IntTy, AllocaTy)) 2111 return false; 2112 2113 uint64_t Size = TD.getTypeStoreSize(AllocaTy); 2114 2115 // Check the uses to ensure the uses are (likely) promotable integer uses. 2116 // Also ensure that the alloca has a covering load or store. We don't want 2117 // to widen the integer operations only to fail to promote due to some other 2118 // unsplittable entry (which we may make splittable later). 2119 bool WholeAllocaOp = false; 2120 for (; I != E; ++I) { 2121 Use *U = I->getUse(); 2122 if (!U) 2123 continue; // Skip dead use. 2124 2125 uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; 2126 uint64_t RelEnd = I->EndOffset - AllocBeginOffset; 2127 2128 // We can't reasonably handle cases where the load or store extends past 2129 // the end of the aloca's type and into its padding. 2130 if (RelEnd > Size) 2131 return false; 2132 2133 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2134 if (LI->isVolatile()) 2135 return false; 2136 if (RelBegin == 0 && RelEnd == Size) 2137 WholeAllocaOp = true; 2138 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2139 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2140 return false; 2141 continue; 2142 } 2143 // Non-integer loads need to be convertible from the alloca type so that 2144 // they are promotable. 2145 if (RelBegin != 0 || RelEnd != Size || 2146 !canConvertValue(TD, AllocaTy, LI->getType())) 2147 return false; 2148 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2149 Type *ValueTy = SI->getValueOperand()->getType(); 2150 if (SI->isVolatile()) 2151 return false; 2152 if (RelBegin == 0 && RelEnd == Size) 2153 WholeAllocaOp = true; 2154 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2155 if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) 2156 return false; 2157 continue; 2158 } 2159 // Non-integer stores need to be convertible to the alloca type so that 2160 // they are promotable. 2161 if (RelBegin != 0 || RelEnd != Size || 2162 !canConvertValue(TD, ValueTy, AllocaTy)) 2163 return false; 2164 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2165 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 2166 return false; 2167 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { 2168 const AllocaPartitioning::MemTransferOffsets &MTO 2169 = P.getMemTransferOffsets(*MTI); 2170 if (!MTO.IsSplittable) 2171 return false; 2172 } 2173 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 2174 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 2175 II->getIntrinsicID() != Intrinsic::lifetime_end) 2176 return false; 2177 } else { 2178 return false; 2179 } 2180 } 2181 return WholeAllocaOp; 2182 } 2183 2184 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 2185 IntegerType *Ty, uint64_t Offset, 2186 const Twine &Name) { 2187 DEBUG(dbgs() << " start: " << *V << "\n"); 2188 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2189 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2190 "Element extends past full value"); 2191 uint64_t ShAmt = 8*Offset; 2192 if (DL.isBigEndian()) 2193 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2194 if (ShAmt) { 2195 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2196 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2197 } 2198 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2199 "Cannot extract to a larger integer!"); 2200 if (Ty != IntTy) { 2201 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2202 DEBUG(dbgs() << " trunced: " << *V << "\n"); 2203 } 2204 return V; 2205 } 2206 2207 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, 2208 Value *V, uint64_t Offset, const Twine &Name) { 2209 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2210 IntegerType *Ty = cast<IntegerType>(V->getType()); 2211 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2212 "Cannot insert a larger integer!"); 2213 DEBUG(dbgs() << " start: " << *V << "\n"); 2214 if (Ty != IntTy) { 2215 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2216 DEBUG(dbgs() << " extended: " << *V << "\n"); 2217 } 2218 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 2219 "Element store outside of alloca store"); 2220 uint64_t ShAmt = 8*Offset; 2221 if (DL.isBigEndian()) 2222 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 2223 if (ShAmt) { 2224 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2225 DEBUG(dbgs() << " shifted: " << *V << "\n"); 2226 } 2227 2228 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2229 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2230 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2231 DEBUG(dbgs() << " masked: " << *Old << "\n"); 2232 V = IRB.CreateOr(Old, V, Name + ".insert"); 2233 DEBUG(dbgs() << " inserted: " << *V << "\n"); 2234 } 2235 return V; 2236 } 2237 2238 static Value *extractVector(IRBuilderTy &IRB, Value *V, 2239 unsigned BeginIndex, unsigned EndIndex, 2240 const Twine &Name) { 2241 VectorType *VecTy = cast<VectorType>(V->getType()); 2242 unsigned NumElements = EndIndex - BeginIndex; 2243 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2244 2245 if (NumElements == VecTy->getNumElements()) 2246 return V; 2247 2248 if (NumElements == 1) { 2249 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2250 Name + ".extract"); 2251 DEBUG(dbgs() << " extract: " << *V << "\n"); 2252 return V; 2253 } 2254 2255 SmallVector<Constant*, 8> Mask; 2256 Mask.reserve(NumElements); 2257 for (unsigned i = BeginIndex; i != EndIndex; ++i) 2258 Mask.push_back(IRB.getInt32(i)); 2259 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2260 ConstantVector::get(Mask), 2261 Name + ".extract"); 2262 DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2263 return V; 2264 } 2265 2266 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, 2267 unsigned BeginIndex, const Twine &Name) { 2268 VectorType *VecTy = cast<VectorType>(Old->getType()); 2269 assert(VecTy && "Can only insert a vector into a vector"); 2270 2271 VectorType *Ty = dyn_cast<VectorType>(V->getType()); 2272 if (!Ty) { 2273 // Single element to insert. 2274 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), 2275 Name + ".insert"); 2276 DEBUG(dbgs() << " insert: " << *V << "\n"); 2277 return V; 2278 } 2279 2280 assert(Ty->getNumElements() <= VecTy->getNumElements() && 2281 "Too many elements!"); 2282 if (Ty->getNumElements() == VecTy->getNumElements()) { 2283 assert(V->getType() == VecTy && "Vector type mismatch"); 2284 return V; 2285 } 2286 unsigned EndIndex = BeginIndex + Ty->getNumElements(); 2287 2288 // When inserting a smaller vector into the larger to store, we first 2289 // use a shuffle vector to widen it with undef elements, and then 2290 // a second shuffle vector to select between the loaded vector and the 2291 // incoming vector. 2292 SmallVector<Constant*, 8> Mask; 2293 Mask.reserve(VecTy->getNumElements()); 2294 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2295 if (i >= BeginIndex && i < EndIndex) 2296 Mask.push_back(IRB.getInt32(i - BeginIndex)); 2297 else 2298 Mask.push_back(UndefValue::get(IRB.getInt32Ty())); 2299 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), 2300 ConstantVector::get(Mask), 2301 Name + ".expand"); 2302 DEBUG(dbgs() << " shuffle1: " << *V << "\n"); 2303 2304 Mask.clear(); 2305 for (unsigned i = 0; i != VecTy->getNumElements(); ++i) 2306 if (i >= BeginIndex && i < EndIndex) 2307 Mask.push_back(IRB.getInt32(i)); 2308 else 2309 Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); 2310 V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), 2311 Name + "insert"); 2312 DEBUG(dbgs() << " shuffle2: " << *V << "\n"); 2313 return V; 2314 } 2315 2316 namespace { 2317 /// \brief Visitor to rewrite instructions using a partition of an alloca to 2318 /// use a new alloca. 2319 /// 2320 /// Also implements the rewriting to vector-based accesses when the partition 2321 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2322 /// lives here. 2323 class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, 2324 bool> { 2325 // Befriend the base class so it can delegate to private visit methods. 2326 friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; 2327 2328 const DataLayout &TD; 2329 AllocaPartitioning &P; 2330 SROA &Pass; 2331 AllocaInst &OldAI, &NewAI; 2332 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2333 Type *NewAllocaTy; 2334 2335 // If we are rewriting an alloca partition which can be written as pure 2336 // vector operations, we stash extra information here. When VecTy is 2337 // non-null, we have some strict guarantees about the rewritten alloca: 2338 // - The new alloca is exactly the size of the vector type here. 2339 // - The accesses all either map to the entire vector or to a single 2340 // element. 2341 // - The set of accessing instructions is only one of those handled above 2342 // in isVectorPromotionViable. Generally these are the same access kinds 2343 // which are promotable via mem2reg. 2344 VectorType *VecTy; 2345 Type *ElementTy; 2346 uint64_t ElementSize; 2347 2348 // This is a convenience and flag variable that will be null unless the new 2349 // alloca's integer operations should be widened to this integer type due to 2350 // passing isIntegerWideningViable above. If it is non-null, the desired 2351 // integer type will be stored here for easy access during rewriting. 2352 IntegerType *IntTy; 2353 2354 // The offset of the partition user currently being rewritten. 2355 uint64_t BeginOffset, EndOffset; 2356 bool IsSplit; 2357 Use *OldUse; 2358 Instruction *OldPtr; 2359 2360 // The name prefix to use when rewriting instructions for this alloca. 2361 std::string NamePrefix; 2362 2363 public: 2364 AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, 2365 AllocaPartitioning::iterator PI, 2366 SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, 2367 uint64_t NewBeginOffset, uint64_t NewEndOffset) 2368 : TD(TD), P(P), Pass(Pass), 2369 OldAI(OldAI), NewAI(NewAI), 2370 NewAllocaBeginOffset(NewBeginOffset), 2371 NewAllocaEndOffset(NewEndOffset), 2372 NewAllocaTy(NewAI.getAllocatedType()), 2373 VecTy(), ElementTy(), ElementSize(), IntTy(), 2374 BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr() { 2375 } 2376 2377 /// \brief Visit the users of the alloca partition and rewrite them. 2378 bool visitUsers(AllocaPartitioning::const_use_iterator I, 2379 AllocaPartitioning::const_use_iterator E) { 2380 if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, 2381 NewAllocaBeginOffset, NewAllocaEndOffset, 2382 I, E)) { 2383 ++NumVectorized; 2384 VecTy = cast<VectorType>(NewAI.getAllocatedType()); 2385 ElementTy = VecTy->getElementType(); 2386 assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && 2387 "Only multiple-of-8 sized vector elements are viable"); 2388 ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; 2389 } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), 2390 NewAllocaBeginOffset, P, I, E)) { 2391 IntTy = Type::getIntNTy(NewAI.getContext(), 2392 TD.getTypeSizeInBits(NewAI.getAllocatedType())); 2393 } 2394 bool CanSROA = true; 2395 for (; I != E; ++I) { 2396 if (!I->getUse()) 2397 continue; // Skip dead uses. 2398 BeginOffset = I->BeginOffset; 2399 EndOffset = I->EndOffset; 2400 IsSplit = I->isSplit(); 2401 OldUse = I->getUse(); 2402 OldPtr = cast<Instruction>(OldUse->get()); 2403 NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); 2404 CanSROA &= visit(cast<Instruction>(OldUse->getUser())); 2405 } 2406 if (VecTy) { 2407 assert(CanSROA); 2408 VecTy = 0; 2409 ElementTy = 0; 2410 ElementSize = 0; 2411 } 2412 if (IntTy) { 2413 assert(CanSROA); 2414 IntTy = 0; 2415 } 2416 return CanSROA; 2417 } 2418 2419 private: 2420 // Every instruction which can end up as a user must have a rewrite rule. 2421 bool visitInstruction(Instruction &I) { 2422 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2423 llvm_unreachable("No rewrite rule for this instruction!"); 2424 } 2425 2426 Twine getName(const Twine &Suffix) { 2427 return NamePrefix + Suffix; 2428 } 2429 2430 Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) { 2431 assert(BeginOffset >= NewAllocaBeginOffset); 2432 APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); 2433 return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); 2434 } 2435 2436 /// \brief Compute suitable alignment to access an offset into the new alloca. 2437 unsigned getOffsetAlign(uint64_t Offset) { 2438 unsigned NewAIAlign = NewAI.getAlignment(); 2439 if (!NewAIAlign) 2440 NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); 2441 return MinAlign(NewAIAlign, Offset); 2442 } 2443 2444 /// \brief Compute suitable alignment to access this partition of the new 2445 /// alloca. 2446 unsigned getPartitionAlign() { 2447 return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); 2448 } 2449 2450 /// \brief Compute suitable alignment to access a type at an offset of the 2451 /// new alloca. 2452 /// 2453 /// \returns zero if the type's ABI alignment is a suitable alignment, 2454 /// otherwise returns the maximal suitable alignment. 2455 unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { 2456 unsigned Align = getOffsetAlign(Offset); 2457 return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; 2458 } 2459 2460 /// \brief Compute suitable alignment to access a type at the beginning of 2461 /// this partition of the new alloca. 2462 /// 2463 /// See \c getOffsetTypeAlign for details; this routine delegates to it. 2464 unsigned getPartitionTypeAlign(Type *Ty) { 2465 return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); 2466 } 2467 2468 unsigned getIndex(uint64_t Offset) { 2469 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2470 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2471 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2472 uint32_t Index = RelOffset / ElementSize; 2473 assert(Index * ElementSize == RelOffset); 2474 return Index; 2475 } 2476 2477 void deleteIfTriviallyDead(Value *V) { 2478 Instruction *I = cast<Instruction>(V); 2479 if (isInstructionTriviallyDead(I)) 2480 Pass.DeadInsts.insert(I); 2481 } 2482 2483 Value *rewriteVectorizedLoadInst(IRBuilderTy &IRB) { 2484 unsigned BeginIndex = getIndex(BeginOffset); 2485 unsigned EndIndex = getIndex(EndOffset); 2486 assert(EndIndex > BeginIndex && "Empty vector!"); 2487 2488 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2489 getName(".load")); 2490 return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); 2491 } 2492 2493 Value *rewriteIntegerLoad(IRBuilderTy &IRB, LoadInst &LI) { 2494 assert(IntTy && "We cannot insert an integer to the alloca"); 2495 assert(!LI.isVolatile()); 2496 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2497 getName(".load")); 2498 V = convertValue(TD, IRB, V, IntTy); 2499 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2500 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2501 if (Offset > 0 || EndOffset < NewAllocaEndOffset) 2502 V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, 2503 getName(".extract")); 2504 return V; 2505 } 2506 2507 bool visitLoadInst(LoadInst &LI) { 2508 DEBUG(dbgs() << " original: " << LI << "\n"); 2509 Value *OldOp = LI.getOperand(0); 2510 assert(OldOp == OldPtr); 2511 2512 uint64_t Size = EndOffset - BeginOffset; 2513 2514 IRBuilderTy IRB(&LI); 2515 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8) 2516 : LI.getType(); 2517 bool IsPtrAdjusted = false; 2518 Value *V; 2519 if (VecTy) { 2520 V = rewriteVectorizedLoadInst(IRB); 2521 } else if (IntTy && LI.getType()->isIntegerTy()) { 2522 V = rewriteIntegerLoad(IRB, LI); 2523 } else if (BeginOffset == NewAllocaBeginOffset && 2524 canConvertValue(TD, NewAllocaTy, LI.getType())) { 2525 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2526 LI.isVolatile(), getName(".load")); 2527 } else { 2528 Type *LTy = TargetTy->getPointerTo(); 2529 V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), 2530 getPartitionTypeAlign(TargetTy), 2531 LI.isVolatile(), getName(".load")); 2532 IsPtrAdjusted = true; 2533 } 2534 V = convertValue(TD, IRB, V, TargetTy); 2535 2536 if (IsSplit) { 2537 assert(!LI.isVolatile()); 2538 assert(LI.getType()->isIntegerTy() && 2539 "Only integer type loads and stores are split"); 2540 assert(Size < TD.getTypeStoreSize(LI.getType()) && 2541 "Split load isn't smaller than original load"); 2542 assert(LI.getType()->getIntegerBitWidth() == 2543 TD.getTypeStoreSizeInBits(LI.getType()) && 2544 "Non-byte-multiple bit width"); 2545 // Move the insertion point just past the load so that we can refer to it. 2546 IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); 2547 // Create a placeholder value with the same type as LI to use as the 2548 // basis for the new value. This allows us to replace the uses of LI with 2549 // the computed value, and then replace the placeholder with LI, leaving 2550 // LI only used for this computation. 2551 Value *Placeholder 2552 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 2553 V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, 2554 getName(".insert")); 2555 LI.replaceAllUsesWith(V); 2556 Placeholder->replaceAllUsesWith(&LI); 2557 delete Placeholder; 2558 } else { 2559 LI.replaceAllUsesWith(V); 2560 } 2561 2562 Pass.DeadInsts.insert(&LI); 2563 deleteIfTriviallyDead(OldOp); 2564 DEBUG(dbgs() << " to: " << *V << "\n"); 2565 return !LI.isVolatile() && !IsPtrAdjusted; 2566 } 2567 2568 bool rewriteVectorizedStoreInst(IRBuilderTy &IRB, Value *V, 2569 StoreInst &SI, Value *OldOp) { 2570 unsigned BeginIndex = getIndex(BeginOffset); 2571 unsigned EndIndex = getIndex(EndOffset); 2572 assert(EndIndex > BeginIndex && "Empty vector!"); 2573 unsigned NumElements = EndIndex - BeginIndex; 2574 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2575 Type *PartitionTy 2576 = (NumElements == 1) ? ElementTy 2577 : VectorType::get(ElementTy, NumElements); 2578 if (V->getType() != PartitionTy) 2579 V = convertValue(TD, IRB, V, PartitionTy); 2580 2581 // Mix in the existing elements. 2582 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2583 getName(".load")); 2584 V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); 2585 2586 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2587 Pass.DeadInsts.insert(&SI); 2588 2589 (void)Store; 2590 DEBUG(dbgs() << " to: " << *Store << "\n"); 2591 return true; 2592 } 2593 2594 bool rewriteIntegerStore(IRBuilderTy &IRB, Value *V, StoreInst &SI) { 2595 assert(IntTy && "We cannot extract an integer from the alloca"); 2596 assert(!SI.isVolatile()); 2597 if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 2598 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2599 getName(".oldload")); 2600 Old = convertValue(TD, IRB, Old, IntTy); 2601 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2602 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2603 V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, 2604 getName(".insert")); 2605 } 2606 V = convertValue(TD, IRB, V, NewAllocaTy); 2607 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 2608 Pass.DeadInsts.insert(&SI); 2609 (void)Store; 2610 DEBUG(dbgs() << " to: " << *Store << "\n"); 2611 return true; 2612 } 2613 2614 bool visitStoreInst(StoreInst &SI) { 2615 DEBUG(dbgs() << " original: " << SI << "\n"); 2616 Value *OldOp = SI.getOperand(1); 2617 assert(OldOp == OldPtr); 2618 IRBuilderTy IRB(&SI); 2619 2620 Value *V = SI.getValueOperand(); 2621 2622 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2623 // alloca that should be re-examined after promoting this alloca. 2624 if (V->getType()->isPointerTy()) 2625 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2626 Pass.PostPromotionWorklist.insert(AI); 2627 2628 uint64_t Size = EndOffset - BeginOffset; 2629 if (Size < TD.getTypeStoreSize(V->getType())) { 2630 assert(!SI.isVolatile()); 2631 assert(IsSplit && "A seemingly split store isn't splittable"); 2632 assert(V->getType()->isIntegerTy() && 2633 "Only integer type loads and stores are split"); 2634 assert(V->getType()->getIntegerBitWidth() == 2635 TD.getTypeStoreSizeInBits(V->getType()) && 2636 "Non-byte-multiple bit width"); 2637 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); 2638 V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, 2639 getName(".extract")); 2640 } 2641 2642 if (VecTy) 2643 return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); 2644 if (IntTy && V->getType()->isIntegerTy()) 2645 return rewriteIntegerStore(IRB, V, SI); 2646 2647 StoreInst *NewSI; 2648 if (BeginOffset == NewAllocaBeginOffset && 2649 canConvertValue(TD, V->getType(), NewAllocaTy)) { 2650 V = convertValue(TD, IRB, V, NewAllocaTy); 2651 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2652 SI.isVolatile()); 2653 } else { 2654 Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); 2655 NewSI = IRB.CreateAlignedStore(V, NewPtr, 2656 getPartitionTypeAlign(V->getType()), 2657 SI.isVolatile()); 2658 } 2659 (void)NewSI; 2660 Pass.DeadInsts.insert(&SI); 2661 deleteIfTriviallyDead(OldOp); 2662 2663 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 2664 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 2665 } 2666 2667 /// \brief Compute an integer value from splatting an i8 across the given 2668 /// number of bytes. 2669 /// 2670 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 2671 /// call this routine. 2672 /// FIXME: Heed the advice above. 2673 /// 2674 /// \param V The i8 value to splat. 2675 /// \param Size The number of bytes in the output (assuming i8 is one byte) 2676 Value *getIntegerSplat(IRBuilderTy &IRB, Value *V, unsigned Size) { 2677 assert(Size > 0 && "Expected a positive number of bytes."); 2678 IntegerType *VTy = cast<IntegerType>(V->getType()); 2679 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 2680 if (Size == 1) 2681 return V; 2682 2683 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 2684 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), 2685 ConstantExpr::getUDiv( 2686 Constant::getAllOnesValue(SplatIntTy), 2687 ConstantExpr::getZExt( 2688 Constant::getAllOnesValue(V->getType()), 2689 SplatIntTy)), 2690 getName(".isplat")); 2691 return V; 2692 } 2693 2694 /// \brief Compute a vector splat for a given element value. 2695 Value *getVectorSplat(IRBuilderTy &IRB, Value *V, unsigned NumElements) { 2696 V = IRB.CreateVectorSplat(NumElements, V, NamePrefix); 2697 DEBUG(dbgs() << " splat: " << *V << "\n"); 2698 return V; 2699 } 2700 2701 bool visitMemSetInst(MemSetInst &II) { 2702 DEBUG(dbgs() << " original: " << II << "\n"); 2703 IRBuilderTy IRB(&II); 2704 assert(II.getRawDest() == OldPtr); 2705 2706 // If the memset has a variable size, it cannot be split, just adjust the 2707 // pointer to the new alloca. 2708 if (!isa<Constant>(II.getLength())) { 2709 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2710 Type *CstTy = II.getAlignmentCst()->getType(); 2711 II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); 2712 2713 deleteIfTriviallyDead(OldPtr); 2714 return false; 2715 } 2716 2717 // Record this instruction for deletion. 2718 Pass.DeadInsts.insert(&II); 2719 2720 Type *AllocaTy = NewAI.getAllocatedType(); 2721 Type *ScalarTy = AllocaTy->getScalarType(); 2722 2723 // If this doesn't map cleanly onto the alloca type, and that type isn't 2724 // a single value type, just emit a memset. 2725 if (!VecTy && !IntTy && 2726 (BeginOffset != NewAllocaBeginOffset || 2727 EndOffset != NewAllocaEndOffset || 2728 !AllocaTy->isSingleValueType() || 2729 !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || 2730 TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { 2731 Type *SizeTy = II.getLength()->getType(); 2732 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2733 CallInst *New 2734 = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, 2735 II.getRawDest()->getType()), 2736 II.getValue(), Size, getPartitionAlign(), 2737 II.isVolatile()); 2738 (void)New; 2739 DEBUG(dbgs() << " to: " << *New << "\n"); 2740 return false; 2741 } 2742 2743 // If we can represent this as a simple value, we have to build the actual 2744 // value to store, which requires expanding the byte present in memset to 2745 // a sensible representation for the alloca type. This is essentially 2746 // splatting the byte to a sufficiently wide integer, splatting it across 2747 // any desired vector width, and bitcasting to the final type. 2748 Value *V; 2749 2750 if (VecTy) { 2751 // If this is a memset of a vectorized alloca, insert it. 2752 assert(ElementTy == ScalarTy); 2753 2754 unsigned BeginIndex = getIndex(BeginOffset); 2755 unsigned EndIndex = getIndex(EndOffset); 2756 assert(EndIndex > BeginIndex && "Empty vector!"); 2757 unsigned NumElements = EndIndex - BeginIndex; 2758 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2759 2760 Value *Splat = getIntegerSplat(IRB, II.getValue(), 2761 TD.getTypeSizeInBits(ElementTy)/8); 2762 Splat = convertValue(TD, IRB, Splat, ElementTy); 2763 if (NumElements > 1) 2764 Splat = getVectorSplat(IRB, Splat, NumElements); 2765 2766 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2767 getName(".oldload")); 2768 V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); 2769 } else if (IntTy) { 2770 // If this is a memset on an alloca where we can widen stores, insert the 2771 // set integer. 2772 assert(!II.isVolatile()); 2773 2774 uint64_t Size = EndOffset - BeginOffset; 2775 V = getIntegerSplat(IRB, II.getValue(), Size); 2776 2777 if (IntTy && (BeginOffset != NewAllocaBeginOffset || 2778 EndOffset != NewAllocaBeginOffset)) { 2779 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2780 getName(".oldload")); 2781 Old = convertValue(TD, IRB, Old, IntTy); 2782 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2783 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2784 V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); 2785 } else { 2786 assert(V->getType() == IntTy && 2787 "Wrong type for an alloca wide integer!"); 2788 } 2789 V = convertValue(TD, IRB, V, AllocaTy); 2790 } else { 2791 // Established these invariants above. 2792 assert(BeginOffset == NewAllocaBeginOffset); 2793 assert(EndOffset == NewAllocaEndOffset); 2794 2795 V = getIntegerSplat(IRB, II.getValue(), 2796 TD.getTypeSizeInBits(ScalarTy)/8); 2797 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 2798 V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); 2799 2800 V = convertValue(TD, IRB, V, AllocaTy); 2801 } 2802 2803 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 2804 II.isVolatile()); 2805 (void)New; 2806 DEBUG(dbgs() << " to: " << *New << "\n"); 2807 return !II.isVolatile(); 2808 } 2809 2810 bool visitMemTransferInst(MemTransferInst &II) { 2811 // Rewriting of memory transfer instructions can be a bit tricky. We break 2812 // them into two categories: split intrinsics and unsplit intrinsics. 2813 2814 DEBUG(dbgs() << " original: " << II << "\n"); 2815 IRBuilderTy IRB(&II); 2816 2817 assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); 2818 bool IsDest = II.getRawDest() == OldPtr; 2819 2820 const AllocaPartitioning::MemTransferOffsets &MTO 2821 = P.getMemTransferOffsets(II); 2822 2823 // Compute the relative offset within the transfer. 2824 unsigned IntPtrWidth = TD.getPointerSizeInBits(); 2825 APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin 2826 : MTO.SourceBegin)); 2827 2828 unsigned Align = II.getAlignment(); 2829 if (Align > 1) 2830 Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), 2831 MinAlign(II.getAlignment(), getPartitionAlign())); 2832 2833 // For unsplit intrinsics, we simply modify the source and destination 2834 // pointers in place. This isn't just an optimization, it is a matter of 2835 // correctness. With unsplit intrinsics we may be dealing with transfers 2836 // within a single alloca before SROA ran, or with transfers that have 2837 // a variable length. We may also be dealing with memmove instead of 2838 // memcpy, and so simply updating the pointers is the necessary for us to 2839 // update both source and dest of a single call. 2840 if (!MTO.IsSplittable) { 2841 Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource(); 2842 if (IsDest) 2843 II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); 2844 else 2845 II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); 2846 2847 Type *CstTy = II.getAlignmentCst()->getType(); 2848 II.setAlignment(ConstantInt::get(CstTy, Align)); 2849 2850 DEBUG(dbgs() << " to: " << II << "\n"); 2851 deleteIfTriviallyDead(OldOp); 2852 return false; 2853 } 2854 // For split transfer intrinsics we have an incredibly useful assurance: 2855 // the source and destination do not reside within the same alloca, and at 2856 // least one of them does not escape. This means that we can replace 2857 // memmove with memcpy, and we don't need to worry about all manner of 2858 // downsides to splitting and transforming the operations. 2859 2860 // If this doesn't map cleanly onto the alloca type, and that type isn't 2861 // a single value type, just emit a memcpy. 2862 bool EmitMemCpy 2863 = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || 2864 EndOffset != NewAllocaEndOffset || 2865 !NewAI.getAllocatedType()->isSingleValueType()); 2866 2867 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 2868 // size hasn't been shrunk based on analysis of the viable range, this is 2869 // a no-op. 2870 if (EmitMemCpy && &OldAI == &NewAI) { 2871 uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; 2872 uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; 2873 // Ensure the start lines up. 2874 assert(BeginOffset == OrigBegin); 2875 (void)OrigBegin; 2876 2877 // Rewrite the size as needed. 2878 if (EndOffset != OrigEnd) 2879 II.setLength(ConstantInt::get(II.getLength()->getType(), 2880 EndOffset - BeginOffset)); 2881 return false; 2882 } 2883 // Record this instruction for deletion. 2884 Pass.DeadInsts.insert(&II); 2885 2886 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2887 // alloca that should be re-examined after rewriting this instruction. 2888 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 2889 if (AllocaInst *AI 2890 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) 2891 Pass.Worklist.insert(AI); 2892 2893 if (EmitMemCpy) { 2894 Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() 2895 : II.getRawDest()->getType(); 2896 2897 // Compute the other pointer, folding as much as possible to produce 2898 // a single, simple GEP in most cases. 2899 OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2900 getName("." + OtherPtr->getName())); 2901 2902 Value *OurPtr 2903 = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() 2904 : II.getRawSource()->getType()); 2905 Type *SizeTy = II.getLength()->getType(); 2906 Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); 2907 2908 CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, 2909 IsDest ? OtherPtr : OurPtr, 2910 Size, Align, II.isVolatile()); 2911 (void)New; 2912 DEBUG(dbgs() << " to: " << *New << "\n"); 2913 return false; 2914 } 2915 2916 // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy 2917 // is equivalent to 1, but that isn't true if we end up rewriting this as 2918 // a load or store. 2919 if (!Align) 2920 Align = 1; 2921 2922 bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && 2923 EndOffset == NewAllocaEndOffset; 2924 uint64_t Size = EndOffset - BeginOffset; 2925 unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; 2926 unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; 2927 unsigned NumElements = EndIndex - BeginIndex; 2928 IntegerType *SubIntTy 2929 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; 2930 2931 Type *OtherPtrTy = NewAI.getType(); 2932 if (VecTy && !IsWholeAlloca) { 2933 if (NumElements == 1) 2934 OtherPtrTy = VecTy->getElementType(); 2935 else 2936 OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); 2937 2938 OtherPtrTy = OtherPtrTy->getPointerTo(); 2939 } else if (IntTy && !IsWholeAlloca) { 2940 OtherPtrTy = SubIntTy->getPointerTo(); 2941 } 2942 2943 Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, 2944 getName("." + OtherPtr->getName())); 2945 Value *DstPtr = &NewAI; 2946 if (!IsDest) 2947 std::swap(SrcPtr, DstPtr); 2948 2949 Value *Src; 2950 if (VecTy && !IsWholeAlloca && !IsDest) { 2951 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2952 getName(".load")); 2953 Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); 2954 } else if (IntTy && !IsWholeAlloca && !IsDest) { 2955 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2956 getName(".load")); 2957 Src = convertValue(TD, IRB, Src, IntTy); 2958 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2959 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2960 Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); 2961 } else { 2962 Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), 2963 getName(".copyload")); 2964 } 2965 2966 if (VecTy && !IsWholeAlloca && IsDest) { 2967 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2968 getName(".oldload")); 2969 Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); 2970 } else if (IntTy && !IsWholeAlloca && IsDest) { 2971 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 2972 getName(".oldload")); 2973 Old = convertValue(TD, IRB, Old, IntTy); 2974 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2975 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2976 Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); 2977 Src = convertValue(TD, IRB, Src, NewAllocaTy); 2978 } 2979 2980 StoreInst *Store = cast<StoreInst>( 2981 IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); 2982 (void)Store; 2983 DEBUG(dbgs() << " to: " << *Store << "\n"); 2984 return !II.isVolatile(); 2985 } 2986 2987 bool visitIntrinsicInst(IntrinsicInst &II) { 2988 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 2989 II.getIntrinsicID() == Intrinsic::lifetime_end); 2990 DEBUG(dbgs() << " original: " << II << "\n"); 2991 IRBuilderTy IRB(&II); 2992 assert(II.getArgOperand(1) == OldPtr); 2993 2994 // Record this instruction for deletion. 2995 Pass.DeadInsts.insert(&II); 2996 2997 ConstantInt *Size 2998 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 2999 EndOffset - BeginOffset); 3000 Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); 3001 Value *New; 3002 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 3003 New = IRB.CreateLifetimeStart(Ptr, Size); 3004 else 3005 New = IRB.CreateLifetimeEnd(Ptr, Size); 3006 3007 (void)New; 3008 DEBUG(dbgs() << " to: " << *New << "\n"); 3009 return true; 3010 } 3011 3012 bool visitPHINode(PHINode &PN) { 3013 DEBUG(dbgs() << " original: " << PN << "\n"); 3014 3015 // We would like to compute a new pointer in only one place, but have it be 3016 // as local as possible to the PHI. To do that, we re-use the location of 3017 // the old pointer, which necessarily must be in the right position to 3018 // dominate the PHI. 3019 IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr)); 3020 3021 Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); 3022 // Replace the operands which were using the old pointer. 3023 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 3024 3025 DEBUG(dbgs() << " to: " << PN << "\n"); 3026 deleteIfTriviallyDead(OldPtr); 3027 return false; 3028 } 3029 3030 bool visitSelectInst(SelectInst &SI) { 3031 DEBUG(dbgs() << " original: " << SI << "\n"); 3032 IRBuilderTy IRB(&SI); 3033 3034 // Find the operand we need to rewrite here. 3035 bool IsTrueVal = SI.getTrueValue() == OldPtr; 3036 if (IsTrueVal) 3037 assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); 3038 else 3039 assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); 3040 3041 Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); 3042 SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); 3043 DEBUG(dbgs() << " to: " << SI << "\n"); 3044 deleteIfTriviallyDead(OldPtr); 3045 return false; 3046 } 3047 3048 }; 3049 } 3050 3051 namespace { 3052 /// \brief Visitor to rewrite aggregate loads and stores as scalar. 3053 /// 3054 /// This pass aggressively rewrites all aggregate loads and stores on 3055 /// a particular pointer (or any pointer derived from it which we can identify) 3056 /// with scalar loads and stores. 3057 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 3058 // Befriend the base class so it can delegate to private visit methods. 3059 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 3060 3061 const DataLayout &TD; 3062 3063 /// Queue of pointer uses to analyze and potentially rewrite. 3064 SmallVector<Use *, 8> Queue; 3065 3066 /// Set to prevent us from cycling with phi nodes and loops. 3067 SmallPtrSet<User *, 8> Visited; 3068 3069 /// The current pointer use being rewritten. This is used to dig up the used 3070 /// value (as opposed to the user). 3071 Use *U; 3072 3073 public: 3074 AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} 3075 3076 /// Rewrite loads and stores through a pointer and all pointers derived from 3077 /// it. 3078 bool rewrite(Instruction &I) { 3079 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 3080 enqueueUsers(I); 3081 bool Changed = false; 3082 while (!Queue.empty()) { 3083 U = Queue.pop_back_val(); 3084 Changed |= visit(cast<Instruction>(U->getUser())); 3085 } 3086 return Changed; 3087 } 3088 3089 private: 3090 /// Enqueue all the users of the given instruction for further processing. 3091 /// This uses a set to de-duplicate users. 3092 void enqueueUsers(Instruction &I) { 3093 for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; 3094 ++UI) 3095 if (Visited.insert(*UI)) 3096 Queue.push_back(&UI.getUse()); 3097 } 3098 3099 // Conservative default is to not rewrite anything. 3100 bool visitInstruction(Instruction &I) { return false; } 3101 3102 /// \brief Generic recursive split emission class. 3103 template <typename Derived> 3104 class OpSplitter { 3105 protected: 3106 /// The builder used to form new instructions. 3107 IRBuilderTy IRB; 3108 /// The indices which to be used with insert- or extractvalue to select the 3109 /// appropriate value within the aggregate. 3110 SmallVector<unsigned, 4> Indices; 3111 /// The indices to a GEP instruction which will move Ptr to the correct slot 3112 /// within the aggregate. 3113 SmallVector<Value *, 4> GEPIndices; 3114 /// The base pointer of the original op, used as a base for GEPing the 3115 /// split operations. 3116 Value *Ptr; 3117 3118 /// Initialize the splitter with an insertion point, Ptr and start with a 3119 /// single zero GEP index. 3120 OpSplitter(Instruction *InsertionPoint, Value *Ptr) 3121 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} 3122 3123 public: 3124 /// \brief Generic recursive split emission routine. 3125 /// 3126 /// This method recursively splits an aggregate op (load or store) into 3127 /// scalar or vector ops. It splits recursively until it hits a single value 3128 /// and emits that single value operation via the template argument. 3129 /// 3130 /// The logic of this routine relies on GEPs and insertvalue and 3131 /// extractvalue all operating with the same fundamental index list, merely 3132 /// formatted differently (GEPs need actual values). 3133 /// 3134 /// \param Ty The type being split recursively into smaller ops. 3135 /// \param Agg The aggregate value being built up or stored, depending on 3136 /// whether this is splitting a load or a store respectively. 3137 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 3138 if (Ty->isSingleValueType()) 3139 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); 3140 3141 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 3142 unsigned OldSize = Indices.size(); 3143 (void)OldSize; 3144 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 3145 ++Idx) { 3146 assert(Indices.size() == OldSize && "Did not return to the old size"); 3147 Indices.push_back(Idx); 3148 GEPIndices.push_back(IRB.getInt32(Idx)); 3149 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 3150 GEPIndices.pop_back(); 3151 Indices.pop_back(); 3152 } 3153 return; 3154 } 3155 3156 if (StructType *STy = dyn_cast<StructType>(Ty)) { 3157 unsigned OldSize = Indices.size(); 3158 (void)OldSize; 3159 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 3160 ++Idx) { 3161 assert(Indices.size() == OldSize && "Did not return to the old size"); 3162 Indices.push_back(Idx); 3163 GEPIndices.push_back(IRB.getInt32(Idx)); 3164 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 3165 GEPIndices.pop_back(); 3166 Indices.pop_back(); 3167 } 3168 return; 3169 } 3170 3171 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 3172 } 3173 }; 3174 3175 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 3176 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3177 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 3178 3179 /// Emit a leaf load of a single value. This is called at the leaves of the 3180 /// recursive emission to actually load values. 3181 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3182 assert(Ty->isSingleValueType()); 3183 // Load the single value and insert it using the indices. 3184 Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); 3185 Value *Load = IRB.CreateLoad(GEP, Name + ".load"); 3186 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3187 DEBUG(dbgs() << " to: " << *Load << "\n"); 3188 } 3189 }; 3190 3191 bool visitLoadInst(LoadInst &LI) { 3192 assert(LI.getPointerOperand() == *U); 3193 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3194 return false; 3195 3196 // We have an aggregate being loaded, split it apart. 3197 DEBUG(dbgs() << " original: " << LI << "\n"); 3198 LoadOpSplitter Splitter(&LI, *U); 3199 Value *V = UndefValue::get(LI.getType()); 3200 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3201 LI.replaceAllUsesWith(V); 3202 LI.eraseFromParent(); 3203 return true; 3204 } 3205 3206 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3207 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 3208 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 3209 3210 /// Emit a leaf store of a single value. This is called at the leaves of the 3211 /// recursive emission to actually produce stores. 3212 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 3213 assert(Ty->isSingleValueType()); 3214 // Extract the single value and store it using the indices. 3215 Value *Store = IRB.CreateStore( 3216 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 3217 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 3218 (void)Store; 3219 DEBUG(dbgs() << " to: " << *Store << "\n"); 3220 } 3221 }; 3222 3223 bool visitStoreInst(StoreInst &SI) { 3224 if (!SI.isSimple() || SI.getPointerOperand() != *U) 3225 return false; 3226 Value *V = SI.getValueOperand(); 3227 if (V->getType()->isSingleValueType()) 3228 return false; 3229 3230 // We have an aggregate being stored, split it apart. 3231 DEBUG(dbgs() << " original: " << SI << "\n"); 3232 StoreOpSplitter Splitter(&SI, *U); 3233 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 3234 SI.eraseFromParent(); 3235 return true; 3236 } 3237 3238 bool visitBitCastInst(BitCastInst &BC) { 3239 enqueueUsers(BC); 3240 return false; 3241 } 3242 3243 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 3244 enqueueUsers(GEPI); 3245 return false; 3246 } 3247 3248 bool visitPHINode(PHINode &PN) { 3249 enqueueUsers(PN); 3250 return false; 3251 } 3252 3253 bool visitSelectInst(SelectInst &SI) { 3254 enqueueUsers(SI); 3255 return false; 3256 } 3257 }; 3258 } 3259 3260 /// \brief Strip aggregate type wrapping. 3261 /// 3262 /// This removes no-op aggregate types wrapping an underlying type. It will 3263 /// strip as many layers of types as it can without changing either the type 3264 /// size or the allocated size. 3265 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 3266 if (Ty->isSingleValueType()) 3267 return Ty; 3268 3269 uint64_t AllocSize = DL.getTypeAllocSize(Ty); 3270 uint64_t TypeSize = DL.getTypeSizeInBits(Ty); 3271 3272 Type *InnerTy; 3273 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 3274 InnerTy = ArrTy->getElementType(); 3275 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 3276 const StructLayout *SL = DL.getStructLayout(STy); 3277 unsigned Index = SL->getElementContainingOffset(0); 3278 InnerTy = STy->getElementType(Index); 3279 } else { 3280 return Ty; 3281 } 3282 3283 if (AllocSize > DL.getTypeAllocSize(InnerTy) || 3284 TypeSize > DL.getTypeSizeInBits(InnerTy)) 3285 return Ty; 3286 3287 return stripAggregateTypeWrapping(DL, InnerTy); 3288 } 3289 3290 /// \brief Try to find a partition of the aggregate type passed in for a given 3291 /// offset and size. 3292 /// 3293 /// This recurses through the aggregate type and tries to compute a subtype 3294 /// based on the offset and size. When the offset and size span a sub-section 3295 /// of an array, it will even compute a new array type for that sub-section, 3296 /// and the same for structs. 3297 /// 3298 /// Note that this routine is very strict and tries to find a partition of the 3299 /// type which produces the *exact* right offset and size. It is not forgiving 3300 /// when the size or offset cause either end of type-based partition to be off. 3301 /// Also, this is a best-effort routine. It is reasonable to give up and not 3302 /// return a type if necessary. 3303 static Type *getTypePartition(const DataLayout &TD, Type *Ty, 3304 uint64_t Offset, uint64_t Size) { 3305 if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) 3306 return stripAggregateTypeWrapping(TD, Ty); 3307 if (Offset > TD.getTypeAllocSize(Ty) || 3308 (TD.getTypeAllocSize(Ty) - Offset) < Size) 3309 return 0; 3310 3311 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 3312 // We can't partition pointers... 3313 if (SeqTy->isPointerTy()) 3314 return 0; 3315 3316 Type *ElementTy = SeqTy->getElementType(); 3317 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3318 uint64_t NumSkippedElements = Offset / ElementSize; 3319 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) 3320 if (NumSkippedElements >= ArrTy->getNumElements()) 3321 return 0; 3322 if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) 3323 if (NumSkippedElements >= VecTy->getNumElements()) 3324 return 0; 3325 Offset -= NumSkippedElements * ElementSize; 3326 3327 // First check if we need to recurse. 3328 if (Offset > 0 || Size < ElementSize) { 3329 // Bail if the partition ends in a different array element. 3330 if ((Offset + Size) > ElementSize) 3331 return 0; 3332 // Recurse through the element type trying to peel off offset bytes. 3333 return getTypePartition(TD, ElementTy, Offset, Size); 3334 } 3335 assert(Offset == 0); 3336 3337 if (Size == ElementSize) 3338 return stripAggregateTypeWrapping(TD, ElementTy); 3339 assert(Size > ElementSize); 3340 uint64_t NumElements = Size / ElementSize; 3341 if (NumElements * ElementSize != Size) 3342 return 0; 3343 return ArrayType::get(ElementTy, NumElements); 3344 } 3345 3346 StructType *STy = dyn_cast<StructType>(Ty); 3347 if (!STy) 3348 return 0; 3349 3350 const StructLayout *SL = TD.getStructLayout(STy); 3351 if (Offset >= SL->getSizeInBytes()) 3352 return 0; 3353 uint64_t EndOffset = Offset + Size; 3354 if (EndOffset > SL->getSizeInBytes()) 3355 return 0; 3356 3357 unsigned Index = SL->getElementContainingOffset(Offset); 3358 Offset -= SL->getElementOffset(Index); 3359 3360 Type *ElementTy = STy->getElementType(Index); 3361 uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); 3362 if (Offset >= ElementSize) 3363 return 0; // The offset points into alignment padding. 3364 3365 // See if any partition must be contained by the element. 3366 if (Offset > 0 || Size < ElementSize) { 3367 if ((Offset + Size) > ElementSize) 3368 return 0; 3369 return getTypePartition(TD, ElementTy, Offset, Size); 3370 } 3371 assert(Offset == 0); 3372 3373 if (Size == ElementSize) 3374 return stripAggregateTypeWrapping(TD, ElementTy); 3375 3376 StructType::element_iterator EI = STy->element_begin() + Index, 3377 EE = STy->element_end(); 3378 if (EndOffset < SL->getSizeInBytes()) { 3379 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 3380 if (Index == EndIndex) 3381 return 0; // Within a single element and its padding. 3382 3383 // Don't try to form "natural" types if the elements don't line up with the 3384 // expected size. 3385 // FIXME: We could potentially recurse down through the last element in the 3386 // sub-struct to find a natural end point. 3387 if (SL->getElementOffset(EndIndex) != EndOffset) 3388 return 0; 3389 3390 assert(Index < EndIndex); 3391 EE = STy->element_begin() + EndIndex; 3392 } 3393 3394 // Try to build up a sub-structure. 3395 StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), 3396 STy->isPacked()); 3397 const StructLayout *SubSL = TD.getStructLayout(SubTy); 3398 if (Size != SubSL->getSizeInBytes()) 3399 return 0; // The sub-struct doesn't have quite the size needed. 3400 3401 return SubTy; 3402 } 3403 3404 /// \brief Rewrite an alloca partition's users. 3405 /// 3406 /// This routine drives both of the rewriting goals of the SROA pass. It tries 3407 /// to rewrite uses of an alloca partition to be conducive for SSA value 3408 /// promotion. If the partition needs a new, more refined alloca, this will 3409 /// build that new alloca, preserving as much type information as possible, and 3410 /// rewrite the uses of the old alloca to point at the new one and have the 3411 /// appropriate new offsets. It also evaluates how successful the rewrite was 3412 /// at enabling promotion and if it was successful queues the alloca to be 3413 /// promoted. 3414 bool SROA::rewriteAllocaPartition(AllocaInst &AI, 3415 AllocaPartitioning &P, 3416 AllocaPartitioning::iterator PI) { 3417 uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; 3418 bool IsLive = false; 3419 for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), 3420 UE = P.use_end(PI); 3421 UI != UE && !IsLive; ++UI) 3422 if (UI->getUse()) 3423 IsLive = true; 3424 if (!IsLive) 3425 return false; // No live uses left of this partition. 3426 3427 DEBUG(dbgs() << "Speculating PHIs and selects in partition " 3428 << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); 3429 3430 PHIOrSelectSpeculator Speculator(*TD, P, *this); 3431 DEBUG(dbgs() << " speculating "); 3432 DEBUG(P.print(dbgs(), PI, "")); 3433 Speculator.visitUsers(PI); 3434 3435 // Try to compute a friendly type for this partition of the alloca. This 3436 // won't always succeed, in which case we fall back to a legal integer type 3437 // or an i8 array of an appropriate size. 3438 Type *AllocaTy = 0; 3439 if (Type *PartitionTy = P.getCommonType(PI)) 3440 if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) 3441 AllocaTy = PartitionTy; 3442 if (!AllocaTy) 3443 if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), 3444 PI->BeginOffset, AllocaSize)) 3445 AllocaTy = PartitionTy; 3446 if ((!AllocaTy || 3447 (AllocaTy->isArrayTy() && 3448 AllocaTy->getArrayElementType()->isIntegerTy())) && 3449 TD->isLegalInteger(AllocaSize * 8)) 3450 AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); 3451 if (!AllocaTy) 3452 AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); 3453 assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); 3454 3455 // Check for the case where we're going to rewrite to a new alloca of the 3456 // exact same type as the original, and with the same access offsets. In that 3457 // case, re-use the existing alloca, but still run through the rewriter to 3458 // perform phi and select speculation. 3459 AllocaInst *NewAI; 3460 if (AllocaTy == AI.getAllocatedType()) { 3461 assert(PI->BeginOffset == 0 && 3462 "Non-zero begin offset but same alloca type"); 3463 assert(PI == P.begin() && "Begin offset is zero on later partition"); 3464 NewAI = &AI; 3465 } else { 3466 unsigned Alignment = AI.getAlignment(); 3467 if (!Alignment) { 3468 // The minimum alignment which users can rely on when the explicit 3469 // alignment is omitted or zero is that required by the ABI for this 3470 // type. 3471 Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); 3472 } 3473 Alignment = MinAlign(Alignment, PI->BeginOffset); 3474 // If we will get at least this much alignment from the type alone, leave 3475 // the alloca's alignment unconstrained. 3476 if (Alignment <= TD->getABITypeAlignment(AllocaTy)) 3477 Alignment = 0; 3478 NewAI = new AllocaInst(AllocaTy, 0, Alignment, 3479 AI.getName() + ".sroa." + Twine(PI - P.begin()), 3480 &AI); 3481 ++NumNewAllocas; 3482 } 3483 3484 DEBUG(dbgs() << "Rewriting alloca partition " 3485 << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " 3486 << *NewAI << "\n"); 3487 3488 // Track the high watermark of the post-promotion worklist. We will reset it 3489 // to this point if the alloca is not in fact scheduled for promotion. 3490 unsigned PPWOldSize = PostPromotionWorklist.size(); 3491 3492 AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, 3493 PI->BeginOffset, PI->EndOffset); 3494 DEBUG(dbgs() << " rewriting "); 3495 DEBUG(P.print(dbgs(), PI, "")); 3496 bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); 3497 if (Promotable) { 3498 DEBUG(dbgs() << " and queuing for promotion\n"); 3499 PromotableAllocas.push_back(NewAI); 3500 } else if (NewAI != &AI) { 3501 // If we can't promote the alloca, iterate on it to check for new 3502 // refinements exposed by splitting the current alloca. Don't iterate on an 3503 // alloca which didn't actually change and didn't get promoted. 3504 Worklist.insert(NewAI); 3505 } 3506 3507 // Drop any post-promotion work items if promotion didn't happen. 3508 if (!Promotable) 3509 while (PostPromotionWorklist.size() > PPWOldSize) 3510 PostPromotionWorklist.pop_back(); 3511 3512 return true; 3513 } 3514 3515 /// \brief Walks the partitioning of an alloca rewriting uses of each partition. 3516 bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { 3517 bool Changed = false; 3518 for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; 3519 ++PI) 3520 Changed |= rewriteAllocaPartition(AI, P, PI); 3521 3522 return Changed; 3523 } 3524 3525 /// \brief Analyze an alloca for SROA. 3526 /// 3527 /// This analyzes the alloca to ensure we can reason about it, builds 3528 /// a partitioning of the alloca, and then hands it off to be split and 3529 /// rewritten as needed. 3530 bool SROA::runOnAlloca(AllocaInst &AI) { 3531 DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 3532 ++NumAllocasAnalyzed; 3533 3534 // Special case dead allocas, as they're trivial. 3535 if (AI.use_empty()) { 3536 AI.eraseFromParent(); 3537 return true; 3538 } 3539 3540 // Skip alloca forms that this analysis can't handle. 3541 if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || 3542 TD->getTypeAllocSize(AI.getAllocatedType()) == 0) 3543 return false; 3544 3545 bool Changed = false; 3546 3547 // First, split any FCA loads and stores touching this alloca to promote 3548 // better splitting and promotion opportunities. 3549 AggLoadStoreRewriter AggRewriter(*TD); 3550 Changed |= AggRewriter.rewrite(AI); 3551 3552 // Build the partition set using a recursive instruction-visiting builder. 3553 AllocaPartitioning P(*TD, AI); 3554 DEBUG(P.print(dbgs())); 3555 if (P.isEscaped()) 3556 return Changed; 3557 3558 // Delete all the dead users of this alloca before splitting and rewriting it. 3559 for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), 3560 DE = P.dead_user_end(); 3561 DI != DE; ++DI) { 3562 Changed = true; 3563 (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); 3564 DeadInsts.insert(*DI); 3565 } 3566 for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), 3567 DE = P.dead_op_end(); 3568 DO != DE; ++DO) { 3569 Value *OldV = **DO; 3570 // Clobber the use with an undef value. 3571 **DO = UndefValue::get(OldV->getType()); 3572 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 3573 if (isInstructionTriviallyDead(OldI)) { 3574 Changed = true; 3575 DeadInsts.insert(OldI); 3576 } 3577 } 3578 3579 // No partitions to split. Leave the dead alloca for a later pass to clean up. 3580 if (P.begin() == P.end()) 3581 return Changed; 3582 3583 return splitAlloca(AI, P) || Changed; 3584 } 3585 3586 /// \brief Delete the dead instructions accumulated in this run. 3587 /// 3588 /// Recursively deletes the dead instructions we've accumulated. This is done 3589 /// at the very end to maximize locality of the recursive delete and to 3590 /// minimize the problems of invalidated instruction pointers as such pointers 3591 /// are used heavily in the intermediate stages of the algorithm. 3592 /// 3593 /// We also record the alloca instructions deleted here so that they aren't 3594 /// subsequently handed to mem2reg to promote. 3595 void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { 3596 while (!DeadInsts.empty()) { 3597 Instruction *I = DeadInsts.pop_back_val(); 3598 DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 3599 3600 I->replaceAllUsesWith(UndefValue::get(I->getType())); 3601 3602 for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) 3603 if (Instruction *U = dyn_cast<Instruction>(*OI)) { 3604 // Zero out the operand and see if it becomes trivially dead. 3605 *OI = 0; 3606 if (isInstructionTriviallyDead(U)) 3607 DeadInsts.insert(U); 3608 } 3609 3610 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3611 DeletedAllocas.insert(AI); 3612 3613 ++NumDeleted; 3614 I->eraseFromParent(); 3615 } 3616 } 3617 3618 /// \brief Promote the allocas, using the best available technique. 3619 /// 3620 /// This attempts to promote whatever allocas have been identified as viable in 3621 /// the PromotableAllocas list. If that list is empty, there is nothing to do. 3622 /// If there is a domtree available, we attempt to promote using the full power 3623 /// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is 3624 /// based on the SSAUpdater utilities. This function returns whether any 3625 /// promotion occurred. 3626 bool SROA::promoteAllocas(Function &F) { 3627 if (PromotableAllocas.empty()) 3628 return false; 3629 3630 NumPromoted += PromotableAllocas.size(); 3631 3632 if (DT && !ForceSSAUpdater) { 3633 DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 3634 PromoteMemToReg(PromotableAllocas, *DT); 3635 PromotableAllocas.clear(); 3636 return true; 3637 } 3638 3639 DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); 3640 SSAUpdater SSA; 3641 DIBuilder DIB(*F.getParent()); 3642 SmallVector<Instruction*, 64> Insts; 3643 3644 for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { 3645 AllocaInst *AI = PromotableAllocas[Idx]; 3646 for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); 3647 UI != UE;) { 3648 Instruction *I = cast<Instruction>(*UI++); 3649 // FIXME: Currently the SSAUpdater infrastructure doesn't reason about 3650 // lifetime intrinsics and so we strip them (and the bitcasts+GEPs 3651 // leading to them) here. Eventually it should use them to optimize the 3652 // scalar values produced. 3653 if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { 3654 assert(onlyUsedByLifetimeMarkers(I) && 3655 "Found a bitcast used outside of a lifetime marker."); 3656 while (!I->use_empty()) 3657 cast<Instruction>(*I->use_begin())->eraseFromParent(); 3658 I->eraseFromParent(); 3659 continue; 3660 } 3661 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 3662 assert(II->getIntrinsicID() == Intrinsic::lifetime_start || 3663 II->getIntrinsicID() == Intrinsic::lifetime_end); 3664 II->eraseFromParent(); 3665 continue; 3666 } 3667 3668 Insts.push_back(I); 3669 } 3670 AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); 3671 Insts.clear(); 3672 } 3673 3674 PromotableAllocas.clear(); 3675 return true; 3676 } 3677 3678 namespace { 3679 /// \brief A predicate to test whether an alloca belongs to a set. 3680 class IsAllocaInSet { 3681 typedef SmallPtrSet<AllocaInst *, 4> SetType; 3682 const SetType &Set; 3683 3684 public: 3685 typedef AllocaInst *argument_type; 3686 3687 IsAllocaInSet(const SetType &Set) : Set(Set) {} 3688 bool operator()(AllocaInst *AI) const { return Set.count(AI); } 3689 }; 3690 } 3691 3692 bool SROA::runOnFunction(Function &F) { 3693 DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 3694 C = &F.getContext(); 3695 TD = getAnalysisIfAvailable<DataLayout>(); 3696 if (!TD) { 3697 DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); 3698 return false; 3699 } 3700 DT = getAnalysisIfAvailable<DominatorTree>(); 3701 3702 BasicBlock &EntryBB = F.getEntryBlock(); 3703 for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); 3704 I != E; ++I) 3705 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 3706 Worklist.insert(AI); 3707 3708 bool Changed = false; 3709 // A set of deleted alloca instruction pointers which should be removed from 3710 // the list of promotable allocas. 3711 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 3712 3713 do { 3714 while (!Worklist.empty()) { 3715 Changed |= runOnAlloca(*Worklist.pop_back_val()); 3716 deleteDeadInstructions(DeletedAllocas); 3717 3718 // Remove the deleted allocas from various lists so that we don't try to 3719 // continue processing them. 3720 if (!DeletedAllocas.empty()) { 3721 Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3722 PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); 3723 PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), 3724 PromotableAllocas.end(), 3725 IsAllocaInSet(DeletedAllocas)), 3726 PromotableAllocas.end()); 3727 DeletedAllocas.clear(); 3728 } 3729 } 3730 3731 Changed |= promoteAllocas(F); 3732 3733 Worklist = PostPromotionWorklist; 3734 PostPromotionWorklist.clear(); 3735 } while (!Worklist.empty()); 3736 3737 return Changed; 3738 } 3739 3740 void SROA::getAnalysisUsage(AnalysisUsage &AU) const { 3741 if (RequiresDomTree) 3742 AU.addRequired<DominatorTree>(); 3743 AU.setPreservesCFG(); 3744 } 3745