//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
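//
// As an illustrative sketch of the widening described above (not taken from
// the references), a loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten, for VF = 4, into a wide loop stepping by four
// plus a scalar epilogue for the n % 4 remainder:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42,42,42,42>;   // one wide load/add/store
//   for (; i < n; ++i)
//     A[i] = B[i] + 42;                      // scalar remainder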
//
//===----------------------------------------------------------------------===//

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."));

static cl::opt<unsigned>
VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
                    cl::desc("Sets the vectorization unroll count. "
                             "Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this
/// number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
static const unsigned RuntimeMemoryCheckThreshold = 8;

/// We use a metadata with this name to indicate that a scalar loop was
/// vectorized and that we don't need to re-vectorize it if we run into it
/// again.
static const char*
AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
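/// For example, with VF = 4 each scalar i32 operation in the loop body is
/// widened into a single operation on <4 x i32> values.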
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
        Induction(0), OldInduction(0), WidenMap(UnrollFactor) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode*, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value*, 2> VectorParts;

  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
                               Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
                            PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
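  /// As a sketch, for VF = 2 a scalarized operation such as
  ///   %r = urem i32 %a, %b
  /// conceptually becomes
  ///   %r0 = urem i32 %a0, %b0
  ///   %r1 = urem i32 %a1, %b1
  /// followed by insertelement instructions that pack %r0 and %r1 back into
  /// a vector for later users (the lane names are illustrative only).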
  void scalarizeInstruction(Instruction *Instr);

  /// Vectorize Load and Store instructions.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  LoopVectorizationLegality *Legal);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
  /// The sequence starts at StartIdx.
  Value *getConsecutiveVector(Value *Val, int StartIdx, bool Negate);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorParts type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to
    /// the same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Data Layout.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;
  /// The vectorization unroll factor to use. Each scalar is vectorized to
  /// this many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability
///   of a single induction variable, that all types are supported and
///   vectorize-able, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
                            DominatorTree *DT, TargetTransformInfo *TTI,
                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
        Induction(0), HasFunNoNaNAttr(false) {}

  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
    RK_NoReduction,   ///< Not a reduction.
    RK_IntegerAdd,    ///< Sum of integers.
    RK_IntegerMult,   ///< Product of integers.
    RK_IntegerOr,     ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,    ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,    ///< Bitwise or logical XOR of numbers.
    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,      ///< Sum of floats.
    RK_FloatMult,     ///< Product of floats.
    RK_FloatMinMax    ///< Min/max implemented in terms of select(cmp()).
  };

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = -sizeof(elem).
  };

  // This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };

  /// This POD struct holds information about reduction variables.
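  /// For example, in 'for (i = 0; i < n; ++i) sum += A[i];' the PHI for
  /// 'sum' describes an RK_IntegerAdd reduction: StartValue is the value of
  /// 'sum' before the loop and LoopExitInstr is the add whose result is used
  /// after the loop.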
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    TrackingVH<Value> StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction, the kind of reduction.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I) :
      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate?
    bool IsReduction;
    // The last instruction in a min/max pattern (select of the select(icmp())
    // pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern, the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<TrackingVH<Value>, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
    /// Holds the information if this pointer is used for writing to memory.
    SmallVector<bool, 2> IsWritePtr;
  };

  /// A POD for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Alias(Multi)Map stores the values (GEPs or underlying objects) and
  /// their respective Store/Load instruction(s) to calculate aliasing.
  typedef MapVector<Value*, Instruction*> AliasMap;
  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed.
  bool blockCanBePredicated(BasicBlock *BB);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from
  /// the compare instruction to the select instruction and stores this
  /// pointer in the 'PatternLastInst' member of the returned struct.
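  /// For instance, a signed-min reduction appears in the IR as
  ///   %c = icmp slt i32 %min, %x
  ///   %min.next = select i1 %c, i32 %min, i32 %x
  /// and 'PatternLastInst' ends up pointing at the select.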
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);
  /// Return true if we can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);
  /// Return true if there is a chance that writes may be reordered.
  bool hasPossibleGlobalWriteReorder(Value *Object,
                                     Instruction *Inst,
                                     AliasMultiMap &WriteObjects,
                                     unsigned MaxByteWidth);
  /// Return the AA location for a load or a store.
  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Info.
  TargetTransformInfo *TTI;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Target Library Info.
  TargetLibraryInfo *TLI;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs?
  bool HasFunNoNaNAttr;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use
/// the TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             DataLayout *DL, const TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize,
                                                unsigned UserVF);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of
  /// the selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
                              unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return Information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type *ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
};

/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  AliasAnalysis *AA;
  TargetLibraryInfo *TLI;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
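    // (A loop is innermost when it has no child loops, which is what
    // Loop::empty() reports.)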
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
    AA = getAnalysisIfAvailable<AliasAnalysis>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();

    if (DL == NULL) {
      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    Function *F = L->getHeader()->getParent();
    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
    unsigned FnIndex = AttributeSet::FunctionIndex;
    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);

    if (NoFloat) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
            "attribute is used.\n");
      return false;
    }

    // Select the optimal vectorization factor.
    LoopVectorizationCostModel::VectorizationFactor VF;
    VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
    // Select the unroll factor.
    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
                                        VF.Width, VF.Cost);

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " <<
          F->getParent()->getModuleIdentifier() << "\n");
    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");

    // If we decided that it is *legal* to vectorize the loop then do it.
    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<DominatorTree>();
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfo>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//

void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                       Loop *Lp, Value *Ptr,
                                                       bool WritePtr) {
  const SCEV *Sc = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  Pointers.push_back(Ptr);
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
}

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Save the current insertion location.
  Instruction *Loc = Builder.GetInsertPoint();

  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  // Restore the builder insertion point.
  if (Invariant)
    Builder.SetInsertPoint(Loc);

  return Shuf;
}

Value *InnerLoopVectorizer::getConsecutiveVector(Value *Val, int StartIdx,
                                                 bool Negate) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  int VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i) {
    int64_t Idx = Negate ? (-i) : i;
    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
  }

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
  // Make sure that the pointer does not point to structs.
  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are constant then we know it is consecutive.
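  // (For example, a load through 'getelementptr %P, i32 0' where %P is a
  // pointer induction variable advances by exactly one element per
  // iteration, so its accesses cover a consecutive range of memory.)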
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionInfo II = Inductions[Phi];
    if (IK_PtrInduction == II.IK)
      return 1;
    else if (IK_ReversePtrInduction == II.IK)
      return -1;
  }

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return 1;
    if (Step->isAllOnesValue())
      return -1;
  }

  return 0;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // If we have this scalar in the map, return it.
  if (WidenMap.has(V))
    return WidenMap.get(V);

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  return WidenMap.splat(V, B);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant*, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}

void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                      LoopVectorizationLegality *Legal) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();

  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy) / VF;

  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the load.
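  // (A loop-invariant pointer makes every lane access the same address, and
  // a non-consecutive pointer such as the gather A[B[i]] cannot be expressed
  // as a single wide load/store here.)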
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (!ConsecutiveStride || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
    Value *PtrOperand = Gep->getPointerOperand();
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else if (Gep) {
    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                               OrigLoop) && "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index. For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();

    Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
    VectorParts &GEPParts = getVectorValue(LastGepOperand);
    Value *LastIndex = GEPParts[0];
    LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(NumOperands - 1, LastIndex);
    Gep2->setName("gep.indvar.idx");
    Ptr = Builder.Insert(Gep2);
  } else {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  }

  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");

    VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
      }

      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
      Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
    }
    // Stores are done; do not fall through to the load handling below.
    return;
  }

  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
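      // (For VF = 4 and Part = 0 the two GEPs below compute Ptr - 3, so the
      // wide load covers Ptr[-3..0]; the loaded vector is reversed afterwards
      // to restore the original element order.)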
      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
    }

    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
    Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
    cast<LoadInst>(LI)->setAlignment(Alignment);
    Entry[Part] = Reverse ? reverseVector(LI) : LI;
  }
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block,
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a
      // constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? 0 :
      UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {
      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted
      // scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a
      // vector so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part],
                                                       Cloned,
                                                       Builder.getInt32(Width));
    }
  }
}

Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                     Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
      Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Instruction *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value*, 2> Starts;
  SmallVector<Value*, 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type *PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr << "\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << "\n");

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  IRBuilder<> ChkBuilder(Loc);

  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // No need to check if two readonly pointers intersect.
      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
        continue;

      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
      Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
      Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");

      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
      if (MemoryRuntimeCheck)
        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                         "conflict.rdx");

      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
    }
  }

  return MemoryRuntimeCheck;
}

void
InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

        [ ]     <-- vector loop bypass (may consist of multiple blocks).
       /  |
      /   v
     |   [ ]    <-- vector pre header.
     |    |
     |    v
     |   [ ] \
     |   [ ]_|  <-- vector loop.
     |    |
      \   v
       >[ ]     <--- middle-block.
      /   |
     /    v
     |   [ ]    <--- new preheader.
     |    |
     |    v
     |   [ ] \
     |   [ ]_|  <-- old scalar loop to handle remainder.
      \   |
       \  v
        >[ ]    <-- exit block.
         ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Mark the old scalar loop with metadata that tells us not to vectorize
  // this loop again if we run into it.
  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
  OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = OldInduction ? OldInduction->getType() :
      DL->getIntPtrType(SE->getContext());

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Value *StartIdx = OldInduction ?
      OldInduction->getIncomingValueForBlock(BypassBlock) :
      ConstantInt::get(IdxTy, 0);

  assert(BypassBlock && "Invalid loop structure");
  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);

  // This is the IR builder that we use to add all of the logic for bypassing
  // the new vector loop.
  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
    else
      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
  }

  // Add the start index to the loop count to get the new end index.
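  // (With start index S and trip count N, the values computed below are:
  //  end.idx = S + N, n.vec = N - N % (VF * UF), and
  //  end.idx.rnd.down = S + n.vec, the index at which the scalar remainder
  //  loop resumes.)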
  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");

  // Now we need to generate the expression for N - (N % (VF * UF)), which is
  // the part that the vectorized body will execute.
  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
                                                   "end.idx.rnd.down");

  // Now, compare the new count to zero. If it is zero skip the vector loop
  // and jump to the scalar loop.
  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
                                          "cmp.zero");

  BasicBlock *LastBypassBlock = BypassBlock;

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
                                                 BypassBlock->getTerminator());
  if (MemRuntimeCheck) {
    // Create a new block containing the memory check.
    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
                                                          "vector.memcheck");
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
    Instruction *OldTerm = BypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

    Cmp = MemRuntimeCheck;
    LastBypassBlock = CheckBlock;
  }

  LastBypassBlock->getTerminator()->eraseFromParent();
  BranchInst::Create(MiddleBlock, VectorPH, Cmp, LastBypassBlock);

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    LoopVectorizationLegality::InductionInfo II = I->second;
    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
    Value *EndValue = 0;
    switch (II.IK) {
    case LoopVectorizationLegality::IK_NoInduction:
      llvm_unreachable("Unknown induction");
    case LoopVectorizationLegality::IK_IntInduction: {
      // Handle the integer induction counter.
      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
      assert(OrigPhi == OldInduction && "Unknown integer PHI");
      // We know what the end value is.
      EndValue = IdxEndRoundDown;
      // We also know which PHI node holds it.
      ResumeIndex = ResumeVal;
      break;
    }
    case LoopVectorizationLegality::IK_ReverseIntInduction: {
      // Convert the CountRoundDown variable to the PHI size.
      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
      Value *CRD = CountRoundDown;
      if (CRDSize > IISize)
        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
                               II.StartValue->getType(), "tr.crd",
                               LoopBypassBlocks.back()->getTerminator());
      else if (CRDSize < IISize)
        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
                               II.StartValue->getType(), "sext.crd",
                               LoopBypassBlocks.back()->getTerminator());
      // Handle the reverse integer induction counter.
      EndValue =
          BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
                                    LoopBypassBlocks.back()->getTerminator());
      break;
    }
    case LoopVectorizationLegality::IK_PtrInduction: {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue =
          GetElementPtrInst::Create(II.StartValue, CountRoundDown,
                                    "ptr.ind.end",
                                    LoopBypassBlocks.back()->getTerminator());
      break;
    }
    case LoopVectorizationLegality::IK_ReversePtrInduction: {
      // The value at the end of the loop for the reverse pointer is
      // calculated by creating a GEP with a negative index starting from
      // the start value.
      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
      Value *NegIdx =
          BinaryOperator::CreateSub(Zero, CountRoundDown, "rev.ind.end",
                                    LoopBypassBlocks.back()->getTerminator());
      EndValue =
          GetElementPtrInst::Create(II.StartValue, NegIdx, "rev.ptr.ind.end",
                                    LoopBypassBlocks.back()->getTerminator());
      break;
    }
    } // end of switch.

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction) {
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where the scalar loop needs to
  // continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());

  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Create and register the new vector loop.
  Loop* Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
      ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  } else {
    LI->addTopLevelLoop(Lp);
  }

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
}

/// This function returns the identity element (or neutral element) for
/// the operation K.
Constant*
LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
  switch (K) {
  case RK_IntegerXor:
  case RK_IntegerAdd:
  case RK_IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return ConstantInt::get(Tp, 0);
  case RK_IntegerMult:
    // Multiplying a number by 1 does not change it.
    return ConstantInt::get(Tp, 1);
  case RK_IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return ConstantInt::get(Tp, -1, true);
  case RK_FloatMult:
    // Multiplying a number by 1 does not change it.
    return ConstantFP::get(Tp, 1.0L);
  case RK_FloatAdd:
    // Adding zero to a number does not change it.
    return ConstantFP::get(Tp, 0.0L);
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}
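// The identity element seeds the vector lanes that do not yet carry a partial
// result. For example, an i32 ADD reduction with VF = 4 starts from
// <0, 0, 0, 0> and an AND reduction from <-1, -1, -1, -1>, so folding the
// identity lanes into the final result leaves it unchanged.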
static Intrinsic::ID
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
  // If we have an intrinsic call, check if it is trivially vectorizable.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::sqrt:
    case Intrinsic::sin:
    case Intrinsic::cos:
    case Intrinsic::exp:
    case Intrinsic::exp2:
    case Intrinsic::log:
    case Intrinsic::log10:
    case Intrinsic::log2:
    case Intrinsic::fabs:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::trunc:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::pow:
    case Intrinsic::fma:
    case Intrinsic::fmuladd:
      return II->getIntrinsicID();
    default:
      return Intrinsic::not_intrinsic;
    }
  }

  if (!TLI)
    return Intrinsic::not_intrinsic;

  LibFunc::Func Func;
  Function *F = CI->getCalledFunction();
  // We're going to make assumptions on the semantics of the functions, check
  // that the target knows that it's available in this environment.
  if (!F || !TLI->getLibFunc(F->getName(), Func))
    return Intrinsic::not_intrinsic;

  // Otherwise check if we have a call to a function that can be turned into a
  // vector intrinsic.
  switch (Func) {
  default:
    break;
  case LibFunc::sin:
  case LibFunc::sinf:
  case LibFunc::sinl:
    return Intrinsic::sin;
  case LibFunc::cos:
  case LibFunc::cosf:
  case LibFunc::cosl:
    return Intrinsic::cos;
  case LibFunc::exp:
  case LibFunc::expf:
  case LibFunc::expl:
    return Intrinsic::exp;
  case LibFunc::exp2:
  case LibFunc::exp2f:
  case LibFunc::exp2l:
    return Intrinsic::exp2;
  case LibFunc::log:
  case LibFunc::logf:
  case LibFunc::logl:
    return Intrinsic::log;
  case LibFunc::log10:
  case LibFunc::log10f:
  case LibFunc::log10l:
    return Intrinsic::log10;
  case LibFunc::log2:
  case LibFunc::log2f:
  case LibFunc::log2l:
    return Intrinsic::log2;
  case LibFunc::fabs:
  case LibFunc::fabsf:
  case LibFunc::fabsl:
    return Intrinsic::fabs;
  case LibFunc::floor:
  case LibFunc::floorf:
  case LibFunc::floorl:
    return Intrinsic::floor;
  case LibFunc::ceil:
  case LibFunc::ceilf:
  case LibFunc::ceill:
    return Intrinsic::ceil;
  case LibFunc::trunc:
  case LibFunc::truncf:
  case LibFunc::truncl:
    return Intrinsic::trunc;
  case LibFunc::rint:
  case LibFunc::rintf:
  case LibFunc::rintl:
    return Intrinsic::rint;
  case LibFunc::nearbyint:
  case LibFunc::nearbyintf:
  case LibFunc::nearbyintl:
    return Intrinsic::nearbyint;
  case LibFunc::pow:
  case LibFunc::powf:
  case LibFunc::powl:
    return Intrinsic::pow;
  }

  return Intrinsic::not_intrinsic;
}
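// For example, a call to 'sinf' maps to Intrinsic::sin. When the call is
// widened (see the Instruction::Call case in vectorizeBlockInLoop), the
// intrinsic is re-declared with a vector overload, so with VF = 4 the emitted
// call is to llvm.sin.v4f32.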
/// This function translates the reduction kind to an LLVM binary operator.
static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
  switch (Kind) {
  case LoopVectorizationLegality::RK_IntegerAdd:
    return Instruction::Add;
  case LoopVectorizationLegality::RK_IntegerMult:
    return Instruction::Mul;
  case LoopVectorizationLegality::RK_IntegerOr:
    return Instruction::Or;
  case LoopVectorizationLegality::RK_IntegerAnd:
    return Instruction::And;
  case LoopVectorizationLegality::RK_IntegerXor:
    return Instruction::Xor;
  case LoopVectorizationLegality::RK_FloatMult:
    return Instruction::FMul;
  case LoopVectorizationLegality::RK_FloatAdd:
    return Instruction::FAdd;
  case LoopVectorizationLegality::RK_IntegerMinMax:
    return Instruction::ICmp;
  case LoopVectorizationLegality::RK_FloatMinMax:
    return Instruction::FCmp;
  default:
    llvm_unreachable("Unknown reduction operation");
  }
}

Value *createMinMaxOp(IRBuilder<> &Builder,
                      LoopVectorizationLegality::MinMaxReductionKind RK,
                      Value *Left,
                      Value *Right) {
  CmpInst::Predicate P = CmpInst::ICMP_NE;
  switch (RK) {
  default:
    llvm_unreachable("Unknown min/max reduction kind");
  case LoopVectorizationLegality::MRK_UIntMin:
    P = CmpInst::ICMP_ULT;
    break;
  case LoopVectorizationLegality::MRK_UIntMax:
    P = CmpInst::ICMP_UGT;
    break;
  case LoopVectorizationLegality::MRK_SIntMin:
    P = CmpInst::ICMP_SLT;
    break;
  case LoopVectorizationLegality::MRK_SIntMax:
    P = CmpInst::ICMP_SGT;
    break;
  case LoopVectorizationLegality::MRK_FloatMin:
    P = CmpInst::FCMP_OLT;
    break;
  case LoopVectorizationLegality::MRK_FloatMax:
    P = CmpInst::FCMP_OGT;
    break;
  }

  Value *Cmp;
  if (RK == LoopVectorizationLegality::MRK_FloatMin ||
      RK == LoopVectorizationLegality::MRK_FloatMax)
    Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
  else
    Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");

  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
  return Select;
}
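// For example, a signed-max reduction step (MRK_SIntMax) on <4 x i32> values
// lowers to:
//   %rdx.minmax.cmp    = icmp sgt <4 x i32> %left, %right
//   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp,
//                               <4 x i32> %left, <4 x i32> %right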
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  Constant *Zero = Builder.getInt32(0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;

  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  // Vectorize all of the blocks in the original loop.
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb)
    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit[0]->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Value *Identity;
    Value *VectorStart;
    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
      // Min/max reductions have the start value as their identity.
      VectorStart = Identity = Builder.CreateVectorSplat(VF,
                                                         RdxDesc.StartValue,
                                                         "minmax.ident");
    } else {
      Constant *Iden =
        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
                                                        VecTy->getScalarType());
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = Builder.CreateInsertElement(Identity,
                                                RdxDesc.StartValue, Zero);
    }
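    // For instance, an ADD reduction that enters the loop with scalar start
    // value S and VF = 4 is seeded with VectorStart = <S, 0, 0, 0>: lane 0
    // carries the start value and the other lanes hold the identity, so S is
    // counted exactly once in the final horizontal reduction.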
    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
    VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {
      // Make sure to add the reduction start value only to the
      // first unroll part.
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
    }

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    VectorParts RdxParts;
    for (unsigned part = 0; part < UF; ++part) {
      // This PHINode contains the vectorized reduction variable, or
      // the initial value vector, if we bypass the vector loop.
      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
      RdxParts.push_back(NewPhi);
    }

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = getReductionBinOp(RdxDesc.Kind);
    for (unsigned part = 1; part < UF; ++part) {
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
                                             RdxParts[part], ReducedPartRdx,
                                             "bin.rdx");
      else
        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                        ReducedPartRdx, RdxParts[part]);
    }

    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
    // and vector ops, reducing the set of values being computed by half each
    // round.
    assert(isPowerOf2_32(VF) &&
           "Reduction emission only supported for pow2 vectors!");
    Value *TmpVec = ReducedPartRdx;
    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
    for (unsigned i = VF; i != 1; i >>= 1) {
      // Move the upper half of the vector to the lower half.
      for (unsigned j = 0; j != i/2; ++j)
        ShuffleMask[j] = Builder.getInt32(i/2 + j);

      // Fill the rest of the mask with undef.
      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
                UndefValue::get(Builder.getInt32Ty()));

      Value *Shuf =
        Builder.CreateShuffleVector(TmpVec,
                                    UndefValue::get(TmpVec->getType()),
                                    ConstantVector::get(ShuffleMask),
                                    "rdx.shuf");

      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                     "bin.rdx");
      else
        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
    }

    // The result is in the first element of the vector.
    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
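    // For VF = 4 and an ADD reduction over <a0, a1, a2, a3> this emits two
    // rounds:
    //   round 1: shuffle with mask <2, 3, undef, undef>, then add
    //            -> <a0+a2, a1+a3, undef, undef>
    //   round 2: shuffle with mask <1, undef, undef, undef>, then add
    //            -> <a0+a1+a2+a3, undef, undef, undef>
    // after which lane 0 holds the reduced scalar.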
    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single incoming edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx =
      (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.

  // The loop exit block may have single value PHI nodes where the incoming
  // value is 'undef'. While vectorizing we only handled real values that
  // were defined inside the loop. Here we handle the 'undef' case.
  // See PR14725.
  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
    if (!LCSSAPhi) continue;
    if (LCSSAPhi->getNumIncomingValues() == 1)
      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                            LoopMiddleBlock);
  }
}

InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  VectorParts SrcMask = createBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (BI->isConditional()) {
    VectorParts EdgeMask = getVectorValue(BI->getCondition());

    if (BI->getSuccessor(0) != Dst)
      for (unsigned part = 0; part < UF; ++part)
        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);

    for (unsigned part = 0; part < UF; ++part)
      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
    return EdgeMask;
  }

  return SrcMask;
}

InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Loop incoming mask is all-one.
  if (OrigLoop->getHeader() == BB) {
    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
    return getVectorValue(C);
  }

  // This is the block mask. We start with zero and OR in the mask of each
  // incoming edge.
  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
  VectorParts BlockMask = getVectorValue(Zero);

  // For each pred:
  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
    VectorParts EM = createEdgeMask(*it, BB);
    for (unsigned part = 0; part < UF; ++part)
      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
  }

  return BlockMask;
}
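// Taken together, the two functions above implement the usual predication
// recurrence for if-converted code:
//   EdgeMask(Src, Dst) = BlockMask(Src) & Cond   (Cond negated when Dst is
//                                                 the false successor)
//   BlockMask(BB)      = OR of EdgeMask(P, BB) over all predecessors P
// with the loop header's mask being all-true.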
void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                          BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    VectorParts &Entry = WidenMap.get(it);
    switch (it->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI:{
      PHINode* P = cast<PHINode>(it);
      // Handle reduction variables:
      if (Legal->getReductionVars()->count(P)) {
        for (unsigned part = 0; part < UF; ++part) {
          // This is phase one of vectorizing PHIs.
          Type *VecTy = VectorType::get(it->getType(), VF);
          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
                                        LoopVectorBody->getFirstInsertionPt());
        }
        PV->push_back(P);
        continue;
      }

      // Check for PHI nodes that are lowered to vector selects.
      if (P->getParent() != OrigLoop->getHeader()) {
        // We know that all PHIs in non-header blocks are converted into
        // selects, so we don't have to worry about the insertion order and we
        // can just use the builder.
        // At this point we generate the predication tree. There may be
        // duplications since this is a simple recursive scan, but future
        // optimizations will clean it up.

        unsigned NumIncoming = P->getNumIncomingValues();
        assert(NumIncoming > 1 && "Invalid PHI");

        // Generate a sequence of selects of the form:
        // SELECT(Mask3, In3,
        //      SELECT(Mask2, In2,
        //                   ( ...)))
        for (unsigned In = 0; In < NumIncoming; In++) {
          VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
                                            P->getParent());
          VectorParts &In0 = getVectorValue(P->getIncomingValue(In));

          for (unsigned part = 0; part < UF; ++part) {
            // We don't need to 'select' the first PHI operand because it is
            // the default value if all of the other masks don't match.
            if (In == 0)
              Entry[part] = In0[part];
            else
              // Select between the current value and the previous incoming
              // edge based on the incoming mask.
              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
                                                 Entry[part], "predphi");
          }
        }
        continue;
      }

      // This PHINode must be an induction variable.
      // Make sure that we know about it.
      assert(Legal->getInductionVars()->count(P) &&
             "Not an induction variable");

      LoopVectorizationLegality::InductionInfo II =
        Legal->getInductionVars()->lookup(P);

      switch (II.IK) {
      case LoopVectorizationLegality::IK_NoInduction:
        llvm_unreachable("Unknown induction");
      case LoopVectorizationLegality::IK_IntInduction: {
        assert(P == OldInduction && "Unexpected PHI");
        Value *Broadcasted = getBroadcastInstrs(Induction);
        // After broadcasting the induction variable we need to make the
        // vector consecutive by adding 0, 1, 2 ...
        for (unsigned part = 0; part < UF; ++part)
          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
        continue;
      }
      case LoopVectorizationLegality::IK_ReverseIntInduction:
      case LoopVectorizationLegality::IK_PtrInduction:
      case LoopVectorizationLegality::IK_ReversePtrInduction:
        // Handle reverse integer and pointer inductions.
        Value *StartIdx = 0;
        // If we have a single integer induction variable then use it.
        // Otherwise, start counting at zero.
        if (OldInduction) {
          LoopVectorizationLegality::InductionInfo OldII =
            Legal->getInductionVars()->lookup(OldInduction);
          StartIdx = OldII.StartValue;
        } else {
          StartIdx = ConstantInt::get(Induction->getType(), 0);
        }
        // This is the normalized GEP that starts counting at zero.
        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                                 "normalized.idx");

        // Handle the reverse integer induction variable case.
        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
                                                 "resize.norm.idx");
          Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
                                                "reverse.idx");

          // This is a new value so do not hoist it out.
          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
          // After broadcasting the induction variable we need to make the
          // vector consecutive by adding ... -3, -2, -1, 0.
          for (unsigned part = 0; part < UF; ++part)
            Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
                                               true);
          continue;
        }

        // Handle the pointer induction variable case.
        assert(P->getType()->isPointerTy() && "Unexpected type.");

        // Is this a reverse induction ptr or a consecutive induction ptr?
        bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
                        II.IK);

        // This is the vector of results. Notice that we don't generate
        // vector geps because scalar geps result in better code.
        for (unsigned part = 0; part < UF; ++part) {
          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
          for (unsigned int i = 0; i < VF; ++i) {
            int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
            Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
            Value *GlobalIdx;
            if (!Reverse)
              GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
            else
              GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");

            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                               "next.gep");
            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                                 Builder.getInt32(i),
                                                 "insert.gep");
          }
          Entry[part] = VecVal;
        }
        continue;
      }

    }// End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
        }
        if (VecOp && isa<PossiblyExactOperator>(VecOp))
          VecOp->setIsExact(BinOp->isExact());

        Entry[Part] = V;
      }
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                               OrigLoop);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(it->getOperand(0));
      VectorParts &Op0 = getVectorValue(it->getOperand(1));
      VectorParts &Op1 = getVectorValue(it->getOperand(2));
      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
                                                       Builder.getInt32(0));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
          InvariantCond ? ScalarCond : Cond[Part],
          Op0[Part],
          Op1[Part]);
      }
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (it->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = 0;
        if (FCmp)
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
        else
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        Entry[Part] = C;
      }
      break;
    }

    case Instruction::Store:
    case Instruction::Load:
      vectorizeMemoryInstruction(it, Legal);
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      CastInst *CI = dyn_cast<CastInst>(it);
      // Optimize the special case where the source is the induction
      // variable. Notice that we can only optimize the 'trunc' case
      // because: a. FP conversions lose precision, b. sext/zext may wrap,
      // c. other casts depend on pointer size.
      if (CI->getOperand(0) == OldInduction &&
          it->getOpcode() == Instruction::Trunc) {
        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                               CI->getType());
        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
        for (unsigned Part = 0; Part < UF; ++Part)
          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
        break;
      }
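      // For example, for 'trunc i64 %iv to i32' with VF = 4 this emits one
      // scalar trunc of the induction variable, broadcasts the result, and
      // adds the step vector <0, 1, 2, 3>, instead of truncating a whole
      // vector lane by lane.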
      // Vectorize casts.
      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);

      VectorParts &A = getVectorValue(it->getOperand(0));
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      break;
    }

    case Instruction::Call: {
      // Ignore dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        break;

      Module *M = BB->getParent()->getParent();
      CallInst *CI = cast<CallInst>(it);
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      assert(ID && "Not an intrinsic call!");
      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value*, 4> Args;
        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
          VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
          Args.push_back(Arg[Part]);
        }
        Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
        Entry[Part] = Builder.CreateCall(F, Args);
      }
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(it);
      break;
    }// end of switch.
  }// end of for_each instr.
}

void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}
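// If-conversion turns the control flow inside the loop into data flow: each
// conditionally executed block receives a mask (see createBlockInMask) and
// its PHIs become selects, leaving a single straight-line body that can be
// widened. The check below only admits loops in which every block that needs
// predication can legally be predicated.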
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion)
    return false;

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();

  // Collect the blocks that need predication.
  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
    BasicBlock *BB = LoopBlocks[i];

    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator()))
      return false;

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
      return false;
  }

  // We can if-convert this loop.
  return true;
}

bool LoopVectorizationLegality::canVectorize() {
  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!TheLoop->getLoopPreheader())
    return false;

  // We can only vectorize innermost loops.
  if (TheLoop->getSubLoopsVector().size())
    return false;

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1)
    return false;

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock())
    return false;

  unsigned NumBlocks = TheLoop->getNumBlocks();

  // Check if we can if-convert non-single-BB loops.
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // We need the loop latch to compute the exit count below.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  DEBUG(dbgs() << "LV: Found a loop: " <<
        TheLoop->getHeader()->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  // Collect all of the variables that remain uniform after vectorization.
  collectLoopUniforms();

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSet<Value *, 4> &Reductions) {
  // Reduction instructions are allowed to have exit users. All other
  // instructions must not have external users.
  if (!Reductions.count(Inst))
    // Check that all of the users of the instruction are inside the loop.
    for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
         I != E; ++I) {
      Instruction *U = cast<Instruction>(*I);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(U)) {
        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
        return true;
      }
    }
  return false;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
  BasicBlock *Header = TheLoop->getHeader();

  // If we marked the scalar loop as "already vectorized" then there is no
  // need to vectorize it again.
  if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
    DEBUG(dbgs() << "LV: This loop was vectorized before\n");
    return false;
  }

  // Look for the attribute signaling the absence of NaNs.
  Function &F = *Header->getParent();
  if (F.hasFnAttribute("no-nans-fp-math"))
    HasFunNoNaNAttr = F.getAttributes().getAttribute(
      AttributeSet::FunctionIndex,
      "no-nans-fp-math").getValueAsString() == "true";

  // For each block in the loop.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
        // Check that this PHI type is allowed.
        if (!Phi->getType()->isIntegerTy() &&
            !Phi->getType()->isFloatingPointTy() &&
            !Phi->getType()->isPointerTy()) {
          DEBUG(dbgs() << "LV: Found a non-int, non-FP, non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to a select during if-conversion. No need to check
        // if the PHIs in this block are induction or reduction variables.
        if (*bb != Header) {
          // Check that this instruction has no outside users or is an
          // identified reduction value with an outside user.
          if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
            continue;
          return false;
        }

        // Header PHIs must have exactly two incoming values: the preheader
        // value and the latch value.
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // This is the value coming from the preheader.
        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
        // Check if this is an induction variable.
        InductionKind IK = isInductionVariable(Phi);

        if (IK_NoInduction != IK) {
          // Int inductions are special because we only allow one IV.
          if (IK == IK_IntInduction) {
            if (Induction) {
              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
              return false;
            }
            Induction = Phi;
          }

          DEBUG(dbgs() << "LV: Found an induction variable.\n");
          Inductions[Phi] = InductionInfo(StartValue, IK);
          continue;
        }

        if (AddReductionVar(Phi, RK_IntegerAdd)) {
          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMult)) {
          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerOr)) {
          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerAnd)) {
          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerXor)) {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMult)) {
          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatAdd)) {
          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMinMax)) {
          DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<
                "\n");
          continue;
        }

        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
        return false;
      }// end of PHI handling

      // We still don't handle functions. However, we can ignore dbg intrinsic
      // calls and we do handle certain intrinsic and libm functions.
      CallInst *CI = dyn_cast<CallInst>(it);
      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
        DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }

      // Check that the instruction return type is vectorizable.
      if (!VectorType::isValidElementType(it->getType()) &&
          !it->getType()->isVoidTy()) {
        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T))
          return false;
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
        return false;

    } // next instr.

  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    assert(getInductionVars()->size() && "No induction variables");
  }

  return true;
}
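// Uniform values evaluate to the same result in every lane of a wide
// iteration, so they can stay scalar in the vectorized loop. The walk below
// starts from the latch's branch condition and follows operands backwards;
// everything it reaches inside the loop (e.g. the compare that feeds the
// backedge) is recorded in Uniforms.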
void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(Latch->getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this loop.
    // Stop when reaching PHI nodes.
    // TODO: we need to follow values all over the loop, not only in this
    // block.
    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
      continue;

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }
}

AliasAnalysis::Location
LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
    return AA->getLocation(Store);
  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
    return AA->getLocation(Load);

  llvm_unreachable("Should be either load or store instruction");
}

bool
LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
                                                Value *Object,
                                                Instruction *Inst,
                                                AliasMultiMap& WriteObjects,
                                                unsigned MaxByteWidth) {

  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);

  std::vector<Instruction*>::iterator
              it = WriteObjects[Object].begin(),
              end = WriteObjects[Object].end();

  for (; it != end; ++it) {
    Instruction* I = *it;
    if (I == Inst)
      continue;

    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
                  ThatLoc.getWithNewSize(MaxByteWidth)))
      return true;
  }
  return false;
}

bool LoopVectorizationLegality::canVectorizeMemory() {

  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld) return false;
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
          return false;
        }
        Loads.push_back(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to
      // memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) return false;
        if (!St->isSimple() && !IsAnnotatedParallel) {
          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
          return false;
        }
        Stores.push_back(St);
      }
    } // next instr.
  } // next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't care
  // if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find. These maps hold
  // unique values for pointers (so no need for multi-map).
  AliasMap Reads;
  AliasMap ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = cast<StoreInst>(*I);
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.insert(std::make_pair(Ptr, ST));
  }

  if (IsAnnotatedParallel) {
    DEBUG(dbgs()
          << "LV: The loop is annotated parallel; skipping memory dependency "
          << "checks.\n");
    return true;
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the index is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
      Reads.insert(std::make_pair(Ptr, LD));
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  unsigned NumReadPtrs = 0;
  unsigned NumWritePtrs = 0;

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;
  AliasMap::iterator MI, ME;
  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
    Value *V = (*MI).first;
    if (hasComputableBounds(V)) {
      PtrRtCheck.insert(SE, TheLoop, V, true);
      NumWritePtrs++;
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
    } else {
      CanDoRT = false;
      break;
    }
  }
  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
    Value *V = (*MI).first;
    if (hasComputableBounds(V)) {
      PtrRtCheck.insert(SE, TheLoop, V, false);
      NumReadPtrs++;
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
    } else {
      CanDoRT = false;
      break;
    }
  }

  // Check that we did not collect too many pointers or find an unsizeable
  // pointer.
  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
  }

  if (CanDoRT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }
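  // To make the comparison count concrete (an illustrative scenario, not a
  // measurement): with 2 write pointers and 2 read pointers,
  // NumComparisons = 2 * (2 + 2 - 1) = 6, which is within the
  // RuntimeMemoryCheckThreshold of 8; a third write pointer would give
  // 3 * (2 + 3 - 1) = 12 and the runtime check would be abandoned.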
  bool NeedRTCheck = false;

  // Biggest vectorized access possible, vector width * unroll factor.
  // TODO: We're being very pessimistic here, find a way to know the
  // real access width before getting here.
  unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
                          TTI->getMaximumUnrollFactor();
  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes and the reads.
  // Note that WriteObjects duplicates the stores (indexed now by underlying
  // objects) to avoid pointing to elements inside ReadWrites.
  // TODO: Maybe create a new type where they can interact without duplication.
  AliasMultiMap WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  bool AllWritesIdentified = true;
  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
    Value *Val = (*MI).first;
    Instruction *Inst = (*MI).second;

    GetUnderlyingObjects(Val, TempObjects, DL);
    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
         UI != UE; ++UI) {
      if (!isIdentifiedObject(*UI)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
        NeedRTCheck = true;
        AllWritesIdentified = false;
      }

      // Never seen it before, can't alias.
      if (WriteObjects[*UI].empty()) {
        DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
        WriteObjects[*UI].push_back(Inst);
        continue;
      }
      // Direct alias found.
      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **UI <<"\n");
        return false;
      }
      DEBUG(dbgs() << "LV: Found a conflicting global value:"
            << **UI <<"\n");
      DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");

      // If it is a global alias, make sure the writes really do alias.
      if (hasPossibleGlobalWriteReorder(*UI,
                                        Inst,
                                        WriteObjects,
                                        MaxByteWidth)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
              << "\n");
        return false;
      }

      // Didn't alias, insert into map for further reference.
      WriteObjects[*UI].push_back(Inst);
    }
    TempObjects.clear();
  }

  // Check that the reads don't conflict with the read-writes.
  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
    Value *Val = (*MI).first;
    GetUnderlyingObjects(Val, TempObjects, DL);
    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
         UI != UE; ++UI) {
      // If all of the writes are identified then we don't care if the read
      // pointer is identified or not.
      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
        NeedRTCheck = true;
      }

      // Never seen it before, can't alias.
      if (WriteObjects[*UI].empty())
        continue;
      // Direct alias found.
      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
        DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
              << **UI <<"\n");
        return false;
      }
      DEBUG(dbgs() << "LV: Found a global value: "
            << **UI <<"\n");
      Instruction *Inst = (*MI).second;
      DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");

      // If it is a global alias, make sure the accesses really do alias.
      if (hasPossibleGlobalWriteReorder(*UI,
                                        Inst,
                                        WriteObjects,
                                        MaxByteWidth)) {
        DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
              << "\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  PtrRtCheck.Need = NeedRTCheck;
  if (NeedRTCheck && !CanDoRT) {
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    return false;
  }

  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
        " need a runtime memory check.\n");
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Reduction variables are only found in the loop header block.
  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // Obtain the reduction start value from the value that comes from the loop
  // preheader.
  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction and the variables that form the
  // cycle which ends in the phi node.
  Instruction *ExitInstruction = 0;
  // Indicates that we found a binary operation in our scan.
  bool FoundBinOp = false;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions that can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  Instruction *Iter = Phi;

  // To recognize min/max patterns formed by an icmp/select sequence, we count
  // the instructions we have seen from the recognized min/max pattern. This
  // way we don't stop when we see that the phi has two uses (one by the
  // select and one by the icmp), and we can make sure we see exactly those
  // two instructions.
  unsigned NumCmpSelectPatternInst = 0;
  ReductionInstDesc ReduxDesc(false, 0);

  // Avoid cycles in the chain.
  SmallPtrSet<Instruction *, 8> VisitedInsts;
  while (VisitedInsts.insert(Iter)) {
    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // Did we find a user inside this loop already?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node already?
    bool FoundStartPHI = false;

    // Is this a bin op?
    FoundBinOp |= !isa<PHINode>(Iter);

    // For each of the *users* of Iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }

      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (!TheLoop->contains(Parent)) {
        // Bail out if we find multiple outside users.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }

      // We allow in-loop PHINodes which are not the original reduction PHI
      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
      // structure) then don't skip this PHI.
      if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
          U->getParent() != TheLoop->getHeader() &&
          TheLoop->contains(U) &&
          Iter->hasNUsesOrMore(2))
        continue;

      // We can't have multiple inside users except for a combination of
      // icmp/select both using the phi.
      if (FoundInBlockUser && !NumCmpSelectPatternInst)
        return false;
      FoundInBlockUser = true;

      // Any reduction instr must be of one of the allowed kinds.
      ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
      if (!ReduxDesc.IsReduction)
        return false;

      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
        ++NumCmpSelectPatternInst;
      if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
        ++NumCmpSelectPatternInst;

      // Reductions with non-commutative instructions such as Div and Sub are
      // only possible if the LHS is the reduction variable.
      if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
          !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
        return false;

      Iter = ReduxDesc.PatternLastInst;
    }

    // This means we have seen one but not the other instruction of the
    // pattern or more than just a select and cmp.
    if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
        NumCmpSelectPatternInst != 2)
      return false;

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
                             ReduxDesc.MinMaxKind);
      Reductions[Phi] = RD;
      // We've ended the cycle. This is a reduction variable if we have an
      // outside user and it has a binary op.
      return FoundBinOp && ExitInstruction;
    }
  }

  return false;
}

/// Returns a ReductionInstDesc recognizing whether the instruction is a
/// Select(ICmp(X, Y), X, Y) instruction pattern corresponding to a
/// min(X, Y) or max(X, Y). For example (illustrative IR):
///   %c = icmp slt i32 %x, %y
///   %s = select i1 %c, i32 %x, i32 %y   ; smin(%x, %y)
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev) {
  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
         "Expected a cmp or select instruction");
  Instruction *Cmp = 0;
  SelectInst *Select = 0;

  // We must handle the select(cmp()) as a single instruction. Advance to the
  // select.
  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(Select, Prev.MinMaxKind);
  }

  // Only handle single use cases for now.
  if (!(Select = dyn_cast<SelectInst>(I)))
    return ReductionInstDesc(false, I);
  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
    return ReductionInstDesc(false, I);
  if (!Cmp->hasOneUse())
    return ReductionInstDesc(false, I);

  Value *CmpLeft;
  Value *CmpRight;

  // Look for a min/max pattern.
  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMin);
  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMax);
  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMax);
  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMin);
  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);
  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);

  return ReductionInstDesc(false, I);
}

LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind,
                                            ReductionInstDesc &Prev) {
  bool FP = I->getType()->isFloatingPointTy();
  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
  switch (I->getOpcode()) {
  default:
    return ReductionInstDesc(false, I);
  case Instruction::PHI:
    if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
               Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(I, Prev.MinMaxKind);
  case Instruction::Sub:
  case Instruction::Add:
    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
  case Instruction::Mul:
    return ReductionInstDesc(Kind == RK_IntegerMult, I);
  case Instruction::And:
    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
  case Instruction::Or:
    return ReductionInstDesc(Kind == RK_IntegerOr, I);
  case Instruction::Xor:
    return ReductionInstDesc(Kind == RK_IntegerXor, I);
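  // Floating-point reductions reassociate the operations: under IEEE
  // rounding, (a + b) + c need not equal a + (b + c). Hence the FMul/FAdd
  // cases below are accepted only when the instruction is marked both
  // commutative and associative (i.e. fast-math).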
  case Instruction::FMul:
    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
  case Instruction::FAdd:
    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    if (Kind != RK_IntegerMinMax &&
        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return isMinMaxSelectCmpPattern(I, Prev);
  }
}

LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return IK_NoInduction;

  // Check that the PHI is consecutive.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a polynomial recurrence.\n");
    return IK_NoInduction;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy()) {
    if (Step->isOne())
      return IK_IntInduction;
    if (Step->isAllOnesValue())
      return IK_ReverseIntInduction;
    return IK_NoInduction;
  }

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return IK_NoInduction;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  if (C->getValue()->equalsInt(Size))
    return IK_PtrInduction;
  if (C->getValue()->equalsInt(0 - Size))
    return IK_ReversePtrInduction;

  return IK_NoInduction;
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  Value *In0 = const_cast<Value*>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We don't predicate loads/stores at the moment.
    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
      return false;

    // The instructions below can trap.
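    // (Illustrative) A UDiv/SDiv guarded by something like 'if (d != 0)'
    // would fault for the lanes where d == 0 if the block were if-converted
    // and the division executed unconditionally.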
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PtrScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                      unsigned UserVF) {
  // Width 1 means no vectorization.
  VectorizationFactor Factor = { 1U, 0U };
  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
    return Factor;
  }

  // Find the trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
  DEBUG(dbgs() << "LV: Found trip count: " << TC << "\n");

  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The widest register is: " << WidestRegister
        << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
         " into one vector!");

  unsigned VF = MaxVectorSize;

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to
    // vectorize.
    if (TC < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;

    // If the trip count that we found modulo the vectorization factor is not
    // zero then we require a tail.
    if (VF < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }
  }

  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}
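// Worked example for the cost comparison above (illustrative numbers only):
// if expectedCost(1) == 8, expectedCost(2) == 10 and expectedCost(4) == 12,
// the per-lane costs are 8, 5 and 3 respectively, so the loop selects VF = 4.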
unsigned LoopVectorizationCostModel::getWidestType() {
  unsigned MaxWidth = 8;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    BasicBlock *BB = *bb;

    // For each instruction in the loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      Type *T = it->getType();

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
        continue;

      // Only examine PHI nodes that are reduction variables.
      if (PHINode *PN = dyn_cast<PHINode>(it))
        if (!Legal->getReductionVars()->count(PN))
          continue;

      // Examine the stored values.
      if (StoreInst *ST = dyn_cast<StoreInst>(it))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // consecutive. However, we do want to take consecutive stores/loads of
      // pointer vectors into account.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
        continue;

      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
    }
  }

  return MaxWidth;
}

unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                               unsigned UserUF,
                                               unsigned VF,
                                               unsigned LoopCost) {
  // -- The unroll heuristics --
  // We unroll the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the unroll factor:
  // 1. If the code has reductions then we unroll in order to break the cross
  //    iteration dependency.
  // 2. If the loop is really small then we unroll in order to reduce the loop
  //    overhead.
  // 3. We don't unroll if we think that we will spill registers to memory due
  //    to the increased register pressure.

  // Use the user preference, unless 'auto' is selected.
  if (UserUF != 0)
    return UserUF;

  // When we optimize for size we don't unroll.
  if (OptForSize)
    return 1;

  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                              TheLoop->getLoopLatch());
  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
    return 1;

  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
        " vector registers\n");

  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these values below, so make sure they are at least one to
  // avoid division by zero.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the unroll factor using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the unrolled instances.
  // Next, divide the remaining registers by the number of registers that are
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills.
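  //
  // Worked example (illustrative numbers): with 16 vector registers,
  // 4 loop-invariant values, and at most 3 values live at once inside the
  // loop, UF = (16 - 4) / 3 = 4 unrolled instances are estimated to fit
  // without spilling.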
  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;

  // Clamp the unroll factor ranges to reasonable factors.
  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated UF to be between one and the maximum unroll factor
  // that the target allows.
  if (UF > MaxUnrollSize)
    UF = MaxUnrollSize;
  else if (UF < 1)
    UF = 1;

  if (Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
    return UF;
  }

  // We want to unroll tiny loops in order to reduce the loop overhead.
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and unroll until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << "\n");
  if (LoopCost < 20) {
    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
    unsigned NewUF = 20 / LoopCost + 1;
    return std::min(NewUF, UF);
  }

  DEBUG(dbgs() << "LV: Not Unrolling.\n");
  return 1;
}

LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are
  // encountered before their users. We assume that each instruction that has
  // in-loop users starts an interval. We record every time that an in-loop
  // value is used, so we have a list of the first and last occurrences of
  // each instruction. Next, we transpose this data structure into a multi map
  // that holds the list of intervals that *end* at a specific location. This
  // multi map allows us to process all of the interval end points in a single
  // linear scan: we walk the instructions linearly and record each time that
  // a new interval starts, by placing it in a set. If we find this value in
  // the multi-map then we remove it from the set. The max register usage is
  // the maximum size of the set. We also search for instructions that are
  // defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
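  //
  // Worked example (illustrative): for the in-loop sequence
  //   %a = ...                            ; index 0
  //   %b = ...                            ; index 1
  //   %c = ...                            ; index 2
  //   %d = select i1 %c, i32 %a, i32 %b   ; index 3
  // the intervals of %a and %b are both open across the definition of %c,
  // so MaxLocalUsers would be 2. If %a were instead defined outside the
  // loop, it would be counted in LoopInvariantRegs.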
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an index to the instruction at that index.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value*, 8> LoopInvariants;

  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
        Value *U = I->getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned j = 0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
          OpenIntervals.size() << "\n");

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << "\n");
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << "\n");
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << "\n");

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}

unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  unsigned Cost = 0;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    unsigned BlockCost = 0;
    BasicBlock *BB = *bb;

    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        continue;

      // Accumulate into BlockCost (not Cost) so that the if-conversion
      // adjustment below applies to this block's instructions.
      unsigned C = getInstructionCost(it, VF);
      BlockCost += C;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
            VF << " for instruction: " << *it << "\n");
    }

    // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar then some of the blocks are avoided due to CF.
    // When the code is vectorized we execute all code paths.
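    // (Illustrative) In the scalar loop, a predicated block with an estimated
    // cost of 10 therefore contributes only 5; at VF > 1 the if-converted
    // code always executes, so no discount applies.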
    if (Legal->blockNeedsPredication(*bb) && VF == 1)
      BlockCost /= 2;

    Cost += BlockCost;
  }

  return Cost;
}

unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br:
    return TTI.getCFInstrCost(I->getOpcode());
  case Instruction::PHI:
    // TODO: IF-converted IFs become selects.
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;

    if (isa<ConstantInt>(I->getOperand(1)))
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    StoreInst *SI = dyn_cast<StoreInst>(I);
    LoadInst *LI = dyn_cast<LoadInst>(I);
    Type *ValTy = (SI ? SI->getValueOperand()->getType() :
                        LI->getType());
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    unsigned AS = SI ? SI->getPointerAddressSpace() :
                       LI->getPointerAddressSpace();
    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
    // We add the cost of address computation here instead of with the gep
    // instruction because only here do we know whether the operation is
    // scalarized.
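    // (Illustrative) A unit-stride access such as A[i] is costed below as a
    // single wide load/store; an access with no consecutive stride, e.g.
    // A[B[i]], falls into the scalarization path and pays for VF scalar
    // memory operations plus the vector insert/extract traffic.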
    if (VF == 1)
      return TTI.getAddressComputationCost(VectorTy) +
             TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    // Scalarized loads/stores.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool Reverse = ConsecutiveStride < 0;
    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy) / VF;
    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
      unsigned Cost = 0;
      // The cost of extracting from the value vector and pointer vector.
      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
      for (unsigned i = 0; i < VF; ++i) {
        // The cost of extracting the pointer operand.
        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
        // In case of STORE, the cost of ExtractElement from the vector.
        // In case of LOAD, the cost of InsertElement into the returned
        // vector.
        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
                                            Instruction::InsertElement,
                                       VectorTy, i);
      }

      // The cost of the scalar loads/stores.
      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                       Alignment, AS);
      return Cost;
    }

    // Wide loads/stores.
    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (Reverse)
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                 VectorTy, 0);
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables.
    // The cost of these is the same as the scalar operation.
    if (I->getOpcode() == Instruction::Trunc &&
        Legal->isInductionVariable(I->getOperand(0)))
      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                  I->getOperand(0)->getType());

    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(I);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    assert(ID && "Not an intrinsic call!");
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type*, 4> Tys;
    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    if (!RetTy->isVoidTy() && VF != 1) {
      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
                                                VectorTy);
      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                                VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
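      // (Illustrative) With VF = 4, two operands, and InsCost = ExtCost = 1,
      // the insert/extract overhead below is 4 * (1 + 1 * 2) = 12.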
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
    }

    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
    return Cost;
  }
  } // end of switch.
}

Type *LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check for a store.
  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
    return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;

  // Check for a load.
  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
    return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;

  return false;
}