//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
46 // 47 //===----------------------------------------------------------------------===// 48 49 #include "llvm/Transforms/Vectorize.h" 50 #include "llvm/ADT/DenseMap.h" 51 #include "llvm/ADT/Hashing.h" 52 #include "llvm/ADT/MapVector.h" 53 #include "llvm/ADT/SetVector.h" 54 #include "llvm/ADT/SmallPtrSet.h" 55 #include "llvm/ADT/SmallSet.h" 56 #include "llvm/ADT/SmallVector.h" 57 #include "llvm/ADT/Statistic.h" 58 #include "llvm/ADT/StringExtras.h" 59 #include "llvm/Analysis/AliasAnalysis.h" 60 #include "llvm/Analysis/BasicAliasAnalysis.h" 61 #include "llvm/Analysis/AliasSetTracker.h" 62 #include "llvm/Analysis/AssumptionCache.h" 63 #include "llvm/Analysis/BlockFrequencyInfo.h" 64 #include "llvm/Analysis/CodeMetrics.h" 65 #include "llvm/Analysis/DemandedBits.h" 66 #include "llvm/Analysis/GlobalsModRef.h" 67 #include "llvm/Analysis/LoopAccessAnalysis.h" 68 #include "llvm/Analysis/LoopInfo.h" 69 #include "llvm/Analysis/LoopIterator.h" 70 #include "llvm/Analysis/LoopPass.h" 71 #include "llvm/Analysis/ScalarEvolution.h" 72 #include "llvm/Analysis/ScalarEvolutionExpander.h" 73 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 74 #include "llvm/Analysis/TargetTransformInfo.h" 75 #include "llvm/Analysis/ValueTracking.h" 76 #include "llvm/IR/Constants.h" 77 #include "llvm/IR/DataLayout.h" 78 #include "llvm/IR/DebugInfo.h" 79 #include "llvm/IR/DerivedTypes.h" 80 #include "llvm/IR/DiagnosticInfo.h" 81 #include "llvm/IR/Dominators.h" 82 #include "llvm/IR/Function.h" 83 #include "llvm/IR/IRBuilder.h" 84 #include "llvm/IR/Instructions.h" 85 #include "llvm/IR/IntrinsicInst.h" 86 #include "llvm/IR/LLVMContext.h" 87 #include "llvm/IR/Module.h" 88 #include "llvm/IR/PatternMatch.h" 89 #include "llvm/IR/Type.h" 90 #include "llvm/IR/Value.h" 91 #include "llvm/IR/ValueHandle.h" 92 #include "llvm/IR/Verifier.h" 93 #include "llvm/Pass.h" 94 #include "llvm/Support/BranchProbability.h" 95 #include "llvm/Support/CommandLine.h" 96 #include "llvm/Support/Debug.h" 97 #include "llvm/Support/raw_ostream.h" 98 #include "llvm/Transforms/Scalar.h" 99 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 100 #include "llvm/Transforms/Utils/Local.h" 101 #include "llvm/Analysis/VectorUtils.h" 102 #include "llvm/Transforms/Utils/LoopUtils.h" 103 #include <algorithm> 104 #include <functional> 105 #include <map> 106 #include <tuple> 107 108 using namespace llvm; 109 using namespace llvm::PatternMatch; 110 111 #define LV_NAME "loop-vectorize" 112 #define DEBUG_TYPE LV_NAME 113 114 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 115 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 116 117 static cl::opt<bool> 118 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 119 cl::desc("Enable if-conversion during vectorization.")); 120 121 /// We don't vectorize loops with a known constant trip count below this number. 122 static cl::opt<unsigned> 123 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), 124 cl::Hidden, 125 cl::desc("Don't vectorize loops with a constant " 126 "trip count that is smaller than this " 127 "value.")); 128 129 static cl::opt<bool> MaximizeBandwidth( 130 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 131 cl::desc("Maximize bandwidth when selecting vectorization factor which " 132 "will be determined by the smallest type in loop.")); 133 134 /// This enables versioning on the strides of symbolically striding memory 135 /// accesses in code like the following. 
136 /// for (i = 0; i < N; ++i) 137 /// A[i * Stride1] += B[i * Stride2] ... 138 /// 139 /// Will be roughly translated to 140 /// if (Stride1 == 1 && Stride2 == 1) { 141 /// for (i = 0; i < N; i+=4) 142 /// A[i:i+3] += ... 143 /// } else 144 /// ... 145 static cl::opt<bool> EnableMemAccessVersioning( 146 "enable-mem-access-versioning", cl::init(true), cl::Hidden, 147 cl::desc("Enable symbolic stride memory access versioning")); 148 149 static cl::opt<bool> EnableInterleavedMemAccesses( 150 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 151 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 152 153 /// Maximum factor for an interleaved memory access. 154 static cl::opt<unsigned> MaxInterleaveGroupFactor( 155 "max-interleave-group-factor", cl::Hidden, 156 cl::desc("Maximum factor for an interleaved access group (default = 8)"), 157 cl::init(8)); 158 159 /// We don't interleave loops with a known constant trip count below this 160 /// number. 161 static const unsigned TinyTripCountInterleaveThreshold = 128; 162 163 static cl::opt<unsigned> ForceTargetNumScalarRegs( 164 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 165 cl::desc("A flag that overrides the target's number of scalar registers.")); 166 167 static cl::opt<unsigned> ForceTargetNumVectorRegs( 168 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 169 cl::desc("A flag that overrides the target's number of vector registers.")); 170 171 /// Maximum vectorization interleave count. 172 static const unsigned MaxInterleaveFactor = 16; 173 174 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 175 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 176 cl::desc("A flag that overrides the target's max interleave factor for " 177 "scalar loops.")); 178 179 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 180 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 181 cl::desc("A flag that overrides the target's max interleave factor for " 182 "vectorized loops.")); 183 184 static cl::opt<unsigned> ForceTargetInstructionCost( 185 "force-target-instruction-cost", cl::init(0), cl::Hidden, 186 cl::desc("A flag that overrides the target's expected cost for " 187 "an instruction to a single constant value. Mostly " 188 "useful for getting consistent testing.")); 189 190 static cl::opt<unsigned> SmallLoopCost( 191 "small-loop-cost", cl::init(20), cl::Hidden, 192 cl::desc( 193 "The cost of a loop that is considered 'small' by the interleaver.")); 194 195 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 196 "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, 197 cl::desc("Enable the use of the block frequency analysis to access PGO " 198 "heuristics minimizing code growth in cold regions and being more " 199 "aggressive in hot regions.")); 200 201 // Runtime interleave loops for load/store throughput. 202 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 203 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 204 cl::desc( 205 "Enable runtime interleaving until load/store ports are saturated")); 206 207 /// The number of stores in a loop that are allowed to need predication. 
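/// For example (illustrative), the store below executes only under the guard
/// and therefore has to be predicated (or scalarized behind an if) when the
/// loop is vectorized:
///   for (i = 0; i < N; ++i)
///     if (Cond[i])
///       A[i] = B[i];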
208 static cl::opt<unsigned> NumberOfStoresToPredicate( 209 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 210 cl::desc("Max number of stores to be predicated behind an if.")); 211 212 static cl::opt<bool> EnableIndVarRegisterHeur( 213 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 214 cl::desc("Count the induction variable only once when interleaving")); 215 216 static cl::opt<bool> EnableCondStoresVectorization( 217 "enable-cond-stores-vec", cl::init(false), cl::Hidden, 218 cl::desc("Enable if predication of stores during vectorization.")); 219 220 static cl::opt<unsigned> MaxNestedScalarReductionIC( 221 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 222 cl::desc("The maximum interleave count to use when interleaving a scalar " 223 "reduction in a nested loop.")); 224 225 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 226 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 227 cl::desc("The maximum allowed number of runtime memory checks with a " 228 "vectorize(enable) pragma.")); 229 230 static cl::opt<unsigned> VectorizeSCEVCheckThreshold( 231 "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, 232 cl::desc("The maximum number of SCEV checks allowed.")); 233 234 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( 235 "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, 236 cl::desc("The maximum number of SCEV checks allowed with a " 237 "vectorize(enable) pragma")); 238 239 namespace { 240 241 // Forward declarations. 242 class LoopVectorizeHints; 243 class LoopVectorizationLegality; 244 class LoopVectorizationCostModel; 245 class LoopVectorizationRequirements; 246 247 /// \brief This modifies LoopAccessReport to initialize message with 248 /// loop-vectorizer-specific part. 249 class VectorizationReport : public LoopAccessReport { 250 public: 251 VectorizationReport(Instruction *I = nullptr) 252 : LoopAccessReport("loop not vectorized: ", I) {} 253 254 /// \brief This allows promotion of the loop-access analysis report into the 255 /// loop-vectorizer report. It modifies the message to add the 256 /// loop-vectorizer-specific part of the message. 257 explicit VectorizationReport(const LoopAccessReport &R) 258 : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), 259 R.getInstr()) {} 260 }; 261 262 /// A helper function for converting Scalar types to vector types. 263 /// If the incoming type is void, we return void. If the VF is 1, we return 264 /// the scalar type. 265 static Type* ToVectorTy(Type *Scalar, unsigned VF) { 266 if (Scalar->isVoidTy() || VF == 1) 267 return Scalar; 268 return VectorType::get(Scalar, VF); 269 } 270 271 /// A helper function that returns GEP instruction and knows to skip a 272 /// 'bitcast'. The 'bitcast' may be skipped if the source and the destination 273 /// pointee types of the 'bitcast' have the same size. 
274 /// For example: 275 /// bitcast double** %var to i64* - can be skipped 276 /// bitcast double** %var to i8* - can not 277 static GetElementPtrInst *getGEPInstruction(Value *Ptr) { 278 279 if (isa<GetElementPtrInst>(Ptr)) 280 return cast<GetElementPtrInst>(Ptr); 281 282 if (isa<BitCastInst>(Ptr) && 283 isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { 284 Type *BitcastTy = Ptr->getType(); 285 Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); 286 if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) 287 return nullptr; 288 Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); 289 Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); 290 const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); 291 if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) 292 return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); 293 } 294 return nullptr; 295 } 296 297 /// InnerLoopVectorizer vectorizes loops which contain only one basic 298 /// block to a specified vectorization factor (VF). 299 /// This class performs the widening of scalars into vectors, or multiple 300 /// scalars. This class also implements the following features: 301 /// * It inserts an epilogue loop for handling loops that don't have iteration 302 /// counts that are known to be a multiple of the vectorization factor. 303 /// * It handles the code generation for reduction variables. 304 /// * Scalarization (implementation using scalars) of un-vectorizable 305 /// instructions. 306 /// InnerLoopVectorizer does not perform any vectorization-legality 307 /// checks, and relies on the caller to check for the different legality 308 /// aspects. The InnerLoopVectorizer relies on the 309 /// LoopVectorizationLegality class to provide information about the induction 310 /// and reduction variables that were found to a given vectorization factor. 311 class InnerLoopVectorizer { 312 public: 313 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 314 LoopInfo *LI, DominatorTree *DT, 315 const TargetLibraryInfo *TLI, 316 const TargetTransformInfo *TTI, unsigned VecWidth, 317 unsigned UnrollFactor) 318 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 319 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), 320 Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), 321 TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), 322 AddedSafetyChecks(false) {} 323 324 // Perform the actual loop widening (vectorization). 325 // MinimumBitWidths maps scalar integer values to the smallest bitwidth they 326 // can be validly truncated to. The cost model has assumed this truncation 327 // will happen when vectorizing. 328 void vectorize(LoopVectorizationLegality *L, 329 MapVector<Instruction*,uint64_t> MinimumBitWidths) { 330 MinBWs = MinimumBitWidths; 331 Legal = L; 332 // Create a new empty loop. Unlink the old loop and connect the new one. 333 createEmptyLoop(); 334 // Widen each instruction in the old loop to a new one in the new loop. 335 // Use the Legality module to find the induction and reduction variables. 336 vectorizeLoop(); 337 } 338 339 // Return true if any runtime check is added. 340 bool IsSafetyChecksAdded() { 341 return AddedSafetyChecks; 342 } 343 344 virtual ~InnerLoopVectorizer() {} 345 346 protected: 347 /// A small list of PHINodes. 348 typedef SmallVector<PHINode*, 4> PhiVector; 349 /// When we unroll loops we have multiple vector values for each scalar. 
350 /// This data structure holds the unrolled and vectorized values that 351 /// originated from one scalar instruction. 352 typedef SmallVector<Value*, 2> VectorParts; 353 354 // When we if-convert we need to create edge masks. We have to cache values 355 // so that we don't end up with exponential recursion/IR. 356 typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, 357 VectorParts> EdgeMaskCache; 358 359 /// Create an empty loop, based on the loop ranges of the old loop. 360 void createEmptyLoop(); 361 /// Create a new induction variable inside L. 362 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 363 Value *Step, Instruction *DL); 364 /// Copy and widen the instructions from the old loop. 365 virtual void vectorizeLoop(); 366 367 /// Fix a first-order recurrence. This is the second phase of vectorizing 368 /// this phi node. 369 void fixFirstOrderRecurrence(PHINode *Phi); 370 371 /// \brief The Loop exit block may have single value PHI nodes where the 372 /// incoming value is 'Undef'. While vectorizing we only handled real values 373 /// that were defined inside the loop. Here we fix the 'undef case'. 374 /// See PR14725. 375 void fixLCSSAPHIs(); 376 377 /// Shrinks vector element sizes based on information in "MinBWs". 378 void truncateToMinimalBitwidths(); 379 380 /// A helper function that computes the predicate of the block BB, assuming 381 /// that the header block of the loop is set to True. It returns the *entry* 382 /// mask for the block BB. 383 VectorParts createBlockInMask(BasicBlock *BB); 384 /// A helper function that computes the predicate of the edge between SRC 385 /// and DST. 386 VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 387 388 /// A helper function to vectorize a single BB within the innermost loop. 389 void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); 390 391 /// Vectorize a single PHINode in a block. This method handles the induction 392 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 393 /// arbitrary length vectors. 394 void widenPHIInstruction(Instruction *PN, VectorParts &Entry, 395 unsigned UF, unsigned VF, PhiVector *PV); 396 397 /// Insert the new loop to the loop hierarchy and pass manager 398 /// and update the analysis passes. 399 void updateAnalysis(); 400 401 /// This instruction is un-vectorizable. Implement it as a sequence 402 /// of scalars. If \p IfPredicateStore is true we need to 'hide' each 403 /// scalarized instruction behind an if block predicated on the control 404 /// dependence of the instruction. 405 virtual void scalarizeInstruction(Instruction *Instr, 406 bool IfPredicateStore=false); 407 408 /// Vectorize Load and Store instructions, 409 virtual void vectorizeMemoryInstruction(Instruction *Instr); 410 411 /// Create a broadcast instruction. This method generates a broadcast 412 /// instruction (shuffle) for loop invariant values and for the induction 413 /// value. If this is the induction variable then we extend it to N, N+1, ... 414 /// this is needed because each iteration in the loop corresponds to a SIMD 415 /// element. 416 virtual Value *getBroadcastInstrs(Value *V); 417 418 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 419 /// to each vector element of Val. The sequence starts at StartIndex. 420 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); 421 422 /// When we go over instructions in the basic block we rely on previous 423 /// values within the current basic block or on loop invariant values. 
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Try to vectorize the interleaved access group that \p Instr belongs to.
  void vectorizeInterleaveGroup(Instruction *Instr);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Emit a bypass check to see if the trip count would overflow, or we
  /// wouldn't have enough iterations to execute one vector loop.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
  /// Emit a bypass check to see if the vector trip count is nonzero.
  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorParts type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    /// \return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
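  /// For example (illustrative), with VF = 4 an i32 add in the scalar loop is
  /// widened into a single <4 x i32> add in the vector loop.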
511 unsigned VF; 512 513 protected: 514 /// The vectorization unroll factor to use. Each scalar is vectorized to this 515 /// many different vector instructions. 516 unsigned UF; 517 518 /// The builder that we use 519 IRBuilder<> Builder; 520 521 // --- Vectorization state --- 522 523 /// The vector-loop preheader. 524 BasicBlock *LoopVectorPreHeader; 525 /// The scalar-loop preheader. 526 BasicBlock *LoopScalarPreHeader; 527 /// Middle Block between the vector and the scalar. 528 BasicBlock *LoopMiddleBlock; 529 ///The ExitBlock of the scalar loop. 530 BasicBlock *LoopExitBlock; 531 ///The vector loop body. 532 SmallVector<BasicBlock *, 4> LoopVectorBody; 533 ///The scalar loop body. 534 BasicBlock *LoopScalarBody; 535 /// A list of all bypass blocks. The first block is the entry of the loop. 536 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 537 538 /// The new Induction variable which was added to the new block. 539 PHINode *Induction; 540 /// The induction variable of the old basic block. 541 PHINode *OldInduction; 542 /// Maps scalars to widened vectors. 543 ValueMap WidenMap; 544 /// Store instructions that should be predicated, as a pair 545 /// <StoreInst, Predicate> 546 SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores; 547 EdgeMaskCache MaskCache; 548 /// Trip count of the original loop. 549 Value *TripCount; 550 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 551 Value *VectorTripCount; 552 553 /// Map of scalar integer values to the smallest bitwidth they can be legally 554 /// represented as. The vector equivalents of these values should be truncated 555 /// to this type. 556 MapVector<Instruction*,uint64_t> MinBWs; 557 LoopVectorizationLegality *Legal; 558 559 // Record whether runtime check is added. 560 bool AddedSafetyChecks; 561 }; 562 563 class InnerLoopUnroller : public InnerLoopVectorizer { 564 public: 565 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 566 LoopInfo *LI, DominatorTree *DT, 567 const TargetLibraryInfo *TLI, 568 const TargetTransformInfo *TTI, unsigned UnrollFactor) 569 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {} 570 571 private: 572 void scalarizeInstruction(Instruction *Instr, 573 bool IfPredicateStore = false) override; 574 void vectorizeMemoryInstruction(Instruction *Instr) override; 575 Value *getBroadcastInstrs(Value *V) override; 576 Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; 577 Value *reverseVector(Value *Vec) override; 578 }; 579 580 /// \brief Look for a meaningful debug location on the instruction or it's 581 /// operands. 582 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 583 if (!I) 584 return I; 585 586 DebugLoc Empty; 587 if (I->getDebugLoc() != Empty) 588 return I; 589 590 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 591 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 592 if (OpInst->getDebugLoc() != Empty) 593 return OpInst; 594 } 595 596 return I; 597 } 598 599 /// \brief Set the debug location in the builder using the debug location in the 600 /// instruction. 601 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 602 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) 603 B.SetCurrentDebugLocation(Inst->getDebugLoc()); 604 else 605 B.SetCurrentDebugLocation(DebugLoc()); 606 } 607 608 #ifndef NDEBUG 609 /// \return string containing a file name and a line # for the given loop. 
610 static std::string getDebugLocString(const Loop *L) { 611 std::string Result; 612 if (L) { 613 raw_string_ostream OS(Result); 614 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 615 LoopDbgLoc.print(OS); 616 else 617 // Just print the module name. 618 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 619 OS.flush(); 620 } 621 return Result; 622 } 623 #endif 624 625 /// \brief Propagate known metadata from one instruction to another. 626 static void propagateMetadata(Instruction *To, const Instruction *From) { 627 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 628 From->getAllMetadataOtherThanDebugLoc(Metadata); 629 630 for (auto M : Metadata) { 631 unsigned Kind = M.first; 632 633 // These are safe to transfer (this is safe for TBAA, even when we 634 // if-convert, because should that metadata have had a control dependency 635 // on the condition, and thus actually aliased with some other 636 // non-speculated memory access when the condition was false, this would be 637 // caught by the runtime overlap checks). 638 if (Kind != LLVMContext::MD_tbaa && 639 Kind != LLVMContext::MD_alias_scope && 640 Kind != LLVMContext::MD_noalias && 641 Kind != LLVMContext::MD_fpmath && 642 Kind != LLVMContext::MD_nontemporal) 643 continue; 644 645 To->setMetadata(Kind, M.second); 646 } 647 } 648 649 /// \brief Propagate known metadata from one instruction to a vector of others. 650 static void propagateMetadata(SmallVectorImpl<Value *> &To, 651 const Instruction *From) { 652 for (Value *V : To) 653 if (Instruction *I = dyn_cast<Instruction>(V)) 654 propagateMetadata(I, From); 655 } 656 657 /// \brief The group of interleaved loads/stores sharing the same stride and 658 /// close to each other. 659 /// 660 /// Each member in this group has an index starting from 0, and the largest 661 /// index should be less than interleaved factor, which is equal to the absolute 662 /// value of the access's stride. 663 /// 664 /// E.g. An interleaved load group of factor 4: 665 /// for (unsigned i = 0; i < 1024; i+=4) { 666 /// a = A[i]; // Member of index 0 667 /// b = A[i+1]; // Member of index 1 668 /// d = A[i+3]; // Member of index 3 669 /// ... 670 /// } 671 /// 672 /// An interleaved store group of factor 4: 673 /// for (unsigned i = 0; i < 1024; i+=4) { 674 /// ... 675 /// A[i] = a; // Member of index 0 676 /// A[i+1] = b; // Member of index 1 677 /// A[i+2] = c; // Member of index 2 678 /// A[i+3] = d; // Member of index 3 679 /// } 680 /// 681 /// Note: the interleaved load group could have gaps (missing members), but 682 /// the interleaved store group doesn't allow gaps. 683 class InterleaveGroup { 684 public: 685 InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) 686 : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { 687 assert(Align && "The alignment should be non-zero"); 688 689 Factor = std::abs(Stride); 690 assert(Factor > 1 && "Invalid interleave factor"); 691 692 Reverse = Stride < 0; 693 Members[0] = Instr; 694 } 695 696 bool isReverse() const { return Reverse; } 697 unsigned getFactor() const { return Factor; } 698 unsigned getAlignment() const { return Align; } 699 unsigned getNumMembers() const { return Members.size(); } 700 701 /// \brief Try to insert a new member \p Instr with index \p Index and 702 /// alignment \p NewAlign. The index is related to the leader and it could be 703 /// negative if it is the new leader. 704 /// 705 /// \returns false if the instruction doesn't belong to the group. 
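  ///
  /// For example (illustrative), in a group of factor 2 whose current leader
  /// is a load of A[i], a load of A[i-1] can be inserted with Index -1; it
  /// then becomes the new leader and SmallestKey is updated accordingly.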
706 bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { 707 assert(NewAlign && "The new member's alignment should be non-zero"); 708 709 int Key = Index + SmallestKey; 710 711 // Skip if there is already a member with the same index. 712 if (Members.count(Key)) 713 return false; 714 715 if (Key > LargestKey) { 716 // The largest index is always less than the interleave factor. 717 if (Index >= static_cast<int>(Factor)) 718 return false; 719 720 LargestKey = Key; 721 } else if (Key < SmallestKey) { 722 // The largest index is always less than the interleave factor. 723 if (LargestKey - Key >= static_cast<int>(Factor)) 724 return false; 725 726 SmallestKey = Key; 727 } 728 729 // It's always safe to select the minimum alignment. 730 Align = std::min(Align, NewAlign); 731 Members[Key] = Instr; 732 return true; 733 } 734 735 /// \brief Get the member with the given index \p Index 736 /// 737 /// \returns nullptr if contains no such member. 738 Instruction *getMember(unsigned Index) const { 739 int Key = SmallestKey + Index; 740 if (!Members.count(Key)) 741 return nullptr; 742 743 return Members.find(Key)->second; 744 } 745 746 /// \brief Get the index for the given member. Unlike the key in the member 747 /// map, the index starts from 0. 748 unsigned getIndex(Instruction *Instr) const { 749 for (auto I : Members) 750 if (I.second == Instr) 751 return I.first - SmallestKey; 752 753 llvm_unreachable("InterleaveGroup contains no such member"); 754 } 755 756 Instruction *getInsertPos() const { return InsertPos; } 757 void setInsertPos(Instruction *Inst) { InsertPos = Inst; } 758 759 private: 760 unsigned Factor; // Interleave Factor. 761 bool Reverse; 762 unsigned Align; 763 DenseMap<int, Instruction *> Members; 764 int SmallestKey; 765 int LargestKey; 766 767 // To avoid breaking dependences, vectorized instructions of an interleave 768 // group should be inserted at either the first load or the last store in 769 // program order. 770 // 771 // E.g. %even = load i32 // Insert Position 772 // %add = add i32 %even // Use of %even 773 // %odd = load i32 774 // 775 // store i32 %even 776 // %odd = add i32 // Def of %odd 777 // store i32 %odd // Insert Position 778 Instruction *InsertPos; 779 }; 780 781 /// \brief Drive the analysis of interleaved memory accesses in the loop. 782 /// 783 /// Use this class to analyze interleaved accesses only when we can vectorize 784 /// a loop. Otherwise it's meaningless to do analysis as the vectorization 785 /// on interleaved accesses is unsafe. 786 /// 787 /// The analysis collects interleave groups and records the relationships 788 /// between the member and the group in a map. 789 class InterleavedAccessInfo { 790 public: 791 InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, 792 DominatorTree *DT) 793 : PSE(PSE), TheLoop(L), DT(DT) {} 794 795 ~InterleavedAccessInfo() { 796 SmallSet<InterleaveGroup *, 4> DelSet; 797 // Avoid releasing a pointer twice. 798 for (auto &I : InterleaveGroupMap) 799 DelSet.insert(I.second); 800 for (auto *Ptr : DelSet) 801 delete Ptr; 802 } 803 804 /// \brief Analyze the interleaved accesses and collect them in interleave 805 /// groups. Substitute symbolic strides using \p Strides. 806 void analyzeInterleaving(const ValueToValueMap &Strides); 807 808 /// \brief Check if \p Instr belongs to any interleave group. 809 bool isInterleaved(Instruction *Instr) const { 810 return InterleaveGroupMap.count(Instr); 811 } 812 813 /// \brief Get the interleave group that \p Instr belongs to. 
814 /// 815 /// \returns nullptr if doesn't have such group. 816 InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { 817 if (InterleaveGroupMap.count(Instr)) 818 return InterleaveGroupMap.find(Instr)->second; 819 return nullptr; 820 } 821 822 private: 823 /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. 824 /// Simplifies SCEV expressions in the context of existing SCEV assumptions. 825 /// The interleaved access analysis can also add new predicates (for example 826 /// by versioning strides of pointers). 827 PredicatedScalarEvolution &PSE; 828 Loop *TheLoop; 829 DominatorTree *DT; 830 831 /// Holds the relationships between the members and the interleave group. 832 DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; 833 834 /// \brief The descriptor for a strided memory access. 835 struct StrideDescriptor { 836 StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, 837 unsigned Align) 838 : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} 839 840 StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} 841 842 int Stride; // The access's stride. It is negative for a reverse access. 843 const SCEV *Scev; // The scalar expression of this access 844 unsigned Size; // The size of the memory object. 845 unsigned Align; // The alignment of this access. 846 }; 847 848 /// \brief Create a new interleave group with the given instruction \p Instr, 849 /// stride \p Stride and alignment \p Align. 850 /// 851 /// \returns the newly created interleave group. 852 InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, 853 unsigned Align) { 854 assert(!InterleaveGroupMap.count(Instr) && 855 "Already in an interleaved access group"); 856 InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); 857 return InterleaveGroupMap[Instr]; 858 } 859 860 /// \brief Release the group and remove all the relationships. 861 void releaseGroup(InterleaveGroup *Group) { 862 for (unsigned i = 0; i < Group->getFactor(); i++) 863 if (Instruction *Member = Group->getMember(i)) 864 InterleaveGroupMap.erase(Member); 865 866 delete Group; 867 } 868 869 /// \brief Collect all the accesses with a constant stride in program order. 870 void collectConstStridedAccesses( 871 MapVector<Instruction *, StrideDescriptor> &StrideAccesses, 872 const ValueToValueMap &Strides); 873 }; 874 875 /// Utility class for getting and setting loop vectorizer hints in the form 876 /// of loop metadata. 877 /// This class keeps a number of loop annotations locally (as member variables) 878 /// and can, upon request, write them back as metadata on the loop. It will 879 /// initially scan the loop for existing metadata, and will update the local 880 /// values based on information in the loop. 881 /// We cannot write all values to metadata, as the mere presence of some info, 882 /// for example 'force', means a decision has been made. So, we need to be 883 /// careful NOT to add them if the user hasn't specifically asked so. 884 class LoopVectorizeHints { 885 enum HintKind { 886 HK_WIDTH, 887 HK_UNROLL, 888 HK_FORCE 889 }; 890 891 /// Hint - associates name and validation with the hint value. 892 struct Hint { 893 const char * Name; 894 unsigned Value; // This may have to change for non-numeric values. 
895 HintKind Kind; 896 897 Hint(const char * Name, unsigned Value, HintKind Kind) 898 : Name(Name), Value(Value), Kind(Kind) { } 899 900 bool validate(unsigned Val) { 901 switch (Kind) { 902 case HK_WIDTH: 903 return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; 904 case HK_UNROLL: 905 return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; 906 case HK_FORCE: 907 return (Val <= 1); 908 } 909 return false; 910 } 911 }; 912 913 /// Vectorization width. 914 Hint Width; 915 /// Vectorization interleave factor. 916 Hint Interleave; 917 /// Vectorization forced 918 Hint Force; 919 920 /// Return the loop metadata prefix. 921 static StringRef Prefix() { return "llvm.loop."; } 922 923 public: 924 enum ForceKind { 925 FK_Undefined = -1, ///< Not selected. 926 FK_Disabled = 0, ///< Forcing disabled. 927 FK_Enabled = 1, ///< Forcing enabled. 928 }; 929 930 LoopVectorizeHints(const Loop *L, bool DisableInterleaving) 931 : Width("vectorize.width", VectorizerParams::VectorizationFactor, 932 HK_WIDTH), 933 Interleave("interleave.count", DisableInterleaving, HK_UNROLL), 934 Force("vectorize.enable", FK_Undefined, HK_FORCE), 935 TheLoop(L) { 936 // Populate values with existing loop metadata. 937 getHintsFromMetadata(); 938 939 // force-vector-interleave overrides DisableInterleaving. 940 if (VectorizerParams::isInterleaveForced()) 941 Interleave.Value = VectorizerParams::VectorizationInterleave; 942 943 DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() 944 << "LV: Interleaving disabled by the pass manager\n"); 945 } 946 947 /// Mark the loop L as already vectorized by setting the width to 1. 948 void setAlreadyVectorized() { 949 Width.Value = Interleave.Value = 1; 950 Hint Hints[] = {Width, Interleave}; 951 writeHintsToMetadata(Hints); 952 } 953 954 bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { 955 if (getForce() == LoopVectorizeHints::FK_Disabled) { 956 DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); 957 emitOptimizationRemarkAnalysis(F->getContext(), 958 vectorizeAnalysisPassName(), *F, 959 L->getStartLoc(), emitRemark()); 960 return false; 961 } 962 963 if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { 964 DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); 965 emitOptimizationRemarkAnalysis(F->getContext(), 966 vectorizeAnalysisPassName(), *F, 967 L->getStartLoc(), emitRemark()); 968 return false; 969 } 970 971 if (getWidth() == 1 && getInterleave() == 1) { 972 // FIXME: Add a separate metadata to indicate when the loop has already 973 // been vectorized instead of setting width and count to 1. 974 DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); 975 // FIXME: Add interleave.disable metadata. This will allow 976 // vectorize.disable to be used without disabling the pass and errors 977 // to differentiate between disabled vectorization and a width of 1. 978 emitOptimizationRemarkAnalysis( 979 F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(), 980 "loop not vectorized: vectorization and interleaving are explicitly " 981 "disabled, or vectorize width and interleave count are both set to " 982 "1"); 983 return false; 984 } 985 986 return true; 987 } 988 989 /// Dumps all the hint information. 
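  /// For example (illustrative), for a loop annotated with
  /// "#pragma clang loop vectorize(enable)" this produces a message such as
  /// "loop not vectorized: use -Rpass-analysis=loop-vectorize for more info
  /// (Force=true)".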
  std::string emitRemark() const {
    VectorizationReport R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
  const char *vectorizeAnalysisPassName() const {
    // If hints are provided that don't disable vectorization use the
    // AlwaysPrint pass name to force the frontend to print the diagnostic.
    if (getWidth() == 1)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Disabled)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
      return LV_NAME;
    return DiagnosticInfo::AlwaysPrint;
  }

  bool allowReordering() const {
    // When loop hints that enable vectorization are provided, we allow the
    // vectorizer to change the order of operations that is given by the
    // scalar loop. This is not enabled by default because it can be unsafe
    // or inefficient. For example, reordering floating-point operations will
    // change the way round-off error accumulates in the loop.
    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
  }

private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either an MDString or an MDNode with the first
      // operand an MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the loop metadata prefix.
      StringRef Name = S->getString();
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks a string hint with one operand and sets the value if it is valid.
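  /// For example (illustrative), the loop metadata operand
  ///   !{!"llvm.loop.vectorize.width", i32 8}
  /// is stripped of the "llvm.loop." prefix, matched against the
  /// "vectorize.width" hint, and the value 8 is accepted because it is a
  /// power of two that does not exceed the maximum vector width.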
1072 void setHint(StringRef Name, Metadata *Arg) { 1073 if (!Name.startswith(Prefix())) 1074 return; 1075 Name = Name.substr(Prefix().size(), StringRef::npos); 1076 1077 const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); 1078 if (!C) return; 1079 unsigned Val = C->getZExtValue(); 1080 1081 Hint *Hints[] = {&Width, &Interleave, &Force}; 1082 for (auto H : Hints) { 1083 if (Name == H->Name) { 1084 if (H->validate(Val)) 1085 H->Value = Val; 1086 else 1087 DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); 1088 break; 1089 } 1090 } 1091 } 1092 1093 /// Create a new hint from name / value pair. 1094 MDNode *createHintMetadata(StringRef Name, unsigned V) const { 1095 LLVMContext &Context = TheLoop->getHeader()->getContext(); 1096 Metadata *MDs[] = {MDString::get(Context, Name), 1097 ConstantAsMetadata::get( 1098 ConstantInt::get(Type::getInt32Ty(Context), V))}; 1099 return MDNode::get(Context, MDs); 1100 } 1101 1102 /// Matches metadata with hint name. 1103 bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { 1104 MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); 1105 if (!Name) 1106 return false; 1107 1108 for (auto H : HintTypes) 1109 if (Name->getString().endswith(H.Name)) 1110 return true; 1111 return false; 1112 } 1113 1114 /// Sets current hints into loop metadata, keeping other values intact. 1115 void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { 1116 if (HintTypes.size() == 0) 1117 return; 1118 1119 // Reserve the first element to LoopID (see below). 1120 SmallVector<Metadata *, 4> MDs(1); 1121 // If the loop already has metadata, then ignore the existing operands. 1122 MDNode *LoopID = TheLoop->getLoopID(); 1123 if (LoopID) { 1124 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 1125 MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); 1126 // If node in update list, ignore old value. 1127 if (!matchesHintMetadataName(Node, HintTypes)) 1128 MDs.push_back(Node); 1129 } 1130 } 1131 1132 // Now, add the missing hints. 1133 for (auto H : HintTypes) 1134 MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); 1135 1136 // Replace current metadata node with new one. 1137 LLVMContext &Context = TheLoop->getHeader()->getContext(); 1138 MDNode *NewLoopID = MDNode::get(Context, MDs); 1139 // Set operand 0 to refer to the loop id itself. 1140 NewLoopID->replaceOperandWith(0, NewLoopID); 1141 1142 TheLoop->setLoopID(NewLoopID); 1143 } 1144 1145 /// The loop these hints belong to. 
1146 const Loop *TheLoop; 1147 }; 1148 1149 static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, 1150 const LoopVectorizeHints &Hints, 1151 const LoopAccessReport &Message) { 1152 const char *Name = Hints.vectorizeAnalysisPassName(); 1153 LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); 1154 } 1155 1156 static void emitMissedWarning(Function *F, Loop *L, 1157 const LoopVectorizeHints &LH) { 1158 emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), 1159 LH.emitRemark()); 1160 1161 if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { 1162 if (LH.getWidth() != 1) 1163 emitLoopVectorizeWarning( 1164 F->getContext(), *F, L->getStartLoc(), 1165 "failed explicitly specified loop vectorization"); 1166 else if (LH.getInterleave() != 1) 1167 emitLoopInterleaveWarning( 1168 F->getContext(), *F, L->getStartLoc(), 1169 "failed explicitly specified loop interleaving"); 1170 } 1171 } 1172 1173 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 1174 /// to what vectorization factor. 1175 /// This class does not look at the profitability of vectorization, only the 1176 /// legality. This class has two main kinds of checks: 1177 /// * Memory checks - The code in canVectorizeMemory checks if vectorization 1178 /// will change the order of memory accesses in a way that will change the 1179 /// correctness of the program. 1180 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory 1181 /// checks for a number of different conditions, such as the availability of a 1182 /// single induction variable, that all types are supported and vectorize-able, 1183 /// etc. This code reflects the capabilities of InnerLoopVectorizer. 1184 /// This class is also used by InnerLoopVectorizer for identifying 1185 /// induction variable and the different reduction variables. 1186 class LoopVectorizationLegality { 1187 public: 1188 LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, 1189 DominatorTree *DT, TargetLibraryInfo *TLI, 1190 AliasAnalysis *AA, Function *F, 1191 const TargetTransformInfo *TTI, 1192 LoopAccessAnalysis *LAA, 1193 LoopVectorizationRequirements *R, 1194 const LoopVectorizeHints *H) 1195 : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), 1196 TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), 1197 Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), 1198 Requirements(R), Hints(H) {} 1199 1200 /// ReductionList contains the reduction descriptors for all 1201 /// of the reductions that were found in the loop. 1202 typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList; 1203 1204 /// InductionList saves induction variables and maps them to the 1205 /// induction descriptor. 1206 typedef MapVector<PHINode*, InductionDescriptor> InductionList; 1207 1208 /// RecurrenceSet contains the phi nodes that are recurrences other than 1209 /// inductions and reductions. 1210 typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet; 1211 1212 /// Returns true if it is legal to vectorize this loop. 1213 /// This does not mean that it is profitable to vectorize this 1214 /// loop, only that it is legal to do so. 1215 bool canVectorize(); 1216 1217 /// Returns the Induction variable. 1218 PHINode *getInduction() { return Induction; } 1219 1220 /// Returns the reduction variables found in the loop. 1221 ReductionList *getReductionVars() { return &Reductions; } 1222 1223 /// Returns the induction variables found in the loop. 
1224 InductionList *getInductionVars() { return &Inductions; } 1225 1226 /// Return the first-order recurrences found in the loop. 1227 RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } 1228 1229 /// Returns the widest induction type. 1230 Type *getWidestInductionType() { return WidestIndTy; } 1231 1232 /// Returns True if V is an induction variable in this loop. 1233 bool isInductionVariable(const Value *V); 1234 1235 /// Returns True if PN is a reduction variable in this loop. 1236 bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } 1237 1238 /// Returns True if Phi is a first-order recurrence in this loop. 1239 bool isFirstOrderRecurrence(const PHINode *Phi); 1240 1241 /// Return true if the block BB needs to be predicated in order for the loop 1242 /// to be vectorized. 1243 bool blockNeedsPredication(BasicBlock *BB); 1244 1245 /// Check if this pointer is consecutive when vectorizing. This happens 1246 /// when the last index of the GEP is the induction variable, or that the 1247 /// pointer itself is an induction variable. 1248 /// This check allows us to vectorize A[idx] into a wide load/store. 1249 /// Returns: 1250 /// 0 - Stride is unknown or non-consecutive. 1251 /// 1 - Address is consecutive. 1252 /// -1 - Address is consecutive, and decreasing. 1253 int isConsecutivePtr(Value *Ptr); 1254 1255 /// Returns true if the value V is uniform within the loop. 1256 bool isUniform(Value *V); 1257 1258 /// Returns true if this instruction will remain scalar after vectorization. 1259 bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } 1260 1261 /// Returns the information that we collected about runtime memory check. 1262 const RuntimePointerChecking *getRuntimePointerChecking() const { 1263 return LAI->getRuntimePointerChecking(); 1264 } 1265 1266 const LoopAccessInfo *getLAI() const { 1267 return LAI; 1268 } 1269 1270 /// \brief Check if \p Instr belongs to any interleaved access group. 1271 bool isAccessInterleaved(Instruction *Instr) { 1272 return InterleaveInfo.isInterleaved(Instr); 1273 } 1274 1275 /// \brief Get the interleaved access group that \p Instr belongs to. 1276 const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { 1277 return InterleaveInfo.getInterleaveGroup(Instr); 1278 } 1279 1280 unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } 1281 1282 bool hasStride(Value *V) { return StrideSet.count(V); } 1283 bool mustCheckStrides() { return !StrideSet.empty(); } 1284 SmallPtrSet<Value *, 8>::iterator strides_begin() { 1285 return StrideSet.begin(); 1286 } 1287 SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } 1288 1289 /// Returns true if the target machine supports masked store operation 1290 /// for the given \p DataType and kind of access to \p Ptr. 1291 bool isLegalMaskedStore(Type *DataType, Value *Ptr) { 1292 return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); 1293 } 1294 /// Returns true if the target machine supports masked load operation 1295 /// for the given \p DataType and kind of access to \p Ptr. 1296 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { 1297 return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); 1298 } 1299 /// Returns true if the target machine supports masked scatter operation 1300 /// for the given \p DataType. 
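  /// For example (illustrative), a target with hardware scatter support may
  /// report that storing a <4 x double> through a vector of pointers is
  /// legal, in which case a non-consecutive store can be widened instead of
  /// being scalarized.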
1301 bool isLegalMaskedScatter(Type *DataType) { 1302 return TTI->isLegalMaskedScatter(DataType); 1303 } 1304 /// Returns true if the target machine supports masked gather operation 1305 /// for the given \p DataType. 1306 bool isLegalMaskedGather(Type *DataType) { 1307 return TTI->isLegalMaskedGather(DataType); 1308 } 1309 1310 /// Returns true if vector representation of the instruction \p I 1311 /// requires mask. 1312 bool isMaskRequired(const Instruction* I) { 1313 return (MaskedOp.count(I) != 0); 1314 } 1315 unsigned getNumStores() const { 1316 return LAI->getNumStores(); 1317 } 1318 unsigned getNumLoads() const { 1319 return LAI->getNumLoads(); 1320 } 1321 unsigned getNumPredStores() const { 1322 return NumPredStores; 1323 } 1324 private: 1325 /// Check if a single basic block loop is vectorizable. 1326 /// At this point we know that this is a loop with a constant trip count 1327 /// and we only need to check individual instructions. 1328 bool canVectorizeInstrs(); 1329 1330 /// When we vectorize loops we may change the order in which 1331 /// we read and write from memory. This method checks if it is 1332 /// legal to vectorize the code, considering only memory constrains. 1333 /// Returns true if the loop is vectorizable 1334 bool canVectorizeMemory(); 1335 1336 /// Return true if we can vectorize this loop using the IF-conversion 1337 /// transformation. 1338 bool canVectorizeWithIfConvert(); 1339 1340 /// Collect the variables that need to stay uniform after vectorization. 1341 void collectLoopUniforms(); 1342 1343 /// Return true if all of the instructions in the block can be speculatively 1344 /// executed. \p SafePtrs is a list of addresses that are known to be legal 1345 /// and we know that we can read from them without segfault. 1346 bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); 1347 1348 /// \brief Collect memory access with loop invariant strides. 1349 /// 1350 /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop 1351 /// invariant. 1352 void collectStridedAccess(Value *LoadOrStoreInst); 1353 1354 /// Report an analysis message to assist the user in diagnosing loops that are 1355 /// not vectorized. These are handled as LoopAccessReport rather than 1356 /// VectorizationReport because the << operator of VectorizationReport returns 1357 /// LoopAccessReport. 1358 void emitAnalysis(const LoopAccessReport &Message) const { 1359 emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); 1360 } 1361 1362 unsigned NumPredStores; 1363 1364 /// The loop that we evaluate. 1365 Loop *TheLoop; 1366 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. 1367 /// Applies dynamic knowledge to simplify SCEV expressions in the context 1368 /// of existing SCEV assumptions. The analysis will also add a minimal set 1369 /// of new predicates if this is required to enable vectorization and 1370 /// unrolling. 1371 PredicatedScalarEvolution &PSE; 1372 /// Target Library Info. 1373 TargetLibraryInfo *TLI; 1374 /// Parent function 1375 Function *TheFunction; 1376 /// Target Transform Info 1377 const TargetTransformInfo *TTI; 1378 /// Dominator Tree. 1379 DominatorTree *DT; 1380 // LoopAccess analysis. 1381 LoopAccessAnalysis *LAA; 1382 // And the loop-accesses info corresponding to this loop. This pointer is 1383 // null until canVectorizeMemory sets it up. 1384 const LoopAccessInfo *LAI; 1385 1386 /// The interleave access information contains groups of interleaved accesses 1387 /// with the same stride and close to each other. 
1388 InterleavedAccessInfo InterleaveInfo; 1389 1390 // --- vectorization state --- // 1391 1392 /// Holds the integer induction variable. This is the counter of the 1393 /// loop. 1394 PHINode *Induction; 1395 /// Holds the reduction variables. 1396 ReductionList Reductions; 1397 /// Holds all of the induction variables that we found in the loop. 1398 /// Notice that inductions don't need to start at zero and that induction 1399 /// variables can be pointers. 1400 InductionList Inductions; 1401 /// Holds the phi nodes that are first-order recurrences. 1402 RecurrenceSet FirstOrderRecurrences; 1403 /// Holds the widest induction type encountered. 1404 Type *WidestIndTy; 1405 1406 /// Allowed outside users. This holds the reduction 1407 /// vars which can be accessed from outside the loop. 1408 SmallPtrSet<Value*, 4> AllowedExit; 1409 /// This set holds the variables which are known to be uniform after 1410 /// vectorization. 1411 SmallPtrSet<Instruction*, 4> Uniforms; 1412 1413 /// Can we assume the absence of NaNs. 1414 bool HasFunNoNaNAttr; 1415 1416 /// Vectorization requirements that will go through late-evaluation. 1417 LoopVectorizationRequirements *Requirements; 1418 1419 /// Used to emit an analysis of any legality issues. 1420 const LoopVectorizeHints *Hints; 1421 1422 ValueToValueMap Strides; 1423 SmallPtrSet<Value *, 8> StrideSet; 1424 1425 /// While vectorizing these instructions we have to generate a 1426 /// call to the appropriate masked intrinsic 1427 SmallPtrSet<const Instruction *, 8> MaskedOp; 1428 }; 1429 1430 /// LoopVectorizationCostModel - estimates the expected speedups due to 1431 /// vectorization. 1432 /// In many cases vectorization is not profitable. This can happen because of 1433 /// a number of reasons. In this class we mainly attempt to predict the 1434 /// expected speedup/slowdowns due to the supported instruction set. We use the 1435 /// TargetTransformInfo to query the different backends for the cost of 1436 /// different operations. 1437 class LoopVectorizationCostModel { 1438 public: 1439 LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, 1440 LoopVectorizationLegality *Legal, 1441 const TargetTransformInfo &TTI, 1442 const TargetLibraryInfo *TLI, DemandedBits *DB, 1443 AssumptionCache *AC, const Function *F, 1444 const LoopVectorizeHints *Hints, 1445 SmallPtrSetImpl<const Value *> &ValuesToIgnore) 1446 : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), 1447 TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} 1448 1449 /// Information about vectorization costs 1450 struct VectorizationFactor { 1451 unsigned Width; // Vector width with best cost 1452 unsigned Cost; // Cost of the loop with that width 1453 }; 1454 /// \return The most profitable vectorization factor and the cost of that VF. 1455 /// This method checks every power of two up to VF. If UserVF is not ZERO 1456 /// then this vectorization factor will be selected if vectorization is 1457 /// possible. 1458 VectorizationFactor selectVectorizationFactor(bool OptForSize); 1459 1460 /// \return The size (in bits) of the smallest and widest types in the code 1461 /// that needs to be vectorized. We ignore values that remain scalar such as 1462 /// 64 bit loop indices. 1463 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1464 1465 /// \return The desired interleave count. 1466 /// If interleave count has been specified by metadata it will be returned. 1467 /// Otherwise, the interleave count is computed and returned. 
VF and LoopCost 1468 /// are the selected vectorization factor and the cost of the selected VF. 1469 unsigned selectInterleaveCount(bool OptForSize, unsigned VF, 1470 unsigned LoopCost); 1471 1472 /// \return The most profitable unroll factor. 1473 /// This method finds the best unroll-factor based on register pressure and 1474 /// other parameters. VF and LoopCost are the selected vectorization factor 1475 /// and the cost of the selected VF. 1476 unsigned computeInterleaveCount(bool OptForSize, unsigned VF, 1477 unsigned LoopCost); 1478 1479 /// \brief A struct that represents some properties of the register usage 1480 /// of a loop. 1481 struct RegisterUsage { 1482 /// Holds the number of loop invariant values that are used in the loop. 1483 unsigned LoopInvariantRegs; 1484 /// Holds the maximum number of concurrent live intervals in the loop. 1485 unsigned MaxLocalUsers; 1486 /// Holds the number of instructions in the loop. 1487 unsigned NumInstructions; 1488 }; 1489 1490 /// \return Returns information about the register usages of the loop for the 1491 /// given vectorization factors. 1492 SmallVector<RegisterUsage, 8> 1493 calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); 1494 1495 private: 1496 /// Returns the expected execution cost. The unit of the cost does 1497 /// not matter because we use the 'cost' units to compare different 1498 /// vector widths. The cost that is returned is *not* normalized by 1499 /// the factor width. 1500 unsigned expectedCost(unsigned VF); 1501 1502 /// Returns the execution time cost of an instruction for a given vector 1503 /// width. Vector width of one means scalar. 1504 unsigned getInstructionCost(Instruction *I, unsigned VF); 1505 1506 /// Returns whether the instruction is a load or store and will be a emitted 1507 /// as a vector operation. 1508 bool isConsecutiveLoadOrStore(Instruction *I); 1509 1510 /// Report an analysis message to assist the user in diagnosing loops that are 1511 /// not vectorized. These are handled as LoopAccessReport rather than 1512 /// VectorizationReport because the << operator of VectorizationReport returns 1513 /// LoopAccessReport. 1514 void emitAnalysis(const LoopAccessReport &Message) const { 1515 emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); 1516 } 1517 1518 public: 1519 /// Map of scalar integer values to the smallest bitwidth they can be legally 1520 /// represented as. The vector equivalents of these values should be truncated 1521 /// to this type. 1522 MapVector<Instruction*,uint64_t> MinBWs; 1523 1524 /// The loop that we evaluate. 1525 Loop *TheLoop; 1526 /// Scev analysis. 1527 ScalarEvolution *SE; 1528 /// Loop Info analysis. 1529 LoopInfo *LI; 1530 /// Vectorization legality. 1531 LoopVectorizationLegality *Legal; 1532 /// Vector target information. 1533 const TargetTransformInfo &TTI; 1534 /// Target Library Info. 1535 const TargetLibraryInfo *TLI; 1536 /// Demanded bits analysis 1537 DemandedBits *DB; 1538 const Function *TheFunction; 1539 // Loop Vectorize Hint. 1540 const LoopVectorizeHints *Hints; 1541 // Values to ignore in the cost model. 1542 const SmallPtrSetImpl<const Value *> &ValuesToIgnore; 1543 }; 1544 1545 /// \brief This holds vectorization requirements that must be verified late in 1546 /// the process. The requirements are set by legalize and costmodel. Once 1547 /// vectorization has been determined to be possible and profitable the 1548 /// requirements can be verified by looking for metadata or compiler options. 
1549 /// For example, some loops require FP commutativity which is only allowed if
1550 /// vectorization is explicitly specified or if the fast-math compiler option
1551 /// has been provided.
1552 /// Late evaluation of these requirements allows helpful diagnostics to be
1553 /// composed that tell the user what needs to be done to vectorize the loop. For
1554 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
1555 /// evaluation should be used only when diagnostics can be generated that can be
1556 /// followed by a non-expert user.
1557 class LoopVectorizationRequirements {
1558 public:
1559 LoopVectorizationRequirements()
1560 : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
1561
1562 void addUnsafeAlgebraInst(Instruction *I) {
1563 // First unsafe algebra instruction.
1564 if (!UnsafeAlgebraInst)
1565 UnsafeAlgebraInst = I;
1566 }
1567
1568 void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
1569
1570 bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
1571 const char *Name = Hints.vectorizeAnalysisPassName();
1572 bool Failed = false;
1573 if (UnsafeAlgebraInst && !Hints.allowReordering()) {
1574 emitOptimizationRemarkAnalysisFPCommute(
1575 F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
1576 VectorizationReport() << "cannot prove it is safe to reorder "
1577 "floating-point operations");
1578 Failed = true;
1579 }
1580
1581 // Test if runtime memcheck thresholds are exceeded.
1582 bool PragmaThresholdReached =
1583 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
1584 bool ThresholdReached =
1585 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
1586 if ((ThresholdReached && !Hints.allowReordering()) ||
1587 PragmaThresholdReached) {
1588 emitOptimizationRemarkAnalysisAliasing(
1589 F->getContext(), Name, *F, L->getStartLoc(),
1590 VectorizationReport()
1591 << "cannot prove it is safe to reorder memory operations");
1592 DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1593 Failed = true;
1594 }
1595
1596 return Failed;
1597 }
1598
1599 private:
1600 unsigned NumRuntimePointerChecks;
1601 Instruction *UnsafeAlgebraInst;
1602 };
1603
1604 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
1605 if (L.empty())
1606 return V.push_back(&L);
1607
1608 for (Loop *InnerL : L)
1609 addInnerLoop(*InnerL, V);
1610 }
1611
1612 /// The LoopVectorize Pass.
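/// The pass collects every innermost loop in the function, checks each one for
/// legality and profitability, and then either widens it by the selected
/// vectorization factor or, when widening is not beneficial, only interleaves
/// its scalar iterations.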
1613 struct LoopVectorize : public FunctionPass { 1614 /// Pass identification, replacement for typeid 1615 static char ID; 1616 1617 explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) 1618 : FunctionPass(ID), 1619 DisableUnrolling(NoUnrolling), 1620 AlwaysVectorize(AlwaysVectorize) { 1621 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1622 } 1623 1624 ScalarEvolution *SE; 1625 LoopInfo *LI; 1626 TargetTransformInfo *TTI; 1627 DominatorTree *DT; 1628 BlockFrequencyInfo *BFI; 1629 TargetLibraryInfo *TLI; 1630 DemandedBits *DB; 1631 AliasAnalysis *AA; 1632 AssumptionCache *AC; 1633 LoopAccessAnalysis *LAA; 1634 bool DisableUnrolling; 1635 bool AlwaysVectorize; 1636 1637 BlockFrequency ColdEntryFreq; 1638 1639 bool runOnFunction(Function &F) override { 1640 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1641 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1642 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1643 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1644 BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1645 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1646 TLI = TLIP ? &TLIP->getTLI() : nullptr; 1647 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1648 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1649 LAA = &getAnalysis<LoopAccessAnalysis>(); 1650 DB = &getAnalysis<DemandedBits>(); 1651 1652 // Compute some weights outside of the loop over the loops. Compute this 1653 // using a BranchProbability to re-use its scaling math. 1654 const BranchProbability ColdProb(1, 5); // 20% 1655 ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; 1656 1657 // Don't attempt if 1658 // 1. the target claims to have no vector registers, and 1659 // 2. interleaving won't help ILP. 1660 // 1661 // The second condition is necessary because, even if the target has no 1662 // vector registers, loop vectorization may still enable scalar 1663 // interleaving. 1664 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) 1665 return false; 1666 1667 // Build up a worklist of inner-loops to vectorize. This is necessary as 1668 // the act of vectorizing or partially unrolling a loop creates new loops 1669 // and can invalidate iterators across the loops. 1670 SmallVector<Loop *, 8> Worklist; 1671 1672 for (Loop *L : *LI) 1673 addInnerLoop(*L, Worklist); 1674 1675 LoopsAnalyzed += Worklist.size(); 1676 1677 // Now walk the identified inner loops. 1678 bool Changed = false; 1679 while (!Worklist.empty()) 1680 Changed |= processLoop(Worklist.pop_back_val()); 1681 1682 // Process each loop nest in the function. 1683 return Changed; 1684 } 1685 1686 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 1687 SmallVector<Metadata *, 4> MDs; 1688 // Reserve first location for self reference to the LoopID metadata node. 1689 MDs.push_back(nullptr); 1690 bool IsUnrollMetadata = false; 1691 MDNode *LoopID = L->getLoopID(); 1692 if (LoopID) { 1693 // First find existing loop unrolling disable metadata. 1694 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 1695 MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 1696 if (MD) { 1697 const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); 1698 IsUnrollMetadata = 1699 S && S->getString().startswith("llvm.loop.unroll.disable"); 1700 } 1701 MDs.push_back(LoopID->getOperand(i)); 1702 } 1703 } 1704 1705 if (!IsUnrollMetadata) { 1706 // Add runtime unroll disable metadata. 
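// The loop id ends up roughly of the form
//   !0 = !{!0, <existing operands>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// where operand 0 refers back to the node itself.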
1707 LLVMContext &Context = L->getHeader()->getContext(); 1708 SmallVector<Metadata *, 1> DisableOperands; 1709 DisableOperands.push_back( 1710 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 1711 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 1712 MDs.push_back(DisableNode); 1713 MDNode *NewLoopID = MDNode::get(Context, MDs); 1714 // Set operand 0 to refer to the loop id itself. 1715 NewLoopID->replaceOperandWith(0, NewLoopID); 1716 L->setLoopID(NewLoopID); 1717 } 1718 } 1719 1720 bool processLoop(Loop *L) { 1721 assert(L->empty() && "Only process inner loops."); 1722 1723 #ifndef NDEBUG 1724 const std::string DebugLocStr = getDebugLocString(L); 1725 #endif /* NDEBUG */ 1726 1727 DEBUG(dbgs() << "\nLV: Checking a loop in \"" 1728 << L->getHeader()->getParent()->getName() << "\" from " 1729 << DebugLocStr << "\n"); 1730 1731 LoopVectorizeHints Hints(L, DisableUnrolling); 1732 1733 DEBUG(dbgs() << "LV: Loop hints:" 1734 << " force=" 1735 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 1736 ? "disabled" 1737 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 1738 ? "enabled" 1739 : "?")) << " width=" << Hints.getWidth() 1740 << " unroll=" << Hints.getInterleave() << "\n"); 1741 1742 // Function containing loop 1743 Function *F = L->getHeader()->getParent(); 1744 1745 // Looking at the diagnostic output is the only way to determine if a loop 1746 // was vectorized (other than looking at the IR or machine code), so it 1747 // is important to generate an optimization remark for each loop. Most of 1748 // these messages are generated by emitOptimizationRemarkAnalysis. Remarks 1749 // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are 1750 // less verbose reporting vectorized loops and unvectorized loops that may 1751 // benefit from vectorization, respectively. 1752 1753 if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { 1754 DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 1755 return false; 1756 } 1757 1758 // Check the loop for a trip count threshold: 1759 // do not vectorize loops with a tiny trip count. 1760 const unsigned TC = SE->getSmallConstantTripCount(L); 1761 if (TC > 0u && TC < TinyTripCountVectorThreshold) { 1762 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 1763 << "This loop is not worth vectorizing."); 1764 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 1765 DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 1766 else { 1767 DEBUG(dbgs() << "\n"); 1768 emitAnalysisDiag(F, L, Hints, VectorizationReport() 1769 << "vectorization is not beneficial " 1770 "and is not explicitly forced"); 1771 return false; 1772 } 1773 } 1774 1775 PredicatedScalarEvolution PSE(*SE, *L); 1776 1777 // Check if it is legal to vectorize the loop. 1778 LoopVectorizationRequirements Requirements; 1779 LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, 1780 &Requirements, &Hints); 1781 if (!LVL.canVectorize()) { 1782 DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 1783 emitMissedWarning(F, L, Hints); 1784 return false; 1785 } 1786 1787 // Collect values we want to ignore in the cost model. This includes 1788 // type-promoting instructions we identified during reduction detection. 
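// For example, the sign-extend/truncate casts that a type-promoted reduction
// introduces around its narrow arithmetic are recorded in the recurrence
// descriptor and should not be costed on their own.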
1789 SmallPtrSet<const Value *, 32> ValuesToIgnore; 1790 CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore); 1791 for (auto &Reduction : *LVL.getReductionVars()) { 1792 RecurrenceDescriptor &RedDes = Reduction.second; 1793 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 1794 ValuesToIgnore.insert(Casts.begin(), Casts.end()); 1795 } 1796 1797 // Use the cost model. 1798 LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC, 1799 F, &Hints, ValuesToIgnore); 1800 1801 // Check the function attributes to find out if this function should be 1802 // optimized for size. 1803 bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && 1804 F->optForSize(); 1805 1806 // Compute the weighted frequency of this loop being executed and see if it 1807 // is less than 20% of the function entry baseline frequency. Note that we 1808 // always have a canonical loop here because we think we *can* vectorize. 1809 // FIXME: This is hidden behind a flag due to pervasive problems with 1810 // exactly what block frequency models. 1811 if (LoopVectorizeWithBlockFrequency) { 1812 BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); 1813 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 1814 LoopEntryFreq < ColdEntryFreq) 1815 OptForSize = true; 1816 } 1817 1818 // Check the function attributes to see if implicit floats are allowed. 1819 // FIXME: This check doesn't seem possibly correct -- what if the loop is 1820 // an integer loop and the vector instructions selected are purely integer 1821 // vector instructions? 1822 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 1823 DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" 1824 "attribute is used.\n"); 1825 emitAnalysisDiag( 1826 F, L, Hints, 1827 VectorizationReport() 1828 << "loop not vectorized due to NoImplicitFloat attribute"); 1829 emitMissedWarning(F, L, Hints); 1830 return false; 1831 } 1832 1833 // Select the optimal vectorization factor. 1834 const LoopVectorizationCostModel::VectorizationFactor VF = 1835 CM.selectVectorizationFactor(OptForSize); 1836 1837 // Select the interleave count. 1838 unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); 1839 1840 // Get user interleave count. 1841 unsigned UserIC = Hints.getInterleave(); 1842 1843 // Identify the diagnostic messages that should be produced. 1844 std::string VecDiagMsg, IntDiagMsg; 1845 bool VectorizeLoop = true, InterleaveLoop = true; 1846 1847 if (Requirements.doesNotMeet(F, L, Hints)) { 1848 DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 1849 "requirements.\n"); 1850 emitMissedWarning(F, L, Hints); 1851 return false; 1852 } 1853 1854 if (VF.Width == 1) { 1855 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 1856 VecDiagMsg = 1857 "the cost-model indicates that vectorization is not beneficial"; 1858 VectorizeLoop = false; 1859 } 1860 1861 if (IC == 1 && UserIC <= 1) { 1862 // Tell the user interleaving is not beneficial. 1863 DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 1864 IntDiagMsg = 1865 "the cost-model indicates that interleaving is not beneficial"; 1866 InterleaveLoop = false; 1867 if (UserIC == 1) 1868 IntDiagMsg += 1869 " and is explicitly disabled or interleave count is set to 1"; 1870 } else if (IC > 1 && UserIC == 1) { 1871 // Tell the user interleaving is beneficial, but it explicitly disabled. 
1872 DEBUG(dbgs()
1873 << "LV: Interleaving is beneficial but is explicitly disabled.");
1874 IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
1875 "but is explicitly disabled or interleave count is set to 1";
1876 InterleaveLoop = false;
1877 }
1878
1879 // Override IC if user provided an interleave count.
1880 IC = UserIC > 0 ? UserIC : IC;
1881
1882 // Emit diagnostic messages, if any.
1883 const char *VAPassName = Hints.vectorizeAnalysisPassName();
1884 if (!VectorizeLoop && !InterleaveLoop) {
1885 // Do not vectorize or interleave the loop.
1886 emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
1887 L->getStartLoc(), VecDiagMsg);
1888 emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
1889 L->getStartLoc(), IntDiagMsg);
1890 return false;
1891 } else if (!VectorizeLoop && InterleaveLoop) {
1892 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
1893 emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
1894 L->getStartLoc(), VecDiagMsg);
1895 } else if (VectorizeLoop && !InterleaveLoop) {
1896 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
1897 << DebugLocStr << '\n');
1898 emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
1899 L->getStartLoc(), IntDiagMsg);
1900 } else if (VectorizeLoop && InterleaveLoop) {
1901 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
1902 << DebugLocStr << '\n');
1903 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
1904 }
1905
1906 if (!VectorizeLoop) {
1907 assert(IC > 1 && "interleave count should not be 1 or 0");
1908 // If we decided that it is not profitable to vectorize the loop, then
1909 // interleave it.
1910 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
1911 Unroller.vectorize(&LVL, CM.MinBWs);
1912
1913 emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
1914 Twine("interleaved loop (interleaved count: ") +
1915 Twine(IC) + ")");
1916 } else {
1917 // If we decided that it is *profitable* to vectorize the loop, then do it.
1918 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
1919 LB.vectorize(&LVL, CM.MinBWs);
1920 ++LoopsVectorized;
1921
1922 // Add metadata to disable runtime unrolling of the scalar loop when there
1923 // are no runtime checks for strides or memory: in that case the scalar loop
1924 // is rarely executed and is not worth unrolling.
1925 if (!LB.IsSafetyChecksAdded())
1926 AddRuntimeUnrollDisableMetaData(L);
1927
1928 // Report the vectorization decision.
1929 emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
1930 Twine("vectorized loop (vectorization width: ") +
1931 Twine(VF.Width) + ", interleaved count: " +
1932 Twine(IC) + ")");
1933 }
1934
1935 // Mark the loop as already vectorized to avoid vectorizing again.
1936 Hints.setAlreadyVectorized(); 1937 1938 DEBUG(verifyFunction(*L->getHeader()->getParent())); 1939 return true; 1940 } 1941 1942 void getAnalysisUsage(AnalysisUsage &AU) const override { 1943 AU.addRequired<AssumptionCacheTracker>(); 1944 AU.addRequiredID(LoopSimplifyID); 1945 AU.addRequiredID(LCSSAID); 1946 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1947 AU.addRequired<DominatorTreeWrapperPass>(); 1948 AU.addRequired<LoopInfoWrapperPass>(); 1949 AU.addRequired<ScalarEvolutionWrapperPass>(); 1950 AU.addRequired<TargetTransformInfoWrapperPass>(); 1951 AU.addRequired<AAResultsWrapperPass>(); 1952 AU.addRequired<LoopAccessAnalysis>(); 1953 AU.addRequired<DemandedBits>(); 1954 AU.addPreserved<LoopInfoWrapperPass>(); 1955 AU.addPreserved<DominatorTreeWrapperPass>(); 1956 AU.addPreserved<BasicAAWrapperPass>(); 1957 AU.addPreserved<AAResultsWrapperPass>(); 1958 AU.addPreserved<GlobalsAAWrapperPass>(); 1959 } 1960 1961 }; 1962 1963 } // end anonymous namespace 1964 1965 //===----------------------------------------------------------------------===// 1966 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1967 // LoopVectorizationCostModel. 1968 //===----------------------------------------------------------------------===// 1969 1970 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1971 // We need to place the broadcast of invariant variables outside the loop. 1972 Instruction *Instr = dyn_cast<Instruction>(V); 1973 bool NewInstr = 1974 (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), 1975 Instr->getParent()) != LoopVectorBody.end()); 1976 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 1977 1978 // Place the code for broadcasting invariant variables in the new preheader. 1979 IRBuilder<>::InsertPointGuard Guard(Builder); 1980 if (Invariant) 1981 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1982 1983 // Broadcast the scalar into all locations in the vector. 1984 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1985 1986 return Shuf; 1987 } 1988 1989 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, 1990 Value *Step) { 1991 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1992 assert(Val->getType()->getScalarType()->isIntegerTy() && 1993 "Elem must be an integer"); 1994 assert(Step->getType() == Val->getType()->getScalarType() && 1995 "Step has wrong type"); 1996 // Create the types. 1997 Type *ITy = Val->getType()->getScalarType(); 1998 VectorType *Ty = cast<VectorType>(Val->getType()); 1999 int VLen = Ty->getNumElements(); 2000 SmallVector<Constant*, 8> Indices; 2001 2002 // Create a vector of consecutive numbers from zero to VF. 2003 for (int i = 0; i < VLen; ++i) 2004 Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); 2005 2006 // Add the consecutive indices to the vector value. 2007 Constant *Cv = ConstantVector::get(Indices); 2008 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2009 Step = Builder.CreateVectorSplat(VLen, Step); 2010 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2011 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2012 // which can be found from the original scalar operations. 
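// E.g. for VF = 4, StartIdx = 0 and a scalar step S, the result is
// <Val_0 + 0*S, Val_1 + 1*S, Val_2 + 2*S, Val_3 + 3*S>.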
2013 Step = Builder.CreateMul(Cv, Step); 2014 return Builder.CreateAdd(Val, Step, "induction"); 2015 } 2016 2017 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 2018 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); 2019 auto *SE = PSE.getSE(); 2020 // Make sure that the pointer does not point to structs. 2021 if (Ptr->getType()->getPointerElementType()->isAggregateType()) 2022 return 0; 2023 2024 // If this value is a pointer induction variable we know it is consecutive. 2025 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 2026 if (Phi && Inductions.count(Phi)) { 2027 InductionDescriptor II = Inductions[Phi]; 2028 return II.getConsecutiveDirection(); 2029 } 2030 2031 GetElementPtrInst *Gep = getGEPInstruction(Ptr); 2032 if (!Gep) 2033 return 0; 2034 2035 unsigned NumOperands = Gep->getNumOperands(); 2036 Value *GpPtr = Gep->getPointerOperand(); 2037 // If this GEP value is a consecutive pointer induction variable and all of 2038 // the indices are constant then we know it is consecutive. We can 2039 Phi = dyn_cast<PHINode>(GpPtr); 2040 if (Phi && Inductions.count(Phi)) { 2041 2042 // Make sure that the pointer does not point to structs. 2043 PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); 2044 if (GepPtrType->getElementType()->isAggregateType()) 2045 return 0; 2046 2047 // Make sure that all of the index operands are loop invariant. 2048 for (unsigned i = 1; i < NumOperands; ++i) 2049 if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) 2050 return 0; 2051 2052 InductionDescriptor II = Inductions[Phi]; 2053 return II.getConsecutiveDirection(); 2054 } 2055 2056 unsigned InductionOperand = getGEPInductionOperand(Gep); 2057 2058 // Check that all of the gep indices are uniform except for our induction 2059 // operand. 2060 for (unsigned i = 0; i != NumOperands; ++i) 2061 if (i != InductionOperand && 2062 !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) 2063 return 0; 2064 2065 // We can emit wide load/stores only if the last non-zero index is the 2066 // induction variable. 2067 const SCEV *Last = nullptr; 2068 if (!Strides.count(Gep)) 2069 Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); 2070 else { 2071 // Because of the multiplication by a stride we can have a s/zext cast. 2072 // We are going to replace this stride by 1 so the cast is safe to ignore. 2073 // 2074 // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 2075 // %0 = trunc i64 %indvars.iv to i32 2076 // %mul = mul i32 %0, %Stride1 2077 // %idxprom = zext i32 %mul to i64 << Safe cast. 2078 // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom 2079 // 2080 Last = replaceSymbolicStrideSCEV(PSE, Strides, 2081 Gep->getOperand(InductionOperand), Gep); 2082 if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) 2083 Last = 2084 (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) 2085 ? C->getOperand() 2086 : Last; 2087 } 2088 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 2089 const SCEV *Step = AR->getStepRecurrence(*SE); 2090 2091 // The memory is consecutive because the last index is consecutive 2092 // and all other indices are loop invariant. 
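// E.g. an access like A[i] has a step of +1 and yields 1, A[N - i] has a step
// of -1 and yields -1; any other step is treated as non-consecutive.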
2093 if (Step->isOne()) 2094 return 1; 2095 if (Step->isAllOnesValue()) 2096 return -1; 2097 } 2098 2099 return 0; 2100 } 2101 2102 bool LoopVectorizationLegality::isUniform(Value *V) { 2103 return LAI->isUniform(V); 2104 } 2105 2106 InnerLoopVectorizer::VectorParts& 2107 InnerLoopVectorizer::getVectorValue(Value *V) { 2108 assert(V != Induction && "The new induction variable should not be used."); 2109 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2110 2111 // If we have a stride that is replaced by one, do it here. 2112 if (Legal->hasStride(V)) 2113 V = ConstantInt::get(V->getType(), 1); 2114 2115 // If we have this scalar in the map, return it. 2116 if (WidenMap.has(V)) 2117 return WidenMap.get(V); 2118 2119 // If this scalar is unknown, assume that it is a constant or that it is 2120 // loop invariant. Broadcast V and save the value for future uses. 2121 Value *B = getBroadcastInstrs(V); 2122 return WidenMap.splat(V, B); 2123 } 2124 2125 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2126 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2127 SmallVector<Constant*, 8> ShuffleMask; 2128 for (unsigned i = 0; i < VF; ++i) 2129 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2130 2131 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2132 ConstantVector::get(ShuffleMask), 2133 "reverse"); 2134 } 2135 2136 // Get a mask to interleave \p NumVec vectors into a wide vector. 2137 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> 2138 // E.g. For 2 interleaved vectors, if VF is 4, the mask is: 2139 // <0, 4, 1, 5, 2, 6, 3, 7> 2140 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, 2141 unsigned NumVec) { 2142 SmallVector<Constant *, 16> Mask; 2143 for (unsigned i = 0; i < VF; i++) 2144 for (unsigned j = 0; j < NumVec; j++) 2145 Mask.push_back(Builder.getInt32(j * VF + i)); 2146 2147 return ConstantVector::get(Mask); 2148 } 2149 2150 // Get the strided mask starting from index \p Start. 2151 // I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)> 2152 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, 2153 unsigned Stride, unsigned VF) { 2154 SmallVector<Constant *, 16> Mask; 2155 for (unsigned i = 0; i < VF; i++) 2156 Mask.push_back(Builder.getInt32(Start + i * Stride)); 2157 2158 return ConstantVector::get(Mask); 2159 } 2160 2161 // Get a mask of two parts: The first part consists of sequential integers 2162 // starting from 0, The second part consists of UNDEFs. 2163 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> 2164 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, 2165 unsigned NumUndef) { 2166 SmallVector<Constant *, 16> Mask; 2167 for (unsigned i = 0; i < NumInt; i++) 2168 Mask.push_back(Builder.getInt32(i)); 2169 2170 Constant *Undef = UndefValue::get(Builder.getInt32Ty()); 2171 for (unsigned i = 0; i < NumUndef; i++) 2172 Mask.push_back(Undef); 2173 2174 return ConstantVector::get(Mask); 2175 } 2176 2177 // Concatenate two vectors with the same element type. The 2nd vector should 2178 // not have more elements than the 1st vector. If the 2nd vector has less 2179 // elements, extend it with UNDEFs. 
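// E.g. concatenating a <4 x i32> V1 with a <2 x i32> V2 first widens V2 to
// <V2_0, V2_1, undef, undef> and then produces the <6 x i32> result
// <V1_0, V1_1, V1_2, V1_3, V2_0, V2_1>.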
2180 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, 2181 Value *V2) { 2182 VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); 2183 VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); 2184 assert(VecTy1 && VecTy2 && 2185 VecTy1->getScalarType() == VecTy2->getScalarType() && 2186 "Expect two vectors with the same element type"); 2187 2188 unsigned NumElts1 = VecTy1->getNumElements(); 2189 unsigned NumElts2 = VecTy2->getNumElements(); 2190 assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); 2191 2192 if (NumElts1 > NumElts2) { 2193 // Extend with UNDEFs. 2194 Constant *ExtMask = 2195 getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); 2196 V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); 2197 } 2198 2199 Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); 2200 return Builder.CreateShuffleVector(V1, V2, Mask); 2201 } 2202 2203 // Concatenate vectors in the given list. All vectors have the same type. 2204 static Value *ConcatenateVectors(IRBuilder<> &Builder, 2205 ArrayRef<Value *> InputList) { 2206 unsigned NumVec = InputList.size(); 2207 assert(NumVec > 1 && "Should be at least two vectors"); 2208 2209 SmallVector<Value *, 8> ResList; 2210 ResList.append(InputList.begin(), InputList.end()); 2211 do { 2212 SmallVector<Value *, 8> TmpList; 2213 for (unsigned i = 0; i < NumVec - 1; i += 2) { 2214 Value *V0 = ResList[i], *V1 = ResList[i + 1]; 2215 assert((V0->getType() == V1->getType() || i == NumVec - 2) && 2216 "Only the last vector may have a different type"); 2217 2218 TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); 2219 } 2220 2221 // Push the last vector if the total number of vectors is odd. 2222 if (NumVec % 2 != 0) 2223 TmpList.push_back(ResList[NumVec - 1]); 2224 2225 ResList = TmpList; 2226 NumVec = ResList.size(); 2227 } while (NumVec > 1); 2228 2229 return ResList[0]; 2230 } 2231 2232 // Try to vectorize the interleave group that \p Instr belongs to. 2233 // 2234 // E.g. Translate following interleaved load group (factor = 3): 2235 // for (i = 0; i < N; i+=3) { 2236 // R = Pic[i]; // Member of index 0 2237 // G = Pic[i+1]; // Member of index 1 2238 // B = Pic[i+2]; // Member of index 2 2239 // ... // do something to R, G, B 2240 // } 2241 // To: 2242 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2243 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2244 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2245 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2246 // 2247 // Or translate following interleaved store group (factor = 3): 2248 // for (i = 0; i < N; i+=3) { 2249 // ... do something to R, G, B 2250 // Pic[i] = R; // Member of index 0 2251 // Pic[i+1] = G; // Member of index 1 2252 // Pic[i+2] = B; // Member of index 2 2253 // } 2254 // To: 2255 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2256 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2257 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2258 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2259 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2260 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { 2261 const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); 2262 assert(Group && "Fail to get an interleaved access group."); 2263 2264 // Skip if current instruction is not the insert position. 
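// (The whole group is emitted once, when the member designated as the insert
// position is visited; every other member takes this early return and, for
// loads, later finds its widened value in the WidenMap.)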
2265 if (Instr != Group->getInsertPos()) 2266 return; 2267 2268 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2269 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2270 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 2271 2272 // Prepare for the vector type of the interleaved load/store. 2273 Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 2274 unsigned InterleaveFactor = Group->getFactor(); 2275 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2276 Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); 2277 2278 // Prepare for the new pointers. 2279 setDebugLocFromInst(Builder, Ptr); 2280 VectorParts &PtrParts = getVectorValue(Ptr); 2281 SmallVector<Value *, 2> NewPtrs; 2282 unsigned Index = Group->getIndex(Instr); 2283 for (unsigned Part = 0; Part < UF; Part++) { 2284 // Extract the pointer for current instruction from the pointer vector. A 2285 // reverse access uses the pointer in the last lane. 2286 Value *NewPtr = Builder.CreateExtractElement( 2287 PtrParts[Part], 2288 Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); 2289 2290 // Notice current instruction could be any index. Need to adjust the address 2291 // to the member of index 0. 2292 // 2293 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2294 // b = A[i]; // Member of index 0 2295 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2296 // 2297 // E.g. A[i+1] = a; // Member of index 1 2298 // A[i] = b; // Member of index 0 2299 // A[i+2] = c; // Member of index 2 (Current instruction) 2300 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2301 NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); 2302 2303 // Cast to the vector pointer type. 2304 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2305 } 2306 2307 setDebugLocFromInst(Builder, Instr); 2308 Value *UndefVec = UndefValue::get(VecTy); 2309 2310 // Vectorize the interleaved load group. 2311 if (LI) { 2312 for (unsigned Part = 0; Part < UF; Part++) { 2313 Instruction *NewLoadInstr = Builder.CreateAlignedLoad( 2314 NewPtrs[Part], Group->getAlignment(), "wide.vec"); 2315 2316 for (unsigned i = 0; i < InterleaveFactor; i++) { 2317 Instruction *Member = Group->getMember(i); 2318 2319 // Skip the gaps in the group. 2320 if (!Member) 2321 continue; 2322 2323 Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); 2324 Value *StridedVec = Builder.CreateShuffleVector( 2325 NewLoadInstr, UndefVec, StrideMask, "strided.vec"); 2326 2327 // If this member has different type, cast the result type. 2328 if (Member->getType() != ScalarTy) { 2329 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2330 StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); 2331 } 2332 2333 VectorParts &Entry = WidenMap.get(Member); 2334 Entry[Part] = 2335 Group->isReverse() ? reverseVector(StridedVec) : StridedVec; 2336 } 2337 2338 propagateMetadata(NewLoadInstr, Instr); 2339 } 2340 return; 2341 } 2342 2343 // The sub vector type for current instruction. 2344 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2345 2346 // Vectorize the interleaved store group. 2347 for (unsigned Part = 0; Part < UF; Part++) { 2348 // Collect the stored vector from each member. 
2349 SmallVector<Value *, 4> StoredVecs; 2350 for (unsigned i = 0; i < InterleaveFactor; i++) { 2351 // Interleaved store group doesn't allow a gap, so each index has a member 2352 Instruction *Member = Group->getMember(i); 2353 assert(Member && "Fail to get a member from an interleaved store group"); 2354 2355 Value *StoredVec = 2356 getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part]; 2357 if (Group->isReverse()) 2358 StoredVec = reverseVector(StoredVec); 2359 2360 // If this member has different type, cast it to an unified type. 2361 if (StoredVec->getType() != SubVT) 2362 StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); 2363 2364 StoredVecs.push_back(StoredVec); 2365 } 2366 2367 // Concatenate all vectors into a wide vector. 2368 Value *WideVec = ConcatenateVectors(Builder, StoredVecs); 2369 2370 // Interleave the elements in the wide vector. 2371 Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); 2372 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2373 "interleaved.vec"); 2374 2375 Instruction *NewStoreInstr = 2376 Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); 2377 propagateMetadata(NewStoreInstr, Instr); 2378 } 2379 } 2380 2381 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { 2382 // Attempt to issue a wide load. 2383 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2384 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2385 2386 assert((LI || SI) && "Invalid Load/Store instruction"); 2387 2388 // Try to vectorize the interleave group if this access is interleaved. 2389 if (Legal->isAccessInterleaved(Instr)) 2390 return vectorizeInterleaveGroup(Instr); 2391 2392 Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 2393 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2394 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 2395 unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); 2396 // An alignment of 0 means target abi alignment. We need to use the scalar's 2397 // target abi alignment in such a case. 2398 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2399 if (!Alignment) 2400 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2401 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2402 unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); 2403 unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; 2404 2405 if (SI && Legal->blockNeedsPredication(SI->getParent()) && 2406 !Legal->isMaskRequired(SI)) 2407 return scalarizeInstruction(Instr, true); 2408 2409 if (ScalarAllocatedSize != VectorElementSize) 2410 return scalarizeInstruction(Instr); 2411 2412 // If the pointer is loop invariant scalarize the load. 2413 if (LI && Legal->isUniform(Ptr)) 2414 return scalarizeInstruction(Instr); 2415 2416 // If the pointer is non-consecutive and gather/scatter is not supported 2417 // scalarize the instruction. 2418 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 2419 bool Reverse = ConsecutiveStride < 0; 2420 bool CreateGatherScatter = !ConsecutiveStride && 2421 ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) || 2422 (SI && Legal->isLegalMaskedScatter(ScalarDataTy))); 2423 2424 if (!ConsecutiveStride && !CreateGatherScatter) 2425 return scalarizeInstruction(Instr); 2426 2427 Constant *Zero = Builder.getInt32(0); 2428 VectorParts &Entry = WidenMap.get(Instr); 2429 VectorParts VectorGep; 2430 2431 // Handle consecutive loads/stores. 
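// Three address shapes are handled below: a GEP whose base pointer is itself
// an induction variable, a GEP with an invariant base whose induction-dependent
// index is rewritten, and a bare pointer-induction PHI with no GEP at all.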
2432 GetElementPtrInst *Gep = getGEPInstruction(Ptr); 2433 if (ConsecutiveStride) { 2434 if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { 2435 setDebugLocFromInst(Builder, Gep); 2436 Value *PtrOperand = Gep->getPointerOperand(); 2437 Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; 2438 FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); 2439 2440 // Create the new GEP with the new induction variable. 2441 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 2442 Gep2->setOperand(0, FirstBasePtr); 2443 Gep2->setName("gep.indvar.base"); 2444 Ptr = Builder.Insert(Gep2); 2445 } else if (Gep) { 2446 setDebugLocFromInst(Builder, Gep); 2447 assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), 2448 OrigLoop) && 2449 "Base ptr must be invariant"); 2450 // The last index does not have to be the induction. It can be 2451 // consecutive and be a function of the index. For example A[I+1]; 2452 unsigned NumOperands = Gep->getNumOperands(); 2453 unsigned InductionOperand = getGEPInductionOperand(Gep); 2454 // Create the new GEP with the new induction variable. 2455 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 2456 2457 for (unsigned i = 0; i < NumOperands; ++i) { 2458 Value *GepOperand = Gep->getOperand(i); 2459 Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); 2460 2461 // Update last index or loop invariant instruction anchored in loop. 2462 if (i == InductionOperand || 2463 (GepOperandInst && OrigLoop->contains(GepOperandInst))) { 2464 assert((i == InductionOperand || 2465 PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), 2466 OrigLoop)) && 2467 "Must be last index or loop invariant"); 2468 2469 VectorParts &GEPParts = getVectorValue(GepOperand); 2470 Value *Index = GEPParts[0]; 2471 Index = Builder.CreateExtractElement(Index, Zero); 2472 Gep2->setOperand(i, Index); 2473 Gep2->setName("gep.indvar.idx"); 2474 } 2475 } 2476 Ptr = Builder.Insert(Gep2); 2477 } else { // No GEP 2478 // Use the induction element ptr. 
2479 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 2480 setDebugLocFromInst(Builder, Ptr); 2481 VectorParts &PtrVal = getVectorValue(Ptr); 2482 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 2483 } 2484 } else { 2485 // At this point we should vector version of GEP for Gather or Scatter 2486 assert(CreateGatherScatter && "The instruction should be scalarized"); 2487 if (Gep) { 2488 SmallVector<VectorParts, 4> OpsV; 2489 // Vectorizing GEP, across UF parts, we want to keep each loop-invariant 2490 // base or index of GEP scalar 2491 for (Value *Op : Gep->operands()) { 2492 if (PSE.getSE()->isLoopInvariant(PSE.getSCEV(Op), OrigLoop)) 2493 OpsV.push_back(VectorParts(UF, Op)); 2494 else 2495 OpsV.push_back(getVectorValue(Op)); 2496 } 2497 2498 for (unsigned Part = 0; Part < UF; ++Part) { 2499 SmallVector<Value*, 4> Ops; 2500 Value *GEPBasePtr = OpsV[0][Part]; 2501 for (unsigned i = 1; i < Gep->getNumOperands(); i++) 2502 Ops.push_back(OpsV[i][Part]); 2503 Value *NewGep = Builder.CreateGEP(nullptr, GEPBasePtr, Ops, 2504 "VectorGep"); 2505 assert(NewGep->getType()->isVectorTy() && "Expected vector GEP"); 2506 NewGep = Builder.CreateBitCast(NewGep, 2507 VectorType::get(Ptr->getType(), VF)); 2508 VectorGep.push_back(NewGep); 2509 } 2510 } else 2511 VectorGep = getVectorValue(Ptr); 2512 } 2513 2514 VectorParts Mask = createBlockInMask(Instr->getParent()); 2515 // Handle Stores: 2516 if (SI) { 2517 assert(!Legal->isUniform(SI->getPointerOperand()) && 2518 "We do not allow storing to uniform addresses"); 2519 setDebugLocFromInst(Builder, SI); 2520 // We don't want to update the value in the map as it might be used in 2521 // another expression. So don't use a reference type for "StoredVal". 2522 VectorParts StoredVal = getVectorValue(SI->getValueOperand()); 2523 2524 for (unsigned Part = 0; Part < UF; ++Part) { 2525 Instruction *NewSI = nullptr; 2526 if (CreateGatherScatter) { 2527 Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; 2528 NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part], 2529 Alignment, MaskPart); 2530 } else { 2531 // Calculate the pointer for the specific unroll-part. 2532 Value *PartPtr = 2533 Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); 2534 2535 if (Reverse) { 2536 // If we store to reverse consecutive memory locations, then we need 2537 // to reverse the order of elements in the stored value. 2538 StoredVal[Part] = reverseVector(StoredVal[Part]); 2539 // If the address is consecutive but reversed, then the 2540 // wide store needs to start at the last vector element. 2541 PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); 2542 PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); 2543 Mask[Part] = reverseVector(Mask[Part]); 2544 } 2545 2546 Value *VecPtr = Builder.CreateBitCast(PartPtr, 2547 DataTy->getPointerTo(AddressSpace)); 2548 2549 if (Legal->isMaskRequired(SI)) 2550 NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, 2551 Mask[Part]); 2552 else 2553 NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, 2554 Alignment); 2555 } 2556 propagateMetadata(NewSI, SI); 2557 } 2558 return; 2559 } 2560 2561 // Handle loads. 2562 assert(LI && "Must have a load instruction"); 2563 setDebugLocFromInst(Builder, LI); 2564 for (unsigned Part = 0; Part < UF; ++Part) { 2565 Instruction* NewLI; 2566 if (CreateGatherScatter) { 2567 Value *MaskPart = Legal->isMaskRequired(LI) ? 
Mask[Part] : nullptr; 2568 NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, 2569 MaskPart, 0, "wide.masked.gather"); 2570 Entry[Part] = NewLI; 2571 } else { 2572 // Calculate the pointer for the specific unroll-part. 2573 Value *PartPtr = 2574 Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); 2575 2576 if (Reverse) { 2577 // If the address is consecutive but reversed, then the 2578 // wide load needs to start at the last vector element. 2579 PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); 2580 PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); 2581 Mask[Part] = reverseVector(Mask[Part]); 2582 } 2583 2584 Value *VecPtr = Builder.CreateBitCast(PartPtr, 2585 DataTy->getPointerTo(AddressSpace)); 2586 if (Legal->isMaskRequired(LI)) 2587 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2588 UndefValue::get(DataTy), 2589 "wide.masked.load"); 2590 else 2591 NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); 2592 Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; 2593 } 2594 propagateMetadata(NewLI, LI); 2595 } 2596 } 2597 2598 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2599 bool IfPredicateStore) { 2600 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2601 // Holds vector parameters or scalars, in case of uniform vals. 2602 SmallVector<VectorParts, 4> Params; 2603 2604 setDebugLocFromInst(Builder, Instr); 2605 2606 // Find all of the vectorized parameters. 2607 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2608 Value *SrcOp = Instr->getOperand(op); 2609 2610 // If we are accessing the old induction variable, use the new one. 2611 if (SrcOp == OldInduction) { 2612 Params.push_back(getVectorValue(SrcOp)); 2613 continue; 2614 } 2615 2616 // Try using previously calculated values. 2617 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 2618 2619 // If the src is an instruction that appeared earlier in the basic block, 2620 // then it should already be vectorized. 2621 if (SrcInst && OrigLoop->contains(SrcInst)) { 2622 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 2623 // The parameter is a vector value from earlier. 2624 Params.push_back(WidenMap.get(SrcInst)); 2625 } else { 2626 // The parameter is a scalar from outside the loop. Maybe even a constant. 2627 VectorParts Scalars; 2628 Scalars.append(UF, SrcOp); 2629 Params.push_back(Scalars); 2630 } 2631 } 2632 2633 assert(Params.size() == Instr->getNumOperands() && 2634 "Invalid number of operands"); 2635 2636 // Does this instruction return a value ? 2637 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2638 2639 Value *UndefVec = IsVoidRetTy ? nullptr : 2640 UndefValue::get(VectorType::get(Instr->getType(), VF)); 2641 // Create a new entry in the WidenMap and initialize it to Undef or Null. 2642 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 2643 2644 VectorParts Cond; 2645 if (IfPredicateStore) { 2646 assert(Instr->getParent()->getSinglePredecessor() && 2647 "Only support single predecessor blocks"); 2648 Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), 2649 Instr->getParent()); 2650 } 2651 2652 // For each vector unroll 'part': 2653 for (unsigned Part = 0; Part < UF; ++Part) { 2654 // For each scalar that we create: 2655 for (unsigned Width = 0; Width < VF; ++Width) { 2656 2657 // Start if-block. 
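// For a predicated store we extract this lane's element of the edge mask and
// record the cloned store together with that condition in PredicatedStores.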
2658 Value *Cmp = nullptr; 2659 if (IfPredicateStore) { 2660 Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); 2661 Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, 2662 ConstantInt::get(Cmp->getType(), 1)); 2663 } 2664 2665 Instruction *Cloned = Instr->clone(); 2666 if (!IsVoidRetTy) 2667 Cloned->setName(Instr->getName() + ".cloned"); 2668 // Replace the operands of the cloned instructions with extracted scalars. 2669 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2670 Value *Op = Params[op][Part]; 2671 // Param is a vector. Need to extract the right lane. 2672 if (Op->getType()->isVectorTy()) 2673 Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); 2674 Cloned->setOperand(op, Op); 2675 } 2676 2677 // Place the cloned scalar in the new loop. 2678 Builder.Insert(Cloned); 2679 2680 // If the original scalar returns a value we need to place it in a vector 2681 // so that future users will be able to use it. 2682 if (!IsVoidRetTy) 2683 VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, 2684 Builder.getInt32(Width)); 2685 // End if-block. 2686 if (IfPredicateStore) 2687 PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), 2688 Cmp)); 2689 } 2690 } 2691 } 2692 2693 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2694 Value *End, Value *Step, 2695 Instruction *DL) { 2696 BasicBlock *Header = L->getHeader(); 2697 BasicBlock *Latch = L->getLoopLatch(); 2698 // As we're just creating this loop, it's possible no latch exists 2699 // yet. If so, use the header as this will be a single block loop. 2700 if (!Latch) 2701 Latch = Header; 2702 2703 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2704 setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); 2705 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2706 2707 Builder.SetInsertPoint(Latch->getTerminator()); 2708 2709 // Create i+1 and fill the PHINode. 2710 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2711 Induction->addIncoming(Start, L->getLoopPreheader()); 2712 Induction->addIncoming(Next, Latch); 2713 // Create the compare. 2714 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2715 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2716 2717 // Now we have two terminators. Remove the old one from the block. 2718 Latch->getTerminator()->eraseFromParent(); 2719 2720 return Induction; 2721 } 2722 2723 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2724 if (TripCount) 2725 return TripCount; 2726 2727 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2728 // Find the loop boundaries. 2729 ScalarEvolution *SE = PSE.getSE(); 2730 const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); 2731 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2732 "Invalid loop count"); 2733 2734 Type *IdxTy = Legal->getWidestInductionType(); 2735 2736 // The exit count might have the type of i64 while the phi is i32. This can 2737 // happen if we have an induction variable that is sign extended before the 2738 // compare. The only way that we get a backedge taken count is that the 2739 // induction variable was signed and as such will not overflow. In such a case 2740 // truncation is legal. 
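// E.g. an i32 induction variable that is sign-extended to i64 before the exit
// compare produces an i64 backedge-taken count even though the widest
// induction type is i32; truncating it back to i32 is safe.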
2741 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2742 IdxTy->getPrimitiveSizeInBits()) 2743 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2744 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2745 2746 // Get the total trip count from the count by adding 1. 2747 const SCEV *ExitCount = SE->getAddExpr( 2748 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2749 2750 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2751 2752 // Expand the trip count and place the new instructions in the preheader. 2753 // Notice that the pre-header does not change, only the loop body. 2754 SCEVExpander Exp(*SE, DL, "induction"); 2755 2756 // Count holds the overall loop count (N). 2757 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2758 L->getLoopPreheader()->getTerminator()); 2759 2760 if (TripCount->getType()->isPointerTy()) 2761 TripCount = 2762 CastInst::CreatePointerCast(TripCount, IdxTy, 2763 "exitcount.ptrcnt.to.int", 2764 L->getLoopPreheader()->getTerminator()); 2765 2766 return TripCount; 2767 } 2768 2769 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2770 if (VectorTripCount) 2771 return VectorTripCount; 2772 2773 Value *TC = getOrCreateTripCount(L); 2774 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2775 2776 // Now we need to generate the expression for N - (N % VF), which is 2777 // the part that the vectorized body will execute. 2778 // The loop step is equal to the vectorization factor (num of SIMD elements) 2779 // times the unroll factor (num of SIMD instructions). 2780 Constant *Step = ConstantInt::get(TC->getType(), VF * UF); 2781 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2782 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2783 2784 return VectorTripCount; 2785 } 2786 2787 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2788 BasicBlock *Bypass) { 2789 Value *Count = getOrCreateTripCount(L); 2790 BasicBlock *BB = L->getLoopPreheader(); 2791 IRBuilder<> Builder(BB->getTerminator()); 2792 2793 // Generate code to check that the loop's trip count that we computed by 2794 // adding one to the backedge-taken count will not overflow. 2795 Value *CheckMinIters = 2796 Builder.CreateICmpULT(Count, 2797 ConstantInt::get(Count->getType(), VF * UF), 2798 "min.iters.check"); 2799 2800 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), 2801 "min.iters.checked"); 2802 // Update dominator tree immediately if the generated block is a 2803 // LoopBypassBlock because SCEV expansions to generate loop bypass 2804 // checks may query it before the current function is finished. 2805 DT->addNewBlock(NewBB, BB); 2806 if (L->getParentLoop()) 2807 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2808 ReplaceInstWithInst(BB->getTerminator(), 2809 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2810 LoopBypassBlocks.push_back(BB); 2811 } 2812 2813 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, 2814 BasicBlock *Bypass) { 2815 Value *TC = getOrCreateVectorTripCount(L); 2816 BasicBlock *BB = L->getLoopPreheader(); 2817 IRBuilder<> Builder(BB->getTerminator()); 2818 2819 // Now, compare the new count to zero. If it is zero skip the vector loop and 2820 // jump to the scalar loop. 
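// E.g. with VF = 4 and UF = 2, a scalar trip count of 7 gives a vector trip
// count of 7 - (7 % 8) = 0, so the vector loop would run zero iterations and
// must be bypassed.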
2821 Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), 2822 "cmp.zero"); 2823 2824 // Generate code to check that the loop's trip count that we computed by 2825 // adding one to the backedge-taken count will not overflow. 2826 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), 2827 "vector.ph"); 2828 // Update dominator tree immediately if the generated block is a 2829 // LoopBypassBlock because SCEV expansions to generate loop bypass 2830 // checks may query it before the current function is finished. 2831 DT->addNewBlock(NewBB, BB); 2832 if (L->getParentLoop()) 2833 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2834 ReplaceInstWithInst(BB->getTerminator(), 2835 BranchInst::Create(Bypass, NewBB, Cmp)); 2836 LoopBypassBlocks.push_back(BB); 2837 } 2838 2839 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2840 BasicBlock *BB = L->getLoopPreheader(); 2841 2842 // Generate the code to check that the SCEV assumptions that we made. 2843 // We want the new basic block to start at the first instruction in a 2844 // sequence of instructions that form a check. 2845 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2846 "scev.check"); 2847 Value *SCEVCheck = 2848 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2849 2850 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2851 if (C->isZero()) 2852 return; 2853 2854 // Create a new block containing the stride check. 2855 BB->setName("vector.scevcheck"); 2856 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2857 // Update dominator tree immediately if the generated block is a 2858 // LoopBypassBlock because SCEV expansions to generate loop bypass 2859 // checks may query it before the current function is finished. 2860 DT->addNewBlock(NewBB, BB); 2861 if (L->getParentLoop()) 2862 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2863 ReplaceInstWithInst(BB->getTerminator(), 2864 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2865 LoopBypassBlocks.push_back(BB); 2866 AddedSafetyChecks = true; 2867 } 2868 2869 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 2870 BasicBlock *Bypass) { 2871 BasicBlock *BB = L->getLoopPreheader(); 2872 2873 // Generate the code that checks in runtime if arrays overlap. We put the 2874 // checks into a separate block to make the more common case of few elements 2875 // faster. 2876 Instruction *FirstCheckInst; 2877 Instruction *MemRuntimeCheck; 2878 std::tie(FirstCheckInst, MemRuntimeCheck) = 2879 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2880 if (!MemRuntimeCheck) 2881 return; 2882 2883 // Create a new block containing the memory check. 2884 BB->setName("vector.memcheck"); 2885 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2886 // Update dominator tree immediately if the generated block is a 2887 // LoopBypassBlock because SCEV expansions to generate loop bypass 2888 // checks may query it before the current function is finished. 2889 DT->addNewBlock(NewBB, BB); 2890 if (L->getParentLoop()) 2891 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2892 ReplaceInstWithInst(BB->getTerminator(), 2893 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2894 LoopBypassBlocks.push_back(BB); 2895 AddedSafetyChecks = true; 2896 } 2897 2898 2899 void InnerLoopVectorizer::createEmptyLoop() { 2900 /* 2901 In this function we generate a new loop. 
The new loop will contain 2902 the vectorized instructions while the old loop will continue to run the 2903 scalar remainder. 2904 2905 [ ] <-- loop iteration number check. 2906 / | 2907 / v 2908 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2909 | / | 2910 | / v 2911 || [ ] <-- vector pre header. 2912 |/ | 2913 | v 2914 | [ ] \ 2915 | [ ]_| <-- vector loop. 2916 | | 2917 | v 2918 | -[ ] <--- middle-block. 2919 | / | 2920 | / v 2921 -|- >[ ] <--- new preheader. 2922 | | 2923 | v 2924 | [ ] \ 2925 | [ ]_| <-- old scalar loop to handle remainder. 2926 \ | 2927 \ v 2928 >[ ] <-- exit block. 2929 ... 2930 */ 2931 2932 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2933 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2934 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2935 assert(VectorPH && "Invalid loop structure"); 2936 assert(ExitBlock && "Must have an exit block"); 2937 2938 // Some loops have a single integer induction variable, while other loops 2939 // don't. One example is c++ iterators that often have multiple pointer 2940 // induction variables. In the code below we also support a case where we 2941 // don't have a single induction variable. 2942 // 2943 // We try to obtain an induction variable from the original loop as hard 2944 // as possible. However if we don't find one that: 2945 // - is an integer 2946 // - counts from zero, stepping by one 2947 // - is the size of the widest induction variable type 2948 // then we create a new one. 2949 OldInduction = Legal->getInduction(); 2950 Type *IdxTy = Legal->getWidestInductionType(); 2951 2952 // Split the single block loop into the two loop structure described above. 2953 BasicBlock *VecBody = 2954 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2955 BasicBlock *MiddleBlock = 2956 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2957 BasicBlock *ScalarPH = 2958 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2959 2960 // Create and register the new vector loop. 2961 Loop* Lp = new Loop(); 2962 Loop *ParentLoop = OrigLoop->getParentLoop(); 2963 2964 // Insert the new loop into the loop nest and register the new basic blocks 2965 // before calling any utilities such as SCEV that require valid LoopInfo. 2966 if (ParentLoop) { 2967 ParentLoop->addChildLoop(Lp); 2968 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2969 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2970 } else { 2971 LI->addTopLevelLoop(Lp); 2972 } 2973 Lp->addBasicBlockToLoop(VecBody, *LI); 2974 2975 // Find the loop boundaries. 2976 Value *Count = getOrCreateTripCount(Lp); 2977 2978 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2979 2980 // We need to test whether the backedge-taken count is uint##_max. Adding one 2981 // to it will cause overflow and an incorrect loop trip count in the vector 2982 // body. In case of overflow we want to directly jump to the scalar remainder 2983 // loop. 2984 emitMinimumIterationCountCheck(Lp, ScalarPH); 2985 // Now, compare the new count to zero. If it is zero skip the vector loop and 2986 // jump to the scalar loop. 2987 emitVectorLoopEnteredCheck(Lp, ScalarPH); 2988 // Generate the code to check any assumptions that we've made for SCEV 2989 // expressions. 2990 emitSCEVChecks(Lp, ScalarPH); 2991 2992 // Generate the code that checks in runtime if arrays overlap. We put the 2993 // checks into a separate block to make the more common case of few elements 2994 // faster. 
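// As a simplified sketch (the actual checks are produced by
// LoopAccessAnalysis, not by this file): for a loop that writes a[i] and
// reads b[i], the emitted test is conceptually "a + N <= b || b + N <= a";
// when it fails, the check block created by the call below branches to the
// scalar loop instead of the vector loop.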
2995 emitMemRuntimeChecks(Lp, ScalarPH); 2996 2997 // Generate the induction variable. 2998 // The loop step is equal to the vectorization factor (num of SIMD elements) 2999 // times the unroll factor (num of SIMD instructions). 3000 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3001 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3002 Induction = 3003 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3004 getDebugLocFromInstOrOperands(OldInduction)); 3005 3006 // We are going to resume the execution of the scalar loop. 3007 // Go over all of the induction variables that we found and fix the 3008 // PHIs that are left in the scalar version of the loop. 3009 // The starting values of PHI nodes depend on the counter of the last 3010 // iteration in the vectorized loop. 3011 // If we come from a bypass edge then we need to start from the original 3012 // start value. 3013 3014 // This variable saves the new starting index for the scalar loop. It is used 3015 // to test if there are any tail iterations left once the vector loop has 3016 // completed. 3017 LoopVectorizationLegality::InductionList::iterator I, E; 3018 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 3019 for (I = List->begin(), E = List->end(); I != E; ++I) { 3020 PHINode *OrigPhi = I->first; 3021 InductionDescriptor II = I->second; 3022 3023 // Create phi nodes to merge from the backedge-taken check block. 3024 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, 3025 "bc.resume.val", 3026 ScalarPH->getTerminator()); 3027 Value *EndValue; 3028 if (OrigPhi == OldInduction) { 3029 // We know what the end value is. 3030 EndValue = CountRoundDown; 3031 } else { 3032 IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); 3033 Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, 3034 II.getStepValue()->getType(), 3035 "cast.crd"); 3036 EndValue = II.transform(B, CRD); 3037 EndValue->setName("ind.end"); 3038 } 3039 3040 // The new PHI merges the original incoming value, in case of a bypass, 3041 // or the value at the end of the vectorized loop. 3042 BCResumeVal->addIncoming(EndValue, MiddleBlock); 3043 3044 // Fix the scalar body counter (PHI node). 3045 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 3046 3047 // The old induction's phi node in the scalar body needs the truncated 3048 // value. 3049 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3050 BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); 3051 OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); 3052 } 3053 3054 // Add a check in the middle block to see if we have completed 3055 // all of the iterations in the first vector loop. 3056 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3057 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3058 CountRoundDown, "cmp.n", 3059 MiddleBlock->getTerminator()); 3060 ReplaceInstWithInst(MiddleBlock->getTerminator(), 3061 BranchInst::Create(ExitBlock, ScalarPH, CmpN)); 3062 3063 // Get ready to start creating new instructions into the vectorized body. 3064 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3065 3066 // Save the state. 
3067 LoopVectorPreHeader = Lp->getLoopPreheader(); 3068 LoopScalarPreHeader = ScalarPH; 3069 LoopMiddleBlock = MiddleBlock; 3070 LoopExitBlock = ExitBlock; 3071 LoopVectorBody.push_back(VecBody); 3072 LoopScalarBody = OldBasicBlock; 3073 3074 LoopVectorizeHints Hints(Lp, true); 3075 Hints.setAlreadyVectorized(); 3076 } 3077 3078 namespace { 3079 struct CSEDenseMapInfo { 3080 static bool canHandle(Instruction *I) { 3081 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3082 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3083 } 3084 static inline Instruction *getEmptyKey() { 3085 return DenseMapInfo<Instruction *>::getEmptyKey(); 3086 } 3087 static inline Instruction *getTombstoneKey() { 3088 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3089 } 3090 static unsigned getHashValue(Instruction *I) { 3091 assert(canHandle(I) && "Unknown instruction!"); 3092 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3093 I->value_op_end())); 3094 } 3095 static bool isEqual(Instruction *LHS, Instruction *RHS) { 3096 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3097 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3098 return LHS == RHS; 3099 return LHS->isIdenticalTo(RHS); 3100 } 3101 }; 3102 } 3103 3104 /// \brief Check whether this block is a predicated block. 3105 /// Due to if predication of stores we might create a sequence of "if(pred) a[i] 3106 /// = ...; " blocks. We start with one vectorized basic block. For every 3107 /// conditional block we split this vectorized block. Therefore, every second 3108 /// block will be a predicated one. 3109 static bool isPredicatedBlock(unsigned BlockNum) { 3110 return BlockNum % 2; 3111 } 3112 3113 ///\brief Perform cse of induction variable instructions. 3114 static void cse(SmallVector<BasicBlock *, 4> &BBs) { 3115 // Perform simple cse. 3116 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3117 for (unsigned i = 0, e = BBs.size(); i != e; ++i) { 3118 BasicBlock *BB = BBs[i]; 3119 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3120 Instruction *In = &*I++; 3121 3122 if (!CSEDenseMapInfo::canHandle(In)) 3123 continue; 3124 3125 // Check if we can replace this instruction with any of the 3126 // visited instructions. 3127 if (Instruction *V = CSEMap.lookup(In)) { 3128 In->replaceAllUsesWith(V); 3129 In->eraseFromParent(); 3130 continue; 3131 } 3132 // Ignore instructions in conditional blocks. We create "if (pred) a[i] = 3133 // ...;" blocks for predicated stores. Every second block is a predicated 3134 // block. 3135 if (isPredicatedBlock(i)) 3136 continue; 3137 3138 CSEMap[In] = In; 3139 } 3140 } 3141 } 3142 3143 /// \brief Adds a 'fast' flag to floating point operations. 3144 static Value *addFastMathFlag(Value *V) { 3145 if (isa<FPMathOperator>(V)){ 3146 FastMathFlags Flags; 3147 Flags.setUnsafeAlgebra(); 3148 cast<Instruction>(V)->setFastMathFlags(Flags); 3149 } 3150 return V; 3151 } 3152 3153 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if 3154 /// the result needs to be inserted and/or extracted from vectors. 
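/// For illustration (assumed target costs, not real numbers): scalarizing a
/// <4 x i32> value with both Insert and Extract set accumulates the cost of
/// four insertelement and four extractelement operations as reported by TTI.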
3155 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, 3156 const TargetTransformInfo &TTI) { 3157 if (Ty->isVoidTy()) 3158 return 0; 3159 3160 assert(Ty->isVectorTy() && "Can only scalarize vectors"); 3161 unsigned Cost = 0; 3162 3163 for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { 3164 if (Insert) 3165 Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i); 3166 if (Extract) 3167 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i); 3168 } 3169 3170 return Cost; 3171 } 3172 3173 // Estimate cost of a call instruction CI if it were vectorized with factor VF. 3174 // Return the cost of the instruction, including scalarization overhead if it's 3175 // needed. The flag NeedToScalarize shows if the call needs to be scalarized - 3176 // i.e. either vector version isn't available, or is too expensive. 3177 static unsigned getVectorCallCost(CallInst *CI, unsigned VF, 3178 const TargetTransformInfo &TTI, 3179 const TargetLibraryInfo *TLI, 3180 bool &NeedToScalarize) { 3181 Function *F = CI->getCalledFunction(); 3182 StringRef FnName = CI->getCalledFunction()->getName(); 3183 Type *ScalarRetTy = CI->getType(); 3184 SmallVector<Type *, 4> Tys, ScalarTys; 3185 for (auto &ArgOp : CI->arg_operands()) 3186 ScalarTys.push_back(ArgOp->getType()); 3187 3188 // Estimate cost of scalarized vector call. The source operands are assumed 3189 // to be vectors, so we need to extract individual elements from there, 3190 // execute VF scalar calls, and then gather the result into the vector return 3191 // value. 3192 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3193 if (VF == 1) 3194 return ScalarCallCost; 3195 3196 // Compute corresponding vector type for return value and arguments. 3197 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3198 for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i) 3199 Tys.push_back(ToVectorTy(ScalarTys[i], VF)); 3200 3201 // Compute costs of unpacking argument values for the scalar calls and 3202 // packing the return values to a vector. 3203 unsigned ScalarizationCost = 3204 getScalarizationOverhead(RetTy, true, false, TTI); 3205 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) 3206 ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI); 3207 3208 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3209 3210 // If we can't emit a vector call for this function, then the currently found 3211 // cost is the cost we need to return. 3212 NeedToScalarize = true; 3213 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3214 return Cost; 3215 3216 // If the corresponding vector cost is cheaper, return its cost. 3217 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3218 if (VectorCallCost < Cost) { 3219 NeedToScalarize = false; 3220 return VectorCallCost; 3221 } 3222 return Cost; 3223 } 3224 3225 // Estimate cost of an intrinsic call instruction CI if it were vectorized with 3226 // factor VF. Return the cost of the instruction, including scalarization 3227 // overhead if it's needed. 
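// For example (a hypothetical call, not taken from the source): a call to
// llvm.sqrt.f32 vectorized with VF = 4 maps to llvm.sqrt.v4f32, and the cost
// returned is whatever TTI reports for that intrinsic on <4 x float>
// operands.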
3228 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, 3229 const TargetTransformInfo &TTI, 3230 const TargetLibraryInfo *TLI) { 3231 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 3232 assert(ID && "Expected intrinsic call!"); 3233 3234 Type *RetTy = ToVectorTy(CI->getType(), VF); 3235 SmallVector<Type *, 4> Tys; 3236 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 3237 Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 3238 3239 return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); 3240 } 3241 3242 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3243 IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); 3244 IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); 3245 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3246 } 3247 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3248 IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); 3249 IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); 3250 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3251 } 3252 3253 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3254 // For every instruction `I` in MinBWs, truncate the operands, create a 3255 // truncated version of `I` and reextend its result. InstCombine runs 3256 // later and will remove any ext/trunc pairs. 3257 // 3258 for (auto &KV : MinBWs) { 3259 VectorParts &Parts = WidenMap.get(KV.first); 3260 for (Value *&I : Parts) { 3261 if (I->use_empty()) 3262 continue; 3263 Type *OriginalTy = I->getType(); 3264 Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), 3265 KV.second); 3266 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3267 OriginalTy->getVectorNumElements()); 3268 if (TruncatedTy == OriginalTy) 3269 continue; 3270 3271 if (!isa<Instruction>(I)) 3272 continue; 3273 3274 IRBuilder<> B(cast<Instruction>(I)); 3275 auto ShrinkOperand = [&](Value *V) -> Value* { 3276 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3277 if (ZI->getSrcTy() == TruncatedTy) 3278 return ZI->getOperand(0); 3279 return B.CreateZExtOrTrunc(V, TruncatedTy); 3280 }; 3281 3282 // The actual instruction modification depends on the instruction type, 3283 // unfortunately. 
3284 Value *NewI = nullptr; 3285 if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { 3286 NewI = B.CreateBinOp(BO->getOpcode(), 3287 ShrinkOperand(BO->getOperand(0)), 3288 ShrinkOperand(BO->getOperand(1))); 3289 cast<BinaryOperator>(NewI)->copyIRFlags(I); 3290 } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) { 3291 NewI = B.CreateICmp(CI->getPredicate(), 3292 ShrinkOperand(CI->getOperand(0)), 3293 ShrinkOperand(CI->getOperand(1))); 3294 } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 3295 NewI = B.CreateSelect(SI->getCondition(), 3296 ShrinkOperand(SI->getTrueValue()), 3297 ShrinkOperand(SI->getFalseValue())); 3298 } else if (CastInst *CI = dyn_cast<CastInst>(I)) { 3299 switch (CI->getOpcode()) { 3300 default: llvm_unreachable("Unhandled cast!"); 3301 case Instruction::Trunc: 3302 NewI = ShrinkOperand(CI->getOperand(0)); 3303 break; 3304 case Instruction::SExt: 3305 NewI = B.CreateSExtOrTrunc(CI->getOperand(0), 3306 smallestIntegerVectorType(OriginalTy, 3307 TruncatedTy)); 3308 break; 3309 case Instruction::ZExt: 3310 NewI = B.CreateZExtOrTrunc(CI->getOperand(0), 3311 smallestIntegerVectorType(OriginalTy, 3312 TruncatedTy)); 3313 break; 3314 } 3315 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 3316 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3317 auto *O0 = 3318 B.CreateZExtOrTrunc(SI->getOperand(0), 3319 VectorType::get(ScalarTruncatedTy, Elements0)); 3320 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3321 auto *O1 = 3322 B.CreateZExtOrTrunc(SI->getOperand(1), 3323 VectorType::get(ScalarTruncatedTy, Elements1)); 3324 3325 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3326 } else if (isa<LoadInst>(I)) { 3327 // Don't do anything with the operands, just extend the result. 3328 continue; 3329 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3330 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3331 auto *O0 = B.CreateZExtOrTrunc( 3332 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3333 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3334 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3335 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3336 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3337 auto *O0 = B.CreateZExtOrTrunc( 3338 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3339 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3340 } else { 3341 llvm_unreachable("Unhandled instruction type!"); 3342 } 3343 3344 // Lastly, extend the result. 3345 NewI->takeName(cast<Instruction>(I)); 3346 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3347 I->replaceAllUsesWith(Res); 3348 cast<Instruction>(I)->eraseFromParent(); 3349 I = Res; 3350 } 3351 } 3352 3353 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3354 for (auto &KV : MinBWs) { 3355 VectorParts &Parts = WidenMap.get(KV.first); 3356 for (Value *&I : Parts) { 3357 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3358 if (Inst && Inst->use_empty()) { 3359 Value *NewI = Inst->getOperand(0); 3360 Inst->eraseFromParent(); 3361 I = NewI; 3362 } 3363 } 3364 } 3365 } 3366 3367 void InnerLoopVectorizer::vectorizeLoop() { 3368 //===------------------------------------------------===// 3369 // 3370 // Notice: any optimization or new instruction that go 3371 // into the code below should be also be implemented in 3372 // the cost-model. 
3373 //
3374 //===------------------------------------------------===//
3375 Constant *Zero = Builder.getInt32(0);
3376
3377 // In order to support recurrences we need to be able to vectorize Phi nodes.
3378 // Phi nodes have cycles, so we need to vectorize them in two stages. First,
3379 // we create a new vector PHI node with no incoming edges. We use this value
3380 // when we vectorize all of the instructions that use the PHI. Next, after
3381 // all of the instructions in the block are complete we add the new incoming
3382 // edges to the PHI. At this point all of the instructions in the basic block
3383 // are vectorized, so we can use them to construct the PHI.
3384 PhiVector PHIsToFix;
3385
3386 // Scan the loop in a topological order to ensure that defs are vectorized
3387 // before users.
3388 LoopBlocksDFS DFS(OrigLoop);
3389 DFS.perform(LI);
3390
3391 // Vectorize all of the blocks in the original loop.
3392 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
3393 be = DFS.endRPO(); bb != be; ++bb)
3394 vectorizeBlockInLoop(*bb, &PHIsToFix);
3395
3396 // Insert truncates and extends for any truncated instructions as hints to
3397 // InstCombine.
3398 if (VF > 1)
3399 truncateToMinimalBitwidths();
3400
3401 // At this point every instruction in the original loop is widened to a
3402 // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
3403 // nodes are currently empty because we did not want to introduce cycles.
3404 // This is the second stage of vectorizing recurrences.
3405 for (PHINode *Phi : PHIsToFix) {
3406 assert(Phi && "Unable to recover vectorized PHI");
3407
3408 // Handle first-order recurrences that need to be fixed.
3409 if (Legal->isFirstOrderRecurrence(Phi)) {
3410 fixFirstOrderRecurrence(Phi);
3411 continue;
3412 }
3413
3414 // If the phi node is not a first-order recurrence, it must be a reduction.
3415 // Get its reduction variable descriptor.
3416 assert(Legal->isReductionVariable(Phi) &&
3417 "Unable to find the reduction variable");
3418 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3419
3420 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3421 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3422 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3423 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3424 RdxDesc.getMinMaxRecurrenceKind();
3425 setDebugLocFromInst(Builder, ReductionStartValue);
3426
3427 // We need to generate a reduction vector from the incoming scalar.
3428 // To do so, we need to generate the 'identity' vector and override
3429 // one of the elements with the incoming scalar reduction. We need
3430 // to do it in the vector-loop preheader.
3431 Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
3432
3433 // This is the vector-clone of the value that leaves the loop.
3434 VectorParts &VectorExit = getVectorValue(LoopExitInst);
3435 Type *VecTy = VectorExit[0]->getType();
3436
3437 // Find the reduction identity variable. Zero for addition, or, xor,
3438 // one for multiplication, -1 for And.
3439 Value *Identity;
3440 Value *VectorStart;
3441 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3442 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3443 // MinMax reductions have the start value as their identity.
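// (For example, an smax reduction that starts at a value s can safely use a
// splat of s, since smax(s, s) == s in every lane.)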
3444 if (VF == 1) { 3445 VectorStart = Identity = ReductionStartValue; 3446 } else { 3447 VectorStart = Identity = 3448 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3449 } 3450 } else { 3451 // Handle other reduction kinds: 3452 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3453 RK, VecTy->getScalarType()); 3454 if (VF == 1) { 3455 Identity = Iden; 3456 // This vector is the Identity vector where the first element is the 3457 // incoming scalar reduction. 3458 VectorStart = ReductionStartValue; 3459 } else { 3460 Identity = ConstantVector::getSplat(VF, Iden); 3461 3462 // This vector is the Identity vector where the first element is the 3463 // incoming scalar reduction. 3464 VectorStart = 3465 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3466 } 3467 } 3468 3469 // Fix the vector-loop phi. 3470 3471 // Reductions do not have to start at zero. They can start with 3472 // any loop invariant values. 3473 VectorParts &VecRdxPhi = WidenMap.get(Phi); 3474 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3475 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3476 VectorParts &Val = getVectorValue(LoopVal); 3477 for (unsigned part = 0; part < UF; ++part) { 3478 // Make sure to add the reduction stat value only to the 3479 // first unroll part. 3480 Value *StartVal = (part == 0) ? VectorStart : Identity; 3481 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, 3482 LoopVectorPreHeader); 3483 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], 3484 LoopVectorBody.back()); 3485 } 3486 3487 // Before each round, move the insertion point right between 3488 // the PHIs and the values we are going to write. 3489 // This allows us to write both PHINodes and the extractelement 3490 // instructions. 3491 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3492 3493 VectorParts RdxParts = getVectorValue(LoopExitInst); 3494 setDebugLocFromInst(Builder, LoopExitInst); 3495 3496 // If the vector reduction can be performed in a smaller type, we truncate 3497 // then extend the loop exit value to enable InstCombine to evaluate the 3498 // entire expression in the smaller type. 3499 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3500 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3501 Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); 3502 for (unsigned part = 0; part < UF; ++part) { 3503 Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); 3504 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3505 : Builder.CreateZExt(Trunc, VecTy); 3506 for (Value::user_iterator UI = RdxParts[part]->user_begin(); 3507 UI != RdxParts[part]->user_end();) 3508 if (*UI != Trunc) { 3509 (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); 3510 RdxParts[part] = Extnd; 3511 } else { 3512 ++UI; 3513 } 3514 } 3515 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3516 for (unsigned part = 0; part < UF; ++part) 3517 RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); 3518 } 3519 3520 // Reduce all of the unrolled parts into a single vector. 3521 Value *ReducedPartRdx = RdxParts[0]; 3522 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3523 setDebugLocFromInst(Builder, ReducedPartRdx); 3524 for (unsigned part = 1; part < UF; ++part) { 3525 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3526 // Floating point operations had to be 'fast' to enable the reduction. 
3527 ReducedPartRdx = addFastMathFlag( 3528 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], 3529 ReducedPartRdx, "bin.rdx")); 3530 else 3531 ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( 3532 Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); 3533 } 3534 3535 if (VF > 1) { 3536 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 3537 // and vector ops, reducing the set of values being computed by half each 3538 // round. 3539 assert(isPowerOf2_32(VF) && 3540 "Reduction emission only supported for pow2 vectors!"); 3541 Value *TmpVec = ReducedPartRdx; 3542 SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); 3543 for (unsigned i = VF; i != 1; i >>= 1) { 3544 // Move the upper half of the vector to the lower half. 3545 for (unsigned j = 0; j != i/2; ++j) 3546 ShuffleMask[j] = Builder.getInt32(i/2 + j); 3547 3548 // Fill the rest of the mask with undef. 3549 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 3550 UndefValue::get(Builder.getInt32Ty())); 3551 3552 Value *Shuf = 3553 Builder.CreateShuffleVector(TmpVec, 3554 UndefValue::get(TmpVec->getType()), 3555 ConstantVector::get(ShuffleMask), 3556 "rdx.shuf"); 3557 3558 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3559 // Floating point operations had to be 'fast' to enable the reduction. 3560 TmpVec = addFastMathFlag(Builder.CreateBinOp( 3561 (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); 3562 else 3563 TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, 3564 TmpVec, Shuf); 3565 } 3566 3567 // The result is in the first element of the vector. 3568 ReducedPartRdx = Builder.CreateExtractElement(TmpVec, 3569 Builder.getInt32(0)); 3570 3571 // If the reduction can be performed in a smaller type, we need to extend 3572 // the reduction to the wider type before we branch to the original loop. 3573 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3574 ReducedPartRdx = 3575 RdxDesc.isSigned() 3576 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3577 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3578 } 3579 3580 // Create a phi node that merges control-flow from the backedge-taken check 3581 // block and the middle block. 3582 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3583 LoopScalarPreHeader->getTerminator()); 3584 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3585 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3586 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3587 3588 // Now, we need to fix the users of the reduction variable 3589 // inside and outside of the scalar remainder loop. 3590 // We know that the loop is in LCSSA form. We need to update the 3591 // PHI nodes in the exit blocks. 3592 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3593 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3594 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3595 if (!LCSSAPhi) break; 3596 3597 // All PHINodes need to have a single entry edge, or two if 3598 // we already fixed them. 3599 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3600 3601 // We found our reduction value exit-PHI. Update it with the 3602 // incoming bypass edge. 3603 if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { 3604 // Add an edge coming from the bypass. 3605 LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3606 break; 3607 } 3608 }// end of the LCSSA phi scan. 
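// For illustration (VF = 4, an integer add reduction; hypothetical value
// names): the tree reduction emitted above is, in shorthand,
//   %rdx.shuf  = shufflevector <4 x i32> %rdx, undef, <2, 3, undef, undef>
//   %bin.rdx   = add <4 x i32> %rdx, %rdx.shuf
//   %rdx.shuf2 = shufflevector <4 x i32> %bin.rdx, undef, <1, undef, undef, undef>
//   %bin.rdx2  = add <4 x i32> %bin.rdx, %rdx.shuf2
// and the reduced scalar is element 0 of %bin.rdx2.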
3609
3610 // Fix the scalar loop reduction variable with the incoming reduction sum
3611 // from the vector body and from the backedge value.
3612 int IncomingEdgeBlockIdx =
3613 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3614 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3615 // Pick the other block.
3616 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3617 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3618 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3619 } // end of for each Phi in PHIsToFix.
3620
3621 fixLCSSAPHIs();
3622
3623 // Make sure DomTree is updated.
3624 updateAnalysis();
3625
3626 // Predicate any stores.
3627 for (auto KV : PredicatedStores) {
3628 BasicBlock::iterator I(KV.first);
3629 auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
3630 auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
3631 /*BranchWeights=*/nullptr, DT);
3632 I->moveBefore(T);
3633 I->getParent()->setName("pred.store.if");
3634 BB->setName("pred.store.continue");
3635 }
3636 DEBUG(DT->verifyDomTree());
3637 // Remove redundant induction instructions.
3638 cse(LoopVectorBody);
3639 }
3640
3641 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3642
3643 // This is the second phase of vectorizing first-order recurrences. An
3644 // overview of the transformation is described below. Suppose we have the
3645 // following loop.
3646 //
3647 // for (int i = 0; i < n; ++i)
3648 // b[i] = a[i] - a[i - 1];
3649 //
3650 // There is a first-order recurrence on "a". For this loop, the shorthand
3651 // scalar IR looks like:
3652 //
3653 // scalar.ph:
3654 // s_init = a[-1]
3655 // br scalar.body
3656 //
3657 // scalar.body:
3658 // i = phi [0, scalar.ph], [i+1, scalar.body]
3659 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3660 // s2 = a[i]
3661 // b[i] = s2 - s1
3662 // br cond, scalar.body, ...
3663 //
3664 // In this example, s1 is a recurrence because its value depends on the
3665 // previous iteration. In the first phase of vectorization, we created a
3666 // temporary value for s1. We now complete the vectorization and produce the
3667 // shorthand vector IR shown below (for VF = 4, UF = 1).
3668 //
3669 // vector.ph:
3670 // v_init = vector(..., ..., ..., a[-1])
3671 // br vector.body
3672 //
3673 // vector.body
3674 // i = phi [0, vector.ph], [i+4, vector.body]
3675 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3676 // v2 = a[i, i+1, i+2, i+3];
3677 // v3 = vector(v1(3), v2(0, 1, 2))
3678 // b[i, i+1, i+2, i+3] = v2 - v3
3679 // br cond, vector.body, middle.block
3680 //
3681 // middle.block:
3682 // x = v2(3)
3683 // br scalar.ph
3684 //
3685 // scalar.ph:
3686 // s_init = phi [x, middle.block], [a[-1], otherwise]
3687 // br scalar.body
3688 //
3689 // After the vector loop completes execution, we extract the next value of
3690 // the recurrence (x) to use as the initial value in the scalar loop.
3691
3692 // Get the original loop preheader and single loop latch.
3693 auto *Preheader = OrigLoop->getLoopPreheader();
3694 auto *Latch = OrigLoop->getLoopLatch();
3695
3696 // Get the initial and previous values of the scalar recurrence.
3697 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3698 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3699
3700 // Create a vector from the initial value.
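// In shorthand (assuming an i32 recurrence for concreteness), for VF = 4 this
// produces
//   %vector.recur.init = insertelement <4 x i32> undef, i32 %s_init, i32 3
// i.e. the scalar initial value ends up in the last lane, matching v_init in
// the example above.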
3701 auto *VectorInit = ScalarInit; 3702 if (VF > 1) { 3703 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3704 VectorInit = Builder.CreateInsertElement( 3705 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3706 Builder.getInt32(VF - 1), "vector.recur.init"); 3707 } 3708 3709 // We constructed a temporary phi node in the first phase of vectorization. 3710 // This phi node will eventually be deleted. 3711 auto &PhiParts = getVectorValue(Phi); 3712 Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); 3713 3714 // Create a phi node for the new recurrence. The current value will either be 3715 // the initial value inserted into a vector or loop-varying vector value. 3716 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3717 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3718 3719 // Get the vectorized previous value. We ensured the previous values was an 3720 // instruction when detecting the recurrence. 3721 auto &PreviousParts = getVectorValue(Previous); 3722 3723 // Set the insertion point to be after this instruction. We ensured the 3724 // previous value dominated all uses of the phi when detecting the 3725 // recurrence. 3726 Builder.SetInsertPoint( 3727 &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); 3728 3729 // We will construct a vector for the recurrence by combining the values for 3730 // the current and previous iterations. This is the required shuffle mask. 3731 SmallVector<Constant *, 8> ShuffleMask(VF); 3732 ShuffleMask[0] = Builder.getInt32(VF - 1); 3733 for (unsigned I = 1; I < VF; ++I) 3734 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3735 3736 // The vector from which to take the initial value for the current iteration 3737 // (actual or unrolled). Initially, this is the vector phi node. 3738 Value *Incoming = VecPhi; 3739 3740 // Shuffle the current and previous vector and update the vector parts. 3741 for (unsigned Part = 0; Part < UF; ++Part) { 3742 auto *Shuffle = 3743 VF > 1 3744 ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part], 3745 ConstantVector::get(ShuffleMask)) 3746 : Incoming; 3747 PhiParts[Part]->replaceAllUsesWith(Shuffle); 3748 cast<Instruction>(PhiParts[Part])->eraseFromParent(); 3749 PhiParts[Part] = Shuffle; 3750 Incoming = PreviousParts[Part]; 3751 } 3752 3753 // Fix the latch value of the new recurrence in the vector loop. 3754 VecPhi->addIncoming(Incoming, 3755 LI->getLoopFor(LoopVectorBody[0])->getLoopLatch()); 3756 3757 // Extract the last vector element in the middle block. This will be the 3758 // initial value for the recurrence when jumping to the scalar loop. 3759 auto *Extract = Incoming; 3760 if (VF > 1) { 3761 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3762 Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), 3763 "vector.recur.extract"); 3764 } 3765 3766 // Fix the initial value of the original recurrence in the scalar loop. 3767 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3768 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3769 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3770 auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; 3771 Start->addIncoming(Incoming, BB); 3772 } 3773 3774 Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); 3775 Phi->setName("scalar.recur"); 3776 3777 // Finally, fix users of the recurrence outside the loop. 
The users will need 3778 // either the last value of the scalar recurrence or the last value of the 3779 // vector recurrence we extracted in the middle block. Since the loop is in 3780 // LCSSA form, we just need to find the phi node for the original scalar 3781 // recurrence in the exit block, and then add an edge for the middle block. 3782 for (auto &I : *LoopExitBlock) { 3783 auto *LCSSAPhi = dyn_cast<PHINode>(&I); 3784 if (!LCSSAPhi) 3785 break; 3786 if (LCSSAPhi->getIncomingValue(0) == Phi) { 3787 LCSSAPhi->addIncoming(Extract, LoopMiddleBlock); 3788 break; 3789 } 3790 } 3791 } 3792 3793 void InnerLoopVectorizer::fixLCSSAPHIs() { 3794 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3795 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3796 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3797 if (!LCSSAPhi) break; 3798 if (LCSSAPhi->getNumIncomingValues() == 1) 3799 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 3800 LoopMiddleBlock); 3801 } 3802 } 3803 3804 InnerLoopVectorizer::VectorParts 3805 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 3806 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 3807 "Invalid edge"); 3808 3809 // Look for cached value. 3810 std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); 3811 EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); 3812 if (ECEntryIt != MaskCache.end()) 3813 return ECEntryIt->second; 3814 3815 VectorParts SrcMask = createBlockInMask(Src); 3816 3817 // The terminator has to be a branch inst! 3818 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 3819 assert(BI && "Unexpected terminator found"); 3820 3821 if (BI->isConditional()) { 3822 VectorParts EdgeMask = getVectorValue(BI->getCondition()); 3823 3824 if (BI->getSuccessor(0) != Dst) 3825 for (unsigned part = 0; part < UF; ++part) 3826 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 3827 3828 for (unsigned part = 0; part < UF; ++part) 3829 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 3830 3831 MaskCache[Edge] = EdgeMask; 3832 return EdgeMask; 3833 } 3834 3835 MaskCache[Edge] = SrcMask; 3836 return SrcMask; 3837 } 3838 3839 InnerLoopVectorizer::VectorParts 3840 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 3841 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 3842 3843 // Loop incoming mask is all-one. 3844 if (OrigLoop->getHeader() == BB) { 3845 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 3846 return getVectorValue(C); 3847 } 3848 3849 // This is the block mask. We OR all incoming edges, and with zero. 3850 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 3851 VectorParts BlockMask = getVectorValue(Zero); 3852 3853 // For each pred: 3854 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 3855 VectorParts EM = createEdgeMask(*it, BB); 3856 for (unsigned part = 0; part < UF; ++part) 3857 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 3858 } 3859 3860 return BlockMask; 3861 } 3862 3863 void InnerLoopVectorizer::widenPHIInstruction( 3864 Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, 3865 unsigned VF, PhiVector *PV) { 3866 PHINode* P = cast<PHINode>(PN); 3867 // Handle recurrences. 3868 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 3869 for (unsigned part = 0; part < UF; ++part) { 3870 // This is phase one of vectorizing PHIs. 3871 Type *VecTy = (VF == 1) ? 
PN->getType() : 3872 VectorType::get(PN->getType(), VF); 3873 Entry[part] = PHINode::Create( 3874 VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); 3875 } 3876 PV->push_back(P); 3877 return; 3878 } 3879 3880 setDebugLocFromInst(Builder, P); 3881 // Check for PHI nodes that are lowered to vector selects. 3882 if (P->getParent() != OrigLoop->getHeader()) { 3883 // We know that all PHIs in non-header blocks are converted into 3884 // selects, so we don't have to worry about the insertion order and we 3885 // can just use the builder. 3886 // At this point we generate the predication tree. There may be 3887 // duplications since this is a simple recursive scan, but future 3888 // optimizations will clean it up. 3889 3890 unsigned NumIncoming = P->getNumIncomingValues(); 3891 3892 // Generate a sequence of selects of the form: 3893 // SELECT(Mask3, In3, 3894 // SELECT(Mask2, In2, 3895 // ( ...))) 3896 for (unsigned In = 0; In < NumIncoming; In++) { 3897 VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), 3898 P->getParent()); 3899 VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); 3900 3901 for (unsigned part = 0; part < UF; ++part) { 3902 // We might have single edge PHIs (blocks) - use an identity 3903 // 'select' for the first PHI operand. 3904 if (In == 0) 3905 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3906 In0[part]); 3907 else 3908 // Select between the current value and the previous incoming edge 3909 // based on the incoming mask. 3910 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3911 Entry[part], "predphi"); 3912 } 3913 } 3914 return; 3915 } 3916 3917 // This PHINode must be an induction variable. 3918 // Make sure that we know about it. 3919 assert(Legal->getInductionVars()->count(P) && 3920 "Not an induction variable"); 3921 3922 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3923 3924 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3925 // which can be found from the original scalar operations. 3926 switch (II.getKind()) { 3927 case InductionDescriptor::IK_NoInduction: 3928 llvm_unreachable("Unknown induction"); 3929 case InductionDescriptor::IK_IntInduction: { 3930 assert(P->getType() == II.getStartValue()->getType() && 3931 "Types must match"); 3932 // Handle other induction variables that are now based on the 3933 // canonical one. 3934 Value *V = Induction; 3935 if (P != OldInduction) { 3936 V = Builder.CreateSExtOrTrunc(Induction, P->getType()); 3937 V = II.transform(Builder, V); 3938 V->setName("offset.idx"); 3939 } 3940 Value *Broadcasted = getBroadcastInstrs(V); 3941 // After broadcasting the induction variable we need to make the vector 3942 // consecutive by adding 0, 1, 2, etc. 3943 for (unsigned part = 0; part < UF; ++part) 3944 Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); 3945 return; 3946 } 3947 case InductionDescriptor::IK_PtrInduction: 3948 // Handle the pointer induction variable case. 3949 assert(P->getType()->isPointerTy() && "Unexpected type."); 3950 // This is the normalized GEP that starts counting at zero. 3951 Value *PtrInd = Induction; 3952 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); 3953 // This is the vector of results. Notice that we don't generate 3954 // vector geps because scalar geps result in better code. 
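// For illustration (assumed VF = 2, UF = 1): the loop below computes the
// scalar GEPs for indices 0 and 1 via II.transform() and packs them into a
// two-element vector of pointers with insertelement.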
3955 for (unsigned part = 0; part < UF; ++part) { 3956 if (VF == 1) { 3957 int EltIndex = part; 3958 Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); 3959 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3960 Value *SclrGep = II.transform(Builder, GlobalIdx); 3961 SclrGep->setName("next.gep"); 3962 Entry[part] = SclrGep; 3963 continue; 3964 } 3965 3966 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 3967 for (unsigned int i = 0; i < VF; ++i) { 3968 int EltIndex = i + part * VF; 3969 Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); 3970 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3971 Value *SclrGep = II.transform(Builder, GlobalIdx); 3972 SclrGep->setName("next.gep"); 3973 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 3974 Builder.getInt32(i), 3975 "insert.gep"); 3976 } 3977 Entry[part] = VecVal; 3978 } 3979 return; 3980 } 3981 } 3982 3983 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { 3984 // For each instruction in the old loop. 3985 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3986 VectorParts &Entry = WidenMap.get(&*it); 3987 3988 switch (it->getOpcode()) { 3989 case Instruction::Br: 3990 // Nothing to do for PHIs and BR, since we already took care of the 3991 // loop control flow instructions. 3992 continue; 3993 case Instruction::PHI: { 3994 // Vectorize PHINodes. 3995 widenPHIInstruction(&*it, Entry, UF, VF, PV); 3996 continue; 3997 }// End of PHI. 3998 3999 case Instruction::Add: 4000 case Instruction::FAdd: 4001 case Instruction::Sub: 4002 case Instruction::FSub: 4003 case Instruction::Mul: 4004 case Instruction::FMul: 4005 case Instruction::UDiv: 4006 case Instruction::SDiv: 4007 case Instruction::FDiv: 4008 case Instruction::URem: 4009 case Instruction::SRem: 4010 case Instruction::FRem: 4011 case Instruction::Shl: 4012 case Instruction::LShr: 4013 case Instruction::AShr: 4014 case Instruction::And: 4015 case Instruction::Or: 4016 case Instruction::Xor: { 4017 // Just widen binops. 4018 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 4019 setDebugLocFromInst(Builder, BinOp); 4020 VectorParts &A = getVectorValue(it->getOperand(0)); 4021 VectorParts &B = getVectorValue(it->getOperand(1)); 4022 4023 // Use this vector value for all users of the original instruction. 4024 for (unsigned Part = 0; Part < UF; ++Part) { 4025 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 4026 4027 if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) 4028 VecOp->copyIRFlags(BinOp); 4029 4030 Entry[Part] = V; 4031 } 4032 4033 propagateMetadata(Entry, &*it); 4034 break; 4035 } 4036 case Instruction::Select: { 4037 // Widen selects. 4038 // If the selector is loop invariant we can create a select 4039 // instruction with a scalar condition. Otherwise, use vector-select. 4040 auto *SE = PSE.getSE(); 4041 bool InvariantCond = 4042 SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); 4043 setDebugLocFromInst(Builder, &*it); 4044 4045 // The condition can be loop invariant but still defined inside the 4046 // loop. This means that we can't just use the original 'cond' value. 4047 // We have to take the 'vectorized' value and pick the first lane. 4048 // Instcombine will make this a no-op. 4049 VectorParts &Cond = getVectorValue(it->getOperand(0)); 4050 VectorParts &Op0 = getVectorValue(it->getOperand(1)); 4051 VectorParts &Op1 = getVectorValue(it->getOperand(2)); 4052 4053 Value *ScalarCond = (VF == 1) ? 
Cond[0] : 4054 Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); 4055 4056 for (unsigned Part = 0; Part < UF; ++Part) { 4057 Entry[Part] = Builder.CreateSelect( 4058 InvariantCond ? ScalarCond : Cond[Part], 4059 Op0[Part], 4060 Op1[Part]); 4061 } 4062 4063 propagateMetadata(Entry, &*it); 4064 break; 4065 } 4066 4067 case Instruction::ICmp: 4068 case Instruction::FCmp: { 4069 // Widen compares. Generate vector compares. 4070 bool FCmp = (it->getOpcode() == Instruction::FCmp); 4071 CmpInst *Cmp = dyn_cast<CmpInst>(it); 4072 setDebugLocFromInst(Builder, &*it); 4073 VectorParts &A = getVectorValue(it->getOperand(0)); 4074 VectorParts &B = getVectorValue(it->getOperand(1)); 4075 for (unsigned Part = 0; Part < UF; ++Part) { 4076 Value *C = nullptr; 4077 if (FCmp) { 4078 C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 4079 cast<FCmpInst>(C)->copyFastMathFlags(&*it); 4080 } else { 4081 C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 4082 } 4083 Entry[Part] = C; 4084 } 4085 4086 propagateMetadata(Entry, &*it); 4087 break; 4088 } 4089 4090 case Instruction::Store: 4091 case Instruction::Load: 4092 vectorizeMemoryInstruction(&*it); 4093 break; 4094 case Instruction::ZExt: 4095 case Instruction::SExt: 4096 case Instruction::FPToUI: 4097 case Instruction::FPToSI: 4098 case Instruction::FPExt: 4099 case Instruction::PtrToInt: 4100 case Instruction::IntToPtr: 4101 case Instruction::SIToFP: 4102 case Instruction::UIToFP: 4103 case Instruction::Trunc: 4104 case Instruction::FPTrunc: 4105 case Instruction::BitCast: { 4106 CastInst *CI = dyn_cast<CastInst>(it); 4107 setDebugLocFromInst(Builder, &*it); 4108 /// Optimize the special case where the source is the induction 4109 /// variable. Notice that we can only optimize the 'trunc' case 4110 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 4111 /// c. other casts depend on pointer size. 4112 if (CI->getOperand(0) == OldInduction && 4113 it->getOpcode() == Instruction::Trunc) { 4114 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 4115 CI->getType()); 4116 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 4117 InductionDescriptor II = 4118 Legal->getInductionVars()->lookup(OldInduction); 4119 Constant *Step = ConstantInt::getSigned( 4120 CI->getType(), II.getStepValue()->getSExtValue()); 4121 for (unsigned Part = 0; Part < UF; ++Part) 4122 Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); 4123 propagateMetadata(Entry, &*it); 4124 break; 4125 } 4126 /// Vectorize casts. 4127 Type *DestTy = (VF == 1) ? CI->getType() : 4128 VectorType::get(CI->getType(), VF); 4129 4130 VectorParts &A = getVectorValue(it->getOperand(0)); 4131 for (unsigned Part = 0; Part < UF; ++Part) 4132 Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 4133 propagateMetadata(Entry, &*it); 4134 break; 4135 } 4136 4137 case Instruction::Call: { 4138 // Ignore dbg intrinsics. 
4139 if (isa<DbgInfoIntrinsic>(it)) 4140 break; 4141 setDebugLocFromInst(Builder, &*it); 4142 4143 Module *M = BB->getParent()->getParent(); 4144 CallInst *CI = cast<CallInst>(it); 4145 4146 StringRef FnName = CI->getCalledFunction()->getName(); 4147 Function *F = CI->getCalledFunction(); 4148 Type *RetTy = ToVectorTy(CI->getType(), VF); 4149 SmallVector<Type *, 4> Tys; 4150 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 4151 Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 4152 4153 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 4154 if (ID && 4155 (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 4156 ID == Intrinsic::lifetime_start)) { 4157 scalarizeInstruction(&*it); 4158 break; 4159 } 4160 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4161 // version of the instruction. 4162 // Is it beneficial to perform intrinsic call compared to lib call? 4163 bool NeedToScalarize; 4164 unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); 4165 bool UseVectorIntrinsic = 4166 ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; 4167 if (!UseVectorIntrinsic && NeedToScalarize) { 4168 scalarizeInstruction(&*it); 4169 break; 4170 } 4171 4172 for (unsigned Part = 0; Part < UF; ++Part) { 4173 SmallVector<Value *, 4> Args; 4174 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4175 Value *Arg = CI->getArgOperand(i); 4176 // Some intrinsics have a scalar argument - don't replace it with a 4177 // vector. 4178 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { 4179 VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); 4180 Arg = VectorArg[Part]; 4181 } 4182 Args.push_back(Arg); 4183 } 4184 4185 Function *VectorF; 4186 if (UseVectorIntrinsic) { 4187 // Use vector version of the intrinsic. 4188 Type *TysForDecl[] = {CI->getType()}; 4189 if (VF > 1) 4190 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4191 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4192 } else { 4193 // Use vector version of the library call. 4194 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4195 assert(!VFnName.empty() && "Vector function name is empty."); 4196 VectorF = M->getFunction(VFnName); 4197 if (!VectorF) { 4198 // Generate a declaration 4199 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4200 VectorF = 4201 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4202 VectorF->copyAttributesFrom(F); 4203 } 4204 } 4205 assert(VectorF && "Can't create vector function."); 4206 Entry[Part] = Builder.CreateCall(VectorF, Args); 4207 } 4208 4209 propagateMetadata(Entry, &*it); 4210 break; 4211 } 4212 4213 default: 4214 // All other instructions are unsupported. Scalarize them. 4215 scalarizeInstruction(&*it); 4216 break; 4217 }// end of switch. 4218 }// end of for_each instr. 4219 } 4220 4221 void InnerLoopVectorizer::updateAnalysis() { 4222 // Forget the original basic block. 4223 PSE.getSE()->forgetLoop(OrigLoop); 4224 4225 // Update the dominator tree information. 4226 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4227 "Entry does not dominate exit."); 4228 4229 // We don't predicate stores by this point, so the vector body should be a 4230 // single loop. 
4231 assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); 4232 DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); 4233 4234 DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); 4235 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4236 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4237 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4238 4239 DEBUG(DT->verifyDomTree()); 4240 } 4241 4242 /// \brief Check whether it is safe to if-convert this phi node. 4243 /// 4244 /// Phi nodes with constant expressions that can trap are not safe to if 4245 /// convert. 4246 static bool canIfConvertPHINodes(BasicBlock *BB) { 4247 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 4248 PHINode *Phi = dyn_cast<PHINode>(I); 4249 if (!Phi) 4250 return true; 4251 for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) 4252 if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) 4253 if (C->canTrap()) 4254 return false; 4255 } 4256 return true; 4257 } 4258 4259 bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 4260 if (!EnableIfConversion) { 4261 emitAnalysis(VectorizationReport() << "if-conversion is disabled"); 4262 return false; 4263 } 4264 4265 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 4266 4267 // A list of pointers that we can safely read and write to. 4268 SmallPtrSet<Value *, 8> SafePointes; 4269 4270 // Collect safe addresses. 4271 for (Loop::block_iterator BI = TheLoop->block_begin(), 4272 BE = TheLoop->block_end(); BI != BE; ++BI) { 4273 BasicBlock *BB = *BI; 4274 4275 if (blockNeedsPredication(BB)) 4276 continue; 4277 4278 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 4279 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 4280 SafePointes.insert(LI->getPointerOperand()); 4281 else if (StoreInst *SI = dyn_cast<StoreInst>(I)) 4282 SafePointes.insert(SI->getPointerOperand()); 4283 } 4284 } 4285 4286 // Collect the blocks that need predication. 4287 BasicBlock *Header = TheLoop->getHeader(); 4288 for (Loop::block_iterator BI = TheLoop->block_begin(), 4289 BE = TheLoop->block_end(); BI != BE; ++BI) { 4290 BasicBlock *BB = *BI; 4291 4292 // We don't support switch statements inside loops. 4293 if (!isa<BranchInst>(BB->getTerminator())) { 4294 emitAnalysis(VectorizationReport(BB->getTerminator()) 4295 << "loop contains a switch statement"); 4296 return false; 4297 } 4298 4299 // We must be able to predicate all blocks that need to be predicated. 4300 if (blockNeedsPredication(BB)) { 4301 if (!blockCanBePredicated(BB, SafePointes)) { 4302 emitAnalysis(VectorizationReport(BB->getTerminator()) 4303 << "control flow cannot be substituted for a select"); 4304 return false; 4305 } 4306 } else if (BB != Header && !canIfConvertPHINodes(BB)) { 4307 emitAnalysis(VectorizationReport(BB->getTerminator()) 4308 << "control flow cannot be substituted for a select"); 4309 return false; 4310 } 4311 } 4312 4313 // We can if-convert this loop. 4314 return true; 4315 } 4316 4317 bool LoopVectorizationLegality::canVectorize() { 4318 // We must have a loop in canonical form. Loops with indirectbr in them cannot 4319 // be canonicalized. 4320 if (!TheLoop->getLoopPreheader()) { 4321 emitAnalysis( 4322 VectorizationReport() << 4323 "loop control flow is not understood by vectorizer"); 4324 return false; 4325 } 4326 4327 // We can only vectorize innermost loops. 
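// (Loop::empty() is true when the loop has no sub-loops, so a non-empty loop
// here is an outer loop and is rejected.)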
4328 if (!TheLoop->empty()) { 4329 emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); 4330 return false; 4331 } 4332 4333 // We must have a single backedge. 4334 if (TheLoop->getNumBackEdges() != 1) { 4335 emitAnalysis( 4336 VectorizationReport() << 4337 "loop control flow is not understood by vectorizer"); 4338 return false; 4339 } 4340 4341 // We must have a single exiting block. 4342 if (!TheLoop->getExitingBlock()) { 4343 emitAnalysis( 4344 VectorizationReport() << 4345 "loop control flow is not understood by vectorizer"); 4346 return false; 4347 } 4348 4349 // We only handle bottom-tested loops, i.e. loop in which the condition is 4350 // checked at the end of each iteration. With that we can assume that all 4351 // instructions in the loop are executed the same number of times. 4352 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4353 emitAnalysis( 4354 VectorizationReport() << 4355 "loop control flow is not understood by vectorizer"); 4356 return false; 4357 } 4358 4359 // We need to have a loop header. 4360 DEBUG(dbgs() << "LV: Found a loop: " << 4361 TheLoop->getHeader()->getName() << '\n'); 4362 4363 // Check if we can if-convert non-single-bb loops. 4364 unsigned NumBlocks = TheLoop->getNumBlocks(); 4365 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 4366 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 4367 return false; 4368 } 4369 4370 // ScalarEvolution needs to be able to find the exit count. 4371 const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); 4372 if (ExitCount == PSE.getSE()->getCouldNotCompute()) { 4373 emitAnalysis(VectorizationReport() 4374 << "could not determine number of loop iterations"); 4375 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 4376 return false; 4377 } 4378 4379 // Check if we can vectorize the instructions and CFG in this loop. 4380 if (!canVectorizeInstrs()) { 4381 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 4382 return false; 4383 } 4384 4385 // Go over each instruction and look at memory deps. 4386 if (!canVectorizeMemory()) { 4387 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 4388 return false; 4389 } 4390 4391 // Collect all of the variables that remain uniform after vectorization. 4392 collectLoopUniforms(); 4393 4394 DEBUG(dbgs() << "LV: We can vectorize this loop" 4395 << (LAI->getRuntimePointerChecking()->Need 4396 ? " (with a runtime bound check)" 4397 : "") 4398 << "!\n"); 4399 4400 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 4401 4402 // If an override option has been passed in for interleaved accesses, use it. 4403 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 4404 UseInterleaved = EnableInterleavedMemAccesses; 4405 4406 // Analyze interleaved memory accesses. 4407 if (UseInterleaved) 4408 InterleaveInfo.analyzeInterleaving(Strides); 4409 4410 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; 4411 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) 4412 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; 4413 4414 if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { 4415 emitAnalysis(VectorizationReport() 4416 << "Too many SCEV assumptions need to be made and checked " 4417 << "at runtime"); 4418 DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); 4419 return false; 4420 } 4421 4422 // Okay! We can vectorize. 
At this point we don't have any other mem analysis 4423 // which may limit our maximum vectorization factor, so just return true with 4424 // no restrictions. 4425 return true; 4426 } 4427 4428 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { 4429 if (Ty->isPointerTy()) 4430 return DL.getIntPtrType(Ty); 4431 4432 // It is possible that char's or short's overflow when we ask for the loop's 4433 // trip count, work around this by changing the type size. 4434 if (Ty->getScalarSizeInBits() < 32) 4435 return Type::getInt32Ty(Ty->getContext()); 4436 4437 return Ty; 4438 } 4439 4440 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { 4441 Ty0 = convertPointerToIntegerType(DL, Ty0); 4442 Ty1 = convertPointerToIntegerType(DL, Ty1); 4443 if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) 4444 return Ty0; 4445 return Ty1; 4446 } 4447 4448 /// \brief Check that the instruction has outside loop users and is not an 4449 /// identified reduction variable. 4450 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, 4451 SmallPtrSetImpl<Value *> &Reductions) { 4452 // Reduction instructions are allowed to have exit users. All other 4453 // instructions must not have external users. 4454 if (!Reductions.count(Inst)) 4455 //Check that all of the users of the loop are inside the BB. 4456 for (User *U : Inst->users()) { 4457 Instruction *UI = cast<Instruction>(U); 4458 // This user may be a reduction exit value. 4459 if (!TheLoop->contains(UI)) { 4460 DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); 4461 return true; 4462 } 4463 } 4464 return false; 4465 } 4466 4467 bool LoopVectorizationLegality::canVectorizeInstrs() { 4468 BasicBlock *Header = TheLoop->getHeader(); 4469 4470 // Look for the attribute signaling the absence of NaNs. 4471 Function &F = *Header->getParent(); 4472 const DataLayout &DL = F.getParent()->getDataLayout(); 4473 if (F.hasFnAttribute("no-nans-fp-math")) 4474 HasFunNoNaNAttr = 4475 F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; 4476 4477 // For each block in the loop. 4478 for (Loop::block_iterator bb = TheLoop->block_begin(), 4479 be = TheLoop->block_end(); bb != be; ++bb) { 4480 4481 // Scan the instructions in the block and look for hazards. 4482 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 4483 ++it) { 4484 4485 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 4486 Type *PhiTy = Phi->getType(); 4487 // Check that this PHI type is allowed. 4488 if (!PhiTy->isIntegerTy() && 4489 !PhiTy->isFloatingPointTy() && 4490 !PhiTy->isPointerTy()) { 4491 emitAnalysis(VectorizationReport(&*it) 4492 << "loop control flow is not understood by vectorizer"); 4493 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 4494 return false; 4495 } 4496 4497 // If this PHINode is not in the header block, then we know that we 4498 // can convert it to select during if-conversion. No need to check if 4499 // the PHIs in this block are induction or reduction variables. 4500 if (*bb != Header) { 4501 // Check that this instruction has no outside users or is an 4502 // identified reduction value with an outside user. 4503 if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) 4504 continue; 4505 emitAnalysis(VectorizationReport(&*it) << 4506 "value could not be identified as " 4507 "an induction or reduction variable"); 4508 return false; 4509 } 4510 4511 // We only allow if-converted PHIs with exactly two incoming values. 
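// (For a header PHI this means exactly one value incoming from the
//  preheader and one from the latch; any other shape indicates control
//  flow we do not model.)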
4512 if (Phi->getNumIncomingValues() != 2) { 4513 emitAnalysis(VectorizationReport(&*it) 4514 << "control flow not understood by vectorizer"); 4515 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 4516 return false; 4517 } 4518 4519 InductionDescriptor ID; 4520 if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { 4521 Inductions[Phi] = ID; 4522 // Get the widest type. 4523 if (!WidestIndTy) 4524 WidestIndTy = convertPointerToIntegerType(DL, PhiTy); 4525 else 4526 WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); 4527 4528 // Int inductions are special because we only allow one IV. 4529 if (ID.getKind() == InductionDescriptor::IK_IntInduction && 4530 ID.getStepValue()->isOne() && 4531 isa<Constant>(ID.getStartValue()) && 4532 cast<Constant>(ID.getStartValue())->isNullValue()) { 4533 // Use the phi node with the widest type as induction. Use the last 4534 // one if there are multiple (no good reason for doing this other 4535 // than it is expedient). We've checked that it begins at zero and 4536 // steps by one, so this is a canonical induction variable. 4537 if (!Induction || PhiTy == WidestIndTy) 4538 Induction = Phi; 4539 } 4540 4541 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 4542 4543 // Until we explicitly handle the case of an induction variable with 4544 // an outside loop user we have to give up vectorizing this loop. 4545 if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { 4546 emitAnalysis(VectorizationReport(&*it) << 4547 "use of induction value outside of the " 4548 "loop is not handled by vectorizer"); 4549 return false; 4550 } 4551 4552 continue; 4553 } 4554 4555 RecurrenceDescriptor RedDes; 4556 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { 4557 if (RedDes.hasUnsafeAlgebra()) 4558 Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); 4559 AllowedExit.insert(RedDes.getLoopExitInstr()); 4560 Reductions[Phi] = RedDes; 4561 continue; 4562 } 4563 4564 if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { 4565 FirstOrderRecurrences.insert(Phi); 4566 continue; 4567 } 4568 4569 emitAnalysis(VectorizationReport(&*it) << 4570 "value that could not be identified as " 4571 "reduction is used outside the loop"); 4572 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 4573 return false; 4574 }// end of PHI handling 4575 4576 // We handle calls that: 4577 // * Are debug info intrinsics. 4578 // * Have a mapping to an IR intrinsic. 4579 // * Have a vector version available. 4580 CallInst *CI = dyn_cast<CallInst>(it); 4581 if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && 4582 !(CI->getCalledFunction() && TLI && 4583 TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { 4584 emitAnalysis(VectorizationReport(&*it) 4585 << "call instruction cannot be vectorized"); 4586 DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); 4587 return false; 4588 } 4589 4590 // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the 4591 // second argument is the same (i.e. loop invariant) 4592 if (CI && 4593 hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { 4594 auto *SE = PSE.getSE(); 4595 if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { 4596 emitAnalysis(VectorizationReport(&*it) 4597 << "intrinsic instruction cannot be vectorized"); 4598 DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); 4599 return false; 4600 } 4601 } 4602 4603 // Check that the instruction return type is vectorizable. 
4604 // Also, we can't vectorize extractelement instructions. 4605 if ((!VectorType::isValidElementType(it->getType()) && 4606 !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { 4607 emitAnalysis(VectorizationReport(&*it) 4608 << "instruction return type cannot be vectorized"); 4609 DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); 4610 return false; 4611 } 4612 4613 // Check that the stored type is vectorizable. 4614 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 4615 Type *T = ST->getValueOperand()->getType(); 4616 if (!VectorType::isValidElementType(T)) { 4617 emitAnalysis(VectorizationReport(ST) << 4618 "store instruction cannot be vectorized"); 4619 return false; 4620 } 4621 if (EnableMemAccessVersioning) 4622 collectStridedAccess(ST); 4623 } 4624 4625 if (EnableMemAccessVersioning) 4626 if (LoadInst *LI = dyn_cast<LoadInst>(it)) 4627 collectStridedAccess(LI); 4628 4629 // Reduction instructions are allowed to have exit users. 4630 // All other instructions must not have external users. 4631 if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { 4632 emitAnalysis(VectorizationReport(&*it) << 4633 "value cannot be used outside the loop"); 4634 return false; 4635 } 4636 4637 } // next instr. 4638 4639 } 4640 4641 if (!Induction) { 4642 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 4643 if (Inductions.empty()) { 4644 emitAnalysis(VectorizationReport() 4645 << "loop induction variable could not be identified"); 4646 return false; 4647 } 4648 } 4649 4650 // Now we know the widest induction type, check if our found induction 4651 // is the same size. If it's not, unset it here and InnerLoopVectorizer 4652 // will create another. 4653 if (Induction && WidestIndTy != Induction->getType()) 4654 Induction = nullptr; 4655 4656 return true; 4657 } 4658 4659 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { 4660 Value *Ptr = nullptr; 4661 if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) 4662 Ptr = LI->getPointerOperand(); 4663 else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) 4664 Ptr = SI->getPointerOperand(); 4665 else 4666 return; 4667 4668 Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); 4669 if (!Stride) 4670 return; 4671 4672 DEBUG(dbgs() << "LV: Found a strided access that we can version"); 4673 DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); 4674 Strides[Ptr] = Stride; 4675 StrideSet.insert(Stride); 4676 } 4677 4678 void LoopVectorizationLegality::collectLoopUniforms() { 4679 // We now know that the loop is vectorizable! 4680 // Collect variables that will remain uniform after vectorization. 4681 std::vector<Value*> Worklist; 4682 BasicBlock *Latch = TheLoop->getLoopLatch(); 4683 4684 // Start with the conditional branch and walk up the block. 4685 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 4686 4687 // Also add all consecutive pointer values; these values will be uniform 4688 // after vectorization (and subsequent cleanup) and, until revectorization is 4689 // supported, all dependencies must also be uniform. 
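// E.g. for a unit-stride access "A[i]" the address is still computed by a
// scalar GEP after vectorization, so the values feeding that GEP (the walk
// below stops at PHI nodes) are collected as uniform, together with the
// chain feeding the latch branch condition added above.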
4690 for (Loop::block_iterator B = TheLoop->block_begin(), 4691 BE = TheLoop->block_end(); B != BE; ++B) 4692 for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); 4693 I != IE; ++I) 4694 if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) 4695 Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4696 4697 while (!Worklist.empty()) { 4698 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 4699 Worklist.pop_back(); 4700 4701 // Look at instructions inside this loop. 4702 // Stop when reaching PHI nodes. 4703 // TODO: we need to follow values all over the loop, not only in this block. 4704 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 4705 continue; 4706 4707 // This is a known uniform. 4708 Uniforms.insert(I); 4709 4710 // Insert all operands. 4711 Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4712 } 4713 } 4714 4715 bool LoopVectorizationLegality::canVectorizeMemory() { 4716 LAI = &LAA->getInfo(TheLoop, Strides); 4717 auto &OptionalReport = LAI->getReport(); 4718 if (OptionalReport) 4719 emitAnalysis(VectorizationReport(*OptionalReport)); 4720 if (!LAI->canVectorizeMemory()) 4721 return false; 4722 4723 if (LAI->hasStoreToLoopInvariantAddress()) { 4724 emitAnalysis( 4725 VectorizationReport() 4726 << "write to a loop invariant address could not be vectorized"); 4727 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 4728 return false; 4729 } 4730 4731 Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); 4732 PSE.addPredicate(LAI->PSE.getUnionPredicate()); 4733 4734 return true; 4735 } 4736 4737 bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 4738 Value *In0 = const_cast<Value*>(V); 4739 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 4740 if (!PN) 4741 return false; 4742 4743 return Inductions.count(PN); 4744 } 4745 4746 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { 4747 return FirstOrderRecurrences.count(Phi); 4748 } 4749 4750 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 4751 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); 4752 } 4753 4754 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, 4755 SmallPtrSetImpl<Value *> &SafePtrs) { 4756 4757 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 4758 // Check that we don't have a constant expression that can trap as operand. 4759 for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); 4760 OI != OE; ++OI) { 4761 if (Constant *C = dyn_cast<Constant>(*OI)) 4762 if (C->canTrap()) 4763 return false; 4764 } 4765 // We might be able to hoist the load. 4766 if (it->mayReadFromMemory()) { 4767 LoadInst *LI = dyn_cast<LoadInst>(it); 4768 if (!LI) 4769 return false; 4770 if (!SafePtrs.count(LI->getPointerOperand())) { 4771 if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) || 4772 isLegalMaskedGather(LI->getType())) { 4773 MaskedOp.insert(LI); 4774 continue; 4775 } 4776 return false; 4777 } 4778 } 4779 4780 // We don't predicate stores at the moment. 4781 if (it->mayWriteToMemory()) { 4782 StoreInst *SI = dyn_cast<StoreInst>(it); 4783 // We only support predication of stores in basic blocks with one 4784 // predecessor. 
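// E.g. a conditional store such as "if (c[i]) A[i] = x;" is acceptable
// either while the pointer is known safe, the block has a single
// predecessor and the NumberOfStoresToPredicate budget is not exhausted
// (the store is later scalarized under its predicate), or when the target
// can emit a masked store or scatter for it; otherwise the block cannot be
// predicated.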
4785 if (!SI) 4786 return false; 4787 4788 bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); 4789 bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); 4790 4791 if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || 4792 !isSinglePredecessor) { 4793 // Build a masked store if it is legal for the target, otherwise 4794 // scalarize the block. 4795 bool isLegalMaskedOp = 4796 isLegalMaskedStore(SI->getValueOperand()->getType(), 4797 SI->getPointerOperand()) || 4798 isLegalMaskedScatter(SI->getValueOperand()->getType()); 4799 if (isLegalMaskedOp) { 4800 --NumPredStores; 4801 MaskedOp.insert(SI); 4802 continue; 4803 } 4804 return false; 4805 } 4806 } 4807 if (it->mayThrow()) 4808 return false; 4809 4810 // The instructions below can trap. 4811 switch (it->getOpcode()) { 4812 default: continue; 4813 case Instruction::UDiv: 4814 case Instruction::SDiv: 4815 case Instruction::URem: 4816 case Instruction::SRem: 4817 return false; 4818 } 4819 } 4820 4821 return true; 4822 } 4823 4824 void InterleavedAccessInfo::collectConstStridedAccesses( 4825 MapVector<Instruction *, StrideDescriptor> &StrideAccesses, 4826 const ValueToValueMap &Strides) { 4827 // Holds load/store instructions in program order. 4828 SmallVector<Instruction *, 16> AccessList; 4829 4830 for (auto *BB : TheLoop->getBlocks()) { 4831 bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); 4832 4833 for (auto &I : *BB) { 4834 if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I)) 4835 continue; 4836 // FIXME: Currently we can't handle mixed accesses and predicated accesses 4837 if (IsPred) 4838 return; 4839 4840 AccessList.push_back(&I); 4841 } 4842 } 4843 4844 if (AccessList.empty()) 4845 return; 4846 4847 auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); 4848 for (auto I : AccessList) { 4849 LoadInst *LI = dyn_cast<LoadInst>(I); 4850 StoreInst *SI = dyn_cast<StoreInst>(I); 4851 4852 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 4853 int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); 4854 4855 // The factor of the corresponding interleave group. 4856 unsigned Factor = std::abs(Stride); 4857 4858 // Ignore the access if the factor is too small or too large. 4859 if (Factor < 2 || Factor > MaxInterleaveGroupFactor) 4860 continue; 4861 4862 const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); 4863 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 4864 unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); 4865 4866 // An alignment of 0 means target ABI alignment. 4867 unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); 4868 if (!Align) 4869 Align = DL.getABITypeAlignment(PtrTy->getElementType()); 4870 4871 StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); 4872 } 4873 } 4874 4875 // Analyze interleaved accesses and collect them into interleave groups. 4876 // 4877 // Notice that the vectorization on interleaved groups will change instruction 4878 // orders and may break dependences. But the memory dependence check guarantees 4879 // that there is no overlap between two pointers of different strides, element 4880 // sizes or underlying bases. 4881 // 4882 // For pointers sharing the same stride, element size and underlying base, no 4883 // need to worry about Read-After-Write dependences and Write-After-Read 4884 // dependences. 4885 // 4886 // E.g. 
The RAW dependence: A[i] = a; 4887 // b = A[i]; 4888 // This won't exist as it is a store-load forwarding conflict, which has 4889 // already been checked and forbidden in the dependence check. 4890 // 4891 // E.g. The WAR dependence: a = A[i]; // (1) 4892 // A[i] = b; // (2) 4893 // The store group of (2) is always inserted at or below (2), and the load group 4894 // of (1) is always inserted at or above (1). The dependence is safe. 4895 void InterleavedAccessInfo::analyzeInterleaving( 4896 const ValueToValueMap &Strides) { 4897 DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); 4898 4899 // Holds all the stride accesses. 4900 MapVector<Instruction *, StrideDescriptor> StrideAccesses; 4901 collectConstStridedAccesses(StrideAccesses, Strides); 4902 4903 if (StrideAccesses.empty()) 4904 return; 4905 4906 // Holds all interleaved store groups temporarily. 4907 SmallSetVector<InterleaveGroup *, 4> StoreGroups; 4908 // Holds all interleaved load groups temporarily. 4909 SmallSetVector<InterleaveGroup *, 4> LoadGroups; 4910 4911 // Search the load-load/write-write pair B-A in bottom-up order and try to 4912 // insert B into the interleave group of A according to 3 rules: 4913 // 1. A and B have the same stride. 4914 // 2. A and B have the same memory object size. 4915 // 3. B belongs to the group according to the distance. 4916 // 4917 // The bottom-up order can avoid breaking the Write-After-Write dependences 4918 // between two pointers of the same base. 4919 // E.g. A[i] = a; (1) 4920 // A[i] = b; (2) 4921 // A[i+1] = c (3) 4922 // We form the group (2)+(3) in front, so (1) has to form groups with accesses 4923 // above (1), which guarantees that (1) is always above (2). 4924 for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E; 4925 ++I) { 4926 Instruction *A = I->first; 4927 StrideDescriptor DesA = I->second; 4928 4929 InterleaveGroup *Group = getInterleaveGroup(A); 4930 if (!Group) { 4931 DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); 4932 Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); 4933 } 4934 4935 if (A->mayWriteToMemory()) 4936 StoreGroups.insert(Group); 4937 else 4938 LoadGroups.insert(Group); 4939 4940 for (auto II = std::next(I); II != E; ++II) { 4941 Instruction *B = II->first; 4942 StrideDescriptor DesB = II->second; 4943 4944 // Ignore if B is already in a group or B is a different memory operation. 4945 if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory()) 4946 continue; 4947 4948 // Check the rule 1 and 2. 4949 if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) 4950 continue; 4951 4952 // Calculate the distance and prepare for the rule 3. 4953 const SCEVConstant *DistToA = dyn_cast<SCEVConstant>( 4954 PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); 4955 if (!DistToA) 4956 continue; 4957 4958 int DistanceToA = DistToA->getAPInt().getSExtValue(); 4959 4960 // Skip if the distance is not multiple of size as they are not in the 4961 // same group. 4962 if (DistanceToA % static_cast<int>(DesA.Size)) 4963 continue; 4964 4965 // The index of B is the index of A plus the related index to A. 4966 int IndexB = 4967 Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size); 4968 4969 // Try to insert B into the group. 4970 if (Group->insertMember(B, IndexB, DesB.Align)) { 4971 DEBUG(dbgs() << "LV: Inserted:" << *B << '\n' 4972 << " into the interleave group with" << *A << '\n'); 4973 InterleaveGroupMap[B] = Group; 4974 4975 // Set the first load in program order as the insert position. 
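// (The bottom-up walk means each load member B inserted here comes earlier
//  in program order than the members already in the group, so the earliest
//  load seen so far is where the wide load will eventually be emitted.)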
4976 if (B->mayReadFromMemory()) 4977 Group->setInsertPos(B); 4978 } 4979 } // Iteration on instruction B 4980 } // Iteration on instruction A 4981 4982 // Remove interleaved store groups with gaps. 4983 for (InterleaveGroup *Group : StoreGroups) 4984 if (Group->getNumMembers() != Group->getFactor()) 4985 releaseGroup(Group); 4986 4987 // Remove interleaved load groups that don't have the first and last member. 4988 // This guarantees that we won't do speculative out of bounds loads. 4989 for (InterleaveGroup *Group : LoadGroups) 4990 if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1)) 4991 releaseGroup(Group); 4992 } 4993 4994 LoopVectorizationCostModel::VectorizationFactor 4995 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { 4996 // Width 1 means no vectorize 4997 VectorizationFactor Factor = { 1U, 0U }; 4998 if (OptForSize && Legal->getRuntimePointerChecking()->Need) { 4999 emitAnalysis(VectorizationReport() << 5000 "runtime pointer checks needed. Enable vectorization of this " 5001 "loop with '#pragma clang loop vectorize(enable)' when " 5002 "compiling with -Os/-Oz"); 5003 DEBUG(dbgs() << 5004 "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); 5005 return Factor; 5006 } 5007 5008 if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { 5009 emitAnalysis(VectorizationReport() << 5010 "store that is conditionally executed prevents vectorization"); 5011 DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); 5012 return Factor; 5013 } 5014 5015 // Find the trip count. 5016 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 5017 DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5018 5019 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5020 unsigned SmallestType, WidestType; 5021 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5022 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5023 unsigned MaxSafeDepDist = -1U; 5024 if (Legal->getMaxSafeDepDistBytes() != -1U) 5025 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5026 WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 5027 WidestRegister : MaxSafeDepDist); 5028 unsigned MaxVectorSize = WidestRegister / WidestType; 5029 5030 DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " 5031 << WidestType << " bits.\n"); 5032 DEBUG(dbgs() << "LV: The Widest register is: " 5033 << WidestRegister << " bits.\n"); 5034 5035 if (MaxVectorSize == 0) { 5036 DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5037 MaxVectorSize = 1; 5038 } 5039 5040 assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" 5041 " into one vector!"); 5042 5043 unsigned VF = MaxVectorSize; 5044 if (MaximizeBandwidth && !OptForSize) { 5045 // Collect all viable vectorization factors. 5046 SmallVector<unsigned, 8> VFs; 5047 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5048 for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) 5049 VFs.push_back(VS); 5050 5051 // For each VF calculate its register usage. 5052 auto RUs = calculateRegisterUsage(VFs); 5053 5054 // Select the largest VF which doesn't require more registers than existing 5055 // ones. 5056 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); 5057 for (int i = RUs.size() - 1; i >= 0; --i) { 5058 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { 5059 VF = VFs[i]; 5060 break; 5061 } 5062 } 5063 } 5064 5065 // If we optimize the program for size, avoid creating the tail loop. 
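// E.g. with a constant trip count of 16 and MaxVectorSize of 4, 16 % 4 == 0
// and the vector body covers every iteration, so no tail is needed; with a
// trip count of 17 the leftover iteration would require a scalar tail, and
// under -Os/-Oz we give up instead of emitting one.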
5066 if (OptForSize) { 5067 // If we are unable to calculate the trip count then don't try to vectorize. 5068 if (TC < 2) { 5069 emitAnalysis 5070 (VectorizationReport() << 5071 "unable to calculate the loop count due to complex control flow"); 5072 DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); 5073 return Factor; 5074 } 5075 5076 // Find the maximum SIMD width that can fit within the trip count. 5077 VF = TC % MaxVectorSize; 5078 5079 if (VF == 0) 5080 VF = MaxVectorSize; 5081 else { 5082 // If the trip count that we found modulo the vectorization factor is not 5083 // zero then we require a tail. 5084 emitAnalysis(VectorizationReport() << 5085 "cannot optimize for size and vectorize at the " 5086 "same time. Enable vectorization of this loop " 5087 "with '#pragma clang loop vectorize(enable)' " 5088 "when compiling with -Os/-Oz"); 5089 DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); 5090 return Factor; 5091 } 5092 } 5093 5094 int UserVF = Hints->getWidth(); 5095 if (UserVF != 0) { 5096 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 5097 DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 5098 5099 Factor.Width = UserVF; 5100 return Factor; 5101 } 5102 5103 float Cost = expectedCost(1); 5104 #ifndef NDEBUG 5105 const float ScalarCost = Cost; 5106 #endif /* NDEBUG */ 5107 unsigned Width = 1; 5108 DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5109 5110 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5111 // Ignore scalar width, because the user explicitly wants vectorization. 5112 if (ForceVectorization && VF > 1) { 5113 Width = 2; 5114 Cost = expectedCost(Width) / (float)Width; 5115 } 5116 5117 for (unsigned i=2; i <= VF; i*=2) { 5118 // Notice that the vector loop needs to be executed less times, so 5119 // we need to divide the cost of the vector loops by the width of 5120 // the vector elements. 5121 float VectorCost = expectedCost(i) / (float)i; 5122 DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << 5123 (int)VectorCost << ".\n"); 5124 if (VectorCost < Cost) { 5125 Cost = VectorCost; 5126 Width = i; 5127 } 5128 } 5129 5130 DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5131 << "LV: Vectorization seems to be not beneficial, " 5132 << "but was forced by a user.\n"); 5133 DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); 5134 Factor.Width = Width; 5135 Factor.Cost = Width * Cost; 5136 return Factor; 5137 } 5138 5139 std::pair<unsigned, unsigned> 5140 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5141 unsigned MinWidth = -1U; 5142 unsigned MaxWidth = 8; 5143 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5144 5145 // For each block. 5146 for (Loop::block_iterator bb = TheLoop->block_begin(), 5147 be = TheLoop->block_end(); bb != be; ++bb) { 5148 BasicBlock *BB = *bb; 5149 5150 // For each instruction in the loop. 5151 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 5152 Type *T = it->getType(); 5153 5154 // Skip ignored values. 5155 if (ValuesToIgnore.count(&*it)) 5156 continue; 5157 5158 // Only examine Loads, Stores and PHINodes. 5159 if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) 5160 continue; 5161 5162 // Examine PHI nodes that are reduction variables. Update the type to 5163 // account for the recurrence type. 
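// E.g. a reduction whose running value is truncated to i8 and re-extended
// on every iteration is recorded with an i8 recurrence type, so it
// contributes 8 bits here rather than the 32-bit width of its PHI node.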
5164 if (PHINode *PN = dyn_cast<PHINode>(it)) { 5165 if (!Legal->isReductionVariable(PN)) 5166 continue; 5167 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5168 T = RdxDesc.getRecurrenceType(); 5169 } 5170 5171 // Examine the stored values. 5172 if (StoreInst *ST = dyn_cast<StoreInst>(it)) 5173 T = ST->getValueOperand()->getType(); 5174 5175 // Ignore loaded pointer types and stored pointer types that are not 5176 // consecutive. However, we do want to take consecutive stores/loads of 5177 // pointer vectors into account. 5178 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) 5179 continue; 5180 5181 MinWidth = std::min(MinWidth, 5182 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5183 MaxWidth = std::max(MaxWidth, 5184 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5185 } 5186 } 5187 5188 return {MinWidth, MaxWidth}; 5189 } 5190 5191 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, 5192 unsigned VF, 5193 unsigned LoopCost) { 5194 5195 // -- The interleave heuristics -- 5196 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5197 // There are many micro-architectural considerations that we can't predict 5198 // at this level. For example, frontend pressure (on decode or fetch) due to 5199 // code size, or the number and capabilities of the execution ports. 5200 // 5201 // We use the following heuristics to select the interleave count: 5202 // 1. If the code has reductions, then we interleave to break the cross 5203 // iteration dependency. 5204 // 2. If the loop is really small, then we interleave to reduce the loop 5205 // overhead. 5206 // 3. We don't interleave if we think that we will spill registers to memory 5207 // due to the increased register pressure. 5208 5209 // When we optimize for size, we don't interleave. 5210 if (OptForSize) 5211 return 1; 5212 5213 // We used the distance for the interleave count. 5214 if (Legal->getMaxSafeDepDistBytes() != -1U) 5215 return 1; 5216 5217 // Do not interleave loops with a relatively small trip count. 5218 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 5219 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5220 return 1; 5221 5222 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5223 DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << 5224 " registers\n"); 5225 5226 if (VF == 1) { 5227 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5228 TargetNumRegisters = ForceTargetNumScalarRegs; 5229 } else { 5230 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5231 TargetNumRegisters = ForceTargetNumVectorRegs; 5232 } 5233 5234 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5235 // We divide by these constants so assume that we have at least one 5236 // instruction that uses at least one register. 5237 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5238 R.NumInstructions = std::max(R.NumInstructions, 1U); 5239 5240 // We calculate the interleave count using the following formula. 5241 // Subtract the number of loop invariants from the number of available 5242 // registers. These registers are used by all of the interleaved instances. 5243 // Next, divide the remaining registers by the number of registers that is 5244 // required by the loop, in order to estimate how many parallel instances 5245 // fit without causing spills. All of this is rounded down if necessary to be 5246 // a power of two. We want power of two interleave count to simplify any 5247 // addressing operations or alignment considerations. 
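// E.g. with, say, 16 available registers, 2 of them tied up by
// loop-invariant values and at most 5 values live inside the loop,
//   PowerOf2Floor((16 - 2) / 5) = PowerOf2Floor(2) = 2,
// so two interleaved copies of the loop body fit without spilling.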
5248 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5249 R.MaxLocalUsers); 5250 5251 // Don't count the induction variable as interleaved. 5252 if (EnableIndVarRegisterHeur) 5253 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5254 std::max(1U, (R.MaxLocalUsers - 1))); 5255 5256 // Clamp the interleave ranges to reasonable counts. 5257 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5258 5259 // Check if the user has overridden the max. 5260 if (VF == 1) { 5261 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5262 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5263 } else { 5264 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5265 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5266 } 5267 5268 // If we did not calculate the cost for VF (because the user selected the VF) 5269 // then we calculate the cost of VF here. 5270 if (LoopCost == 0) 5271 LoopCost = expectedCost(VF); 5272 5273 // Clamp the calculated IC to be between the 1 and the max interleave count 5274 // that the target allows. 5275 if (IC > MaxInterleaveCount) 5276 IC = MaxInterleaveCount; 5277 else if (IC < 1) 5278 IC = 1; 5279 5280 // Interleave if we vectorized this loop and there is a reduction that could 5281 // benefit from interleaving. 5282 if (VF > 1 && Legal->getReductionVars()->size()) { 5283 DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5284 return IC; 5285 } 5286 5287 // Note that if we've already vectorized the loop we will have done the 5288 // runtime check and so interleaving won't require further checks. 5289 bool InterleavingRequiresRuntimePointerCheck = 5290 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5291 5292 // We want to interleave small loops in order to reduce the loop overhead and 5293 // potentially expose ILP opportunities. 5294 DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5295 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5296 // We assume that the cost overhead is 1 and we use the cost model 5297 // to estimate the cost of the loop and interleave until the cost of the 5298 // loop overhead is about 5% of the cost of the loop. 5299 unsigned SmallIC = 5300 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5301 5302 // Interleave until store/load ports (estimated by max interleave count) are 5303 // saturated. 5304 unsigned NumStores = Legal->getNumStores(); 5305 unsigned NumLoads = Legal->getNumLoads(); 5306 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5307 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5308 5309 // If we have a scalar reduction (vector reductions are already dealt with 5310 // by this point), we can increase the critical path length if the loop 5311 // we're interleaving is inside another loop. Limit, by default to 2, so the 5312 // critical path only gets increased by one reduction operation. 
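// E.g. interleaving such a loop by 4 would leave up to three extra scalar
// reduction operations to combine the partial results on every outer-loop
// iteration; clamping to MaxNestedScalarReductionIC (2 by default) leaves
// only one.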
5313 if (Legal->getReductionVars()->size() && 5314 TheLoop->getLoopDepth() > 1) { 5315 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5316 SmallIC = std::min(SmallIC, F); 5317 StoresIC = std::min(StoresIC, F); 5318 LoadsIC = std::min(LoadsIC, F); 5319 } 5320 5321 if (EnableLoadStoreRuntimeInterleave && 5322 std::max(StoresIC, LoadsIC) > SmallIC) { 5323 DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5324 return std::max(StoresIC, LoadsIC); 5325 } 5326 5327 DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5328 return SmallIC; 5329 } 5330 5331 // Interleave if this is a large loop (small loops are already dealt with by 5332 // this point) that could benefit from interleaving. 5333 bool HasReductions = (Legal->getReductionVars()->size() > 0); 5334 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5335 DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5336 return IC; 5337 } 5338 5339 DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5340 return 1; 5341 } 5342 5343 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5344 LoopVectorizationCostModel::calculateRegisterUsage( 5345 const SmallVector<unsigned, 8> &VFs) { 5346 // This function calculates the register usage by measuring the highest number 5347 // of values that are alive at a single location. Obviously, this is a very 5348 // rough estimation. We scan the loop in a topological order in order and 5349 // assign a number to each instruction. We use RPO to ensure that defs are 5350 // met before their users. We assume that each instruction that has in-loop 5351 // users starts an interval. We record every time that an in-loop value is 5352 // used, so we have a list of the first and last occurrences of each 5353 // instruction. Next, we transpose this data structure into a multi map that 5354 // holds the list of intervals that *end* at a specific location. This multi 5355 // map allows us to perform a linear search. We scan the instructions linearly 5356 // and record each time that a new interval starts, by placing it in a set. 5357 // If we find this value in the multi-map then we remove it from the set. 5358 // The max register usage is the maximum size of the set. 5359 // We also search for instructions that are defined outside the loop, but are 5360 // used inside the loop. We need this number separately from the max-interval 5361 // usage number because when we unroll, loop-invariant values do not take 5362 // more register. 5363 LoopBlocksDFS DFS(TheLoop); 5364 DFS.perform(LI); 5365 5366 RegisterUsage RU; 5367 RU.NumInstructions = 0; 5368 5369 // Each 'key' in the map opens a new interval. The values 5370 // of the map are the index of the 'last seen' usage of the 5371 // instruction that is the key. 5372 typedef DenseMap<Instruction*, unsigned> IntervalMap; 5373 // Maps instruction to its index. 5374 DenseMap<unsigned, Instruction*> IdxToInstr; 5375 // Marks the end of each interval. 5376 IntervalMap EndPoint; 5377 // Saves the list of instruction indices that are used in the loop. 5378 SmallSet<Instruction*, 8> Ends; 5379 // Saves the list of values that are used in the loop but are 5380 // defined outside the loop, such as arguments and constants. 
5381 SmallPtrSet<Value*, 8> LoopInvariants; 5382 5383 unsigned Index = 0; 5384 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 5385 be = DFS.endRPO(); bb != be; ++bb) { 5386 RU.NumInstructions += (*bb)->size(); 5387 for (Instruction &I : **bb) { 5388 IdxToInstr[Index++] = &I; 5389 5390 // Save the end location of each USE. 5391 for (unsigned i = 0; i < I.getNumOperands(); ++i) { 5392 Value *U = I.getOperand(i); 5393 Instruction *Instr = dyn_cast<Instruction>(U); 5394 5395 // Ignore non-instruction values such as arguments, constants, etc. 5396 if (!Instr) continue; 5397 5398 // If this instruction is outside the loop then record it and continue. 5399 if (!TheLoop->contains(Instr)) { 5400 LoopInvariants.insert(Instr); 5401 continue; 5402 } 5403 5404 // Overwrite previous end points. 5405 EndPoint[Instr] = Index; 5406 Ends.insert(Instr); 5407 } 5408 } 5409 } 5410 5411 // Saves the list of intervals that end with the index in 'key'. 5412 typedef SmallVector<Instruction*, 2> InstrList; 5413 DenseMap<unsigned, InstrList> TransposeEnds; 5414 5415 // Transpose the EndPoints to a list of values that end at each index. 5416 for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); 5417 it != e; ++it) 5418 TransposeEnds[it->second].push_back(it->first); 5419 5420 SmallSet<Instruction*, 8> OpenIntervals; 5421 5422 // Get the size of the widest register. 5423 unsigned MaxSafeDepDist = -1U; 5424 if (Legal->getMaxSafeDepDistBytes() != -1U) 5425 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5426 unsigned WidestRegister = 5427 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5428 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5429 5430 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5431 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5432 5433 DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5434 5435 // A lambda that gets the register usage for the given type and VF. 5436 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5437 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5438 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5439 }; 5440 5441 for (unsigned int i = 0; i < Index; ++i) { 5442 Instruction *I = IdxToInstr[i]; 5443 // Ignore instructions that are never used within the loop. 5444 if (!Ends.count(I)) continue; 5445 5446 // Skip ignored values. 5447 if (ValuesToIgnore.count(I)) 5448 continue; 5449 5450 // Remove all of the instructions that end at this location. 5451 InstrList &List = TransposeEnds[i]; 5452 for (unsigned int j = 0, e = List.size(); j < e; ++j) 5453 OpenIntervals.erase(List[j]); 5454 5455 // For each VF find the maximum usage of registers. 5456 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5457 if (VFs[j] == 1) { 5458 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5459 continue; 5460 } 5461 5462 // Count the number of live intervals. 5463 unsigned RegUsage = 0; 5464 for (auto Inst : OpenIntervals) 5465 RegUsage += GetRegUsage(Inst->getType(), VFs[j]); 5466 MaxUsages[j] = std::max(MaxUsages[j], RegUsage); 5467 } 5468 5469 DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5470 << OpenIntervals.size() << '\n'); 5471 5472 // Add the current instruction to the list of open intervals. 
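// (An instruction therefore stays in OpenIntervals from the index at which
//  it is defined until the index of its last in-loop use, which is exactly
//  the live range the register-pressure estimate is based on.)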
5473 OpenIntervals.insert(I); 5474 } 5475 5476 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5477 unsigned Invariant = 0; 5478 if (VFs[i] == 1) 5479 Invariant = LoopInvariants.size(); 5480 else { 5481 for (auto Inst : LoopInvariants) 5482 Invariant += GetRegUsage(Inst->getType(), VFs[i]); 5483 } 5484 5485 DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); 5486 DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); 5487 DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); 5488 DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); 5489 5490 RU.LoopInvariantRegs = Invariant; 5491 RU.MaxLocalUsers = MaxUsages[i]; 5492 RUs[i] = RU; 5493 } 5494 5495 return RUs; 5496 } 5497 5498 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 5499 unsigned Cost = 0; 5500 5501 // For each block. 5502 for (Loop::block_iterator bb = TheLoop->block_begin(), 5503 be = TheLoop->block_end(); bb != be; ++bb) { 5504 unsigned BlockCost = 0; 5505 BasicBlock *BB = *bb; 5506 5507 // For each instruction in the old loop. 5508 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 5509 // Skip dbg intrinsics. 5510 if (isa<DbgInfoIntrinsic>(it)) 5511 continue; 5512 5513 // Skip ignored values. 5514 if (ValuesToIgnore.count(&*it)) 5515 continue; 5516 5517 unsigned C = getInstructionCost(&*it, VF); 5518 5519 // Check if we should override the cost. 5520 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5521 C = ForceTargetInstructionCost; 5522 5523 BlockCost += C; 5524 DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << 5525 VF << " For instruction: " << *it << '\n'); 5526 } 5527 5528 // We assume that if-converted blocks have a 50% chance of being executed. 5529 // When the code is scalar then some of the blocks are avoided due to CF. 5530 // When the code is vectorized we execute all code paths. 5531 if (VF == 1 && Legal->blockNeedsPredication(*bb)) 5532 BlockCost /= 2; 5533 5534 Cost += BlockCost; 5535 } 5536 5537 return Cost; 5538 } 5539 5540 /// \brief Check if the load/store instruction \p I may be translated into 5541 /// gather/scatter during vectorization. 5542 /// 5543 /// Pointer \p Ptr specifies address in memory for the given scalar memory 5544 /// instruction. We need it to retrieve data type. 5545 /// Using gather/scatter is possible when it is supported by target. 5546 static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, 5547 LoopVectorizationLegality *Legal) { 5548 Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType(); 5549 return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) || 5550 (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy)); 5551 } 5552 5553 /// \brief Check whether the address computation for a non-consecutive memory 5554 /// access looks like an unlikely candidate for being merged into the indexing 5555 /// mode. 5556 /// 5557 /// We look for a GEP which has one index that is an induction variable and all 5558 /// other indices are loop invariant. If the stride of this access is also 5559 /// within a small bound we decide that this address computation can likely be 5560 /// merged into the addressing mode. 5561 /// In all other cases, we identify the address computation as complex. 
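/// E.g. "getelementptr inbounds float, float* %A, i64 %iv" with a small
/// constant step is treated as cheap (it is likely folded into the
/// addressing mode), whereas a GEP with a non-invariant index that is not
/// an induction variable, or with a step larger than 64 bytes, is
/// classified as a complex address computation.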
5562 static bool isLikelyComplexAddressComputation(Value *Ptr, 5563 LoopVectorizationLegality *Legal, 5564 ScalarEvolution *SE, 5565 const Loop *TheLoop) { 5566 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5567 if (!Gep) 5568 return true; 5569 5570 // We are looking for a gep with all loop invariant indices except for one 5571 // which should be an induction variable. 5572 unsigned NumOperands = Gep->getNumOperands(); 5573 for (unsigned i = 1; i < NumOperands; ++i) { 5574 Value *Opd = Gep->getOperand(i); 5575 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5576 !Legal->isInductionVariable(Opd)) 5577 return true; 5578 } 5579 5580 // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step 5581 // can likely be merged into the address computation. 5582 unsigned MaxMergeDistance = 64; 5583 5584 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); 5585 if (!AddRec) 5586 return true; 5587 5588 // Check the step is constant. 5589 const SCEV *Step = AddRec->getStepRecurrence(*SE); 5590 // Calculate the pointer stride and check if it is consecutive. 5591 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 5592 if (!C) 5593 return true; 5594 5595 const APInt &APStepVal = C->getAPInt(); 5596 5597 // Huge step value - give up. 5598 if (APStepVal.getBitWidth() > 64) 5599 return true; 5600 5601 int64_t StepVal = APStepVal.getSExtValue(); 5602 5603 return StepVal > MaxMergeDistance; 5604 } 5605 5606 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5607 return Legal->hasStride(I->getOperand(0)) || 5608 Legal->hasStride(I->getOperand(1)); 5609 } 5610 5611 unsigned 5612 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5613 // If we know that this instruction will remain uniform, check the cost of 5614 // the scalar version. 5615 if (Legal->isUniformAfterVectorization(I)) 5616 VF = 1; 5617 5618 Type *RetTy = I->getType(); 5619 if (VF > 1 && MinBWs.count(I)) 5620 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 5621 Type *VectorTy = ToVectorTy(RetTy, VF); 5622 5623 // TODO: We need to estimate the cost of intrinsic calls. 5624 switch (I->getOpcode()) { 5625 case Instruction::GetElementPtr: 5626 // We mark this instruction as zero-cost because the cost of GEPs in 5627 // vectorized code depends on whether the corresponding memory instruction 5628 // is scalarized or not. Therefore, we handle GEPs with the memory 5629 // instruction cost. 5630 return 0; 5631 case Instruction::Br: { 5632 return TTI.getCFInstrCost(I->getOpcode()); 5633 } 5634 case Instruction::PHI: { 5635 auto *Phi = cast<PHINode>(I); 5636 5637 // First-order recurrences are replaced by vector shuffles inside the loop. 5638 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 5639 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 5640 VectorTy, VF - 1, VectorTy); 5641 5642 // TODO: IF-converted IFs become selects. 
5643 return 0; 5644 } 5645 case Instruction::Add: 5646 case Instruction::FAdd: 5647 case Instruction::Sub: 5648 case Instruction::FSub: 5649 case Instruction::Mul: 5650 case Instruction::FMul: 5651 case Instruction::UDiv: 5652 case Instruction::SDiv: 5653 case Instruction::FDiv: 5654 case Instruction::URem: 5655 case Instruction::SRem: 5656 case Instruction::FRem: 5657 case Instruction::Shl: 5658 case Instruction::LShr: 5659 case Instruction::AShr: 5660 case Instruction::And: 5661 case Instruction::Or: 5662 case Instruction::Xor: { 5663 // Since we will replace the stride by 1 the multiplication should go away. 5664 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 5665 return 0; 5666 // Certain instructions can be cheaper to vectorize if they have a constant 5667 // second vector operand. One example of this are shifts on x86. 5668 TargetTransformInfo::OperandValueKind Op1VK = 5669 TargetTransformInfo::OK_AnyValue; 5670 TargetTransformInfo::OperandValueKind Op2VK = 5671 TargetTransformInfo::OK_AnyValue; 5672 TargetTransformInfo::OperandValueProperties Op1VP = 5673 TargetTransformInfo::OP_None; 5674 TargetTransformInfo::OperandValueProperties Op2VP = 5675 TargetTransformInfo::OP_None; 5676 Value *Op2 = I->getOperand(1); 5677 5678 // Check for a splat of a constant or for a non uniform vector of constants. 5679 if (isa<ConstantInt>(Op2)) { 5680 ConstantInt *CInt = cast<ConstantInt>(Op2); 5681 if (CInt && CInt->getValue().isPowerOf2()) 5682 Op2VP = TargetTransformInfo::OP_PowerOf2; 5683 Op2VK = TargetTransformInfo::OK_UniformConstantValue; 5684 } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { 5685 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 5686 Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); 5687 if (SplatValue) { 5688 ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); 5689 if (CInt && CInt->getValue().isPowerOf2()) 5690 Op2VP = TargetTransformInfo::OP_PowerOf2; 5691 Op2VK = TargetTransformInfo::OK_UniformConstantValue; 5692 } 5693 } 5694 5695 return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, 5696 Op1VP, Op2VP); 5697 } 5698 case Instruction::Select: { 5699 SelectInst *SI = cast<SelectInst>(I); 5700 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 5701 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 5702 Type *CondTy = SI->getCondition()->getType(); 5703 if (!ScalarCond) 5704 CondTy = VectorType::get(CondTy, VF); 5705 5706 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 5707 } 5708 case Instruction::ICmp: 5709 case Instruction::FCmp: { 5710 Type *ValTy = I->getOperand(0)->getType(); 5711 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 5712 auto It = MinBWs.find(Op0AsInstruction); 5713 if (VF > 1 && It != MinBWs.end()) 5714 ValTy = IntegerType::get(ValTy->getContext(), It->second); 5715 VectorTy = ToVectorTy(ValTy, VF); 5716 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); 5717 } 5718 case Instruction::Store: 5719 case Instruction::Load: { 5720 StoreInst *SI = dyn_cast<StoreInst>(I); 5721 LoadInst *LI = dyn_cast<LoadInst>(I); 5722 Type *ValTy = (SI ? SI->getValueOperand()->getType() : 5723 LI->getType()); 5724 VectorTy = ToVectorTy(ValTy, VF); 5725 5726 unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); 5727 unsigned AS = SI ? SI->getPointerAddressSpace() : 5728 LI->getPointerAddressSpace(); 5729 Value *Ptr = SI ? 
SI->getPointerOperand() : LI->getPointerOperand(); 5730 // We add the cost of address computation here instead of with the gep 5731 // instruction because only here we know whether the operation is 5732 // scalarized. 5733 if (VF == 1) 5734 return TTI.getAddressComputationCost(VectorTy) + 5735 TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5736 5737 // For an interleaved access, calculate the total cost of the whole 5738 // interleave group. 5739 if (Legal->isAccessInterleaved(I)) { 5740 auto Group = Legal->getInterleavedAccessGroup(I); 5741 assert(Group && "Fail to get an interleaved access group."); 5742 5743 // Only calculate the cost once at the insert position. 5744 if (Group->getInsertPos() != I) 5745 return 0; 5746 5747 unsigned InterleaveFactor = Group->getFactor(); 5748 Type *WideVecTy = 5749 VectorType::get(VectorTy->getVectorElementType(), 5750 VectorTy->getVectorNumElements() * InterleaveFactor); 5751 5752 // Holds the indices of existing members in an interleaved load group. 5753 // An interleaved store group doesn't need this as it dones't allow gaps. 5754 SmallVector<unsigned, 4> Indices; 5755 if (LI) { 5756 for (unsigned i = 0; i < InterleaveFactor; i++) 5757 if (Group->getMember(i)) 5758 Indices.push_back(i); 5759 } 5760 5761 // Calculate the cost of the whole interleaved group. 5762 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5763 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5764 Group->getAlignment(), AS); 5765 5766 if (Group->isReverse()) 5767 Cost += 5768 Group->getNumMembers() * 5769 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5770 5771 // FIXME: The interleaved load group with a huge gap could be even more 5772 // expensive than scalar operations. Then we could ignore such group and 5773 // use scalar operations instead. 5774 return Cost; 5775 } 5776 5777 // Scalarized loads/stores. 5778 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5779 bool UseGatherOrScatter = (ConsecutiveStride == 0) && 5780 isGatherOrScatterLegal(I, Ptr, Legal); 5781 5782 bool Reverse = ConsecutiveStride < 0; 5783 const DataLayout &DL = I->getModule()->getDataLayout(); 5784 unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); 5785 unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; 5786 if ((!ConsecutiveStride && !UseGatherOrScatter) || 5787 ScalarAllocatedSize != VectorElementSize) { 5788 bool IsComplexComputation = 5789 isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); 5790 unsigned Cost = 0; 5791 // The cost of extracting from the value vector and pointer vector. 5792 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5793 for (unsigned i = 0; i < VF; ++i) { 5794 // The cost of extracting the pointer operand. 5795 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); 5796 // In case of STORE, the cost of ExtractElement from the vector. 5797 // In case of LOAD, the cost of InsertElement into the returned 5798 // vector. 5799 Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : 5800 Instruction::InsertElement, 5801 VectorTy, i); 5802 } 5803 5804 // The cost of the scalar loads/stores. 
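// E.g. scalarizing a store of a <4 x float> value costs four extracts of
// the pointer vector and four extracts of the stored value (added above),
// plus the four address computations and four scalar stores added below.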
5805 Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); 5806 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5807 Alignment, AS); 5808 return Cost; 5809 } 5810 5811 unsigned Cost = TTI.getAddressComputationCost(VectorTy); 5812 if (UseGatherOrScatter) { 5813 assert(ConsecutiveStride == 0 && 5814 "Gather/Scatter are not used for consecutive stride"); 5815 return Cost + 5816 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5817 Legal->isMaskRequired(I), Alignment); 5818 } 5819 // Wide load/stores. 5820 if (Legal->isMaskRequired(I)) 5821 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, 5822 AS); 5823 else 5824 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5825 5826 if (Reverse) 5827 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 5828 VectorTy, 0); 5829 return Cost; 5830 } 5831 case Instruction::ZExt: 5832 case Instruction::SExt: 5833 case Instruction::FPToUI: 5834 case Instruction::FPToSI: 5835 case Instruction::FPExt: 5836 case Instruction::PtrToInt: 5837 case Instruction::IntToPtr: 5838 case Instruction::SIToFP: 5839 case Instruction::UIToFP: 5840 case Instruction::Trunc: 5841 case Instruction::FPTrunc: 5842 case Instruction::BitCast: { 5843 // We optimize the truncation of induction variable. 5844 // The cost of these is the same as the scalar operation. 5845 if (I->getOpcode() == Instruction::Trunc && 5846 Legal->isInductionVariable(I->getOperand(0))) 5847 return TTI.getCastInstrCost(I->getOpcode(), I->getType(), 5848 I->getOperand(0)->getType()); 5849 5850 Type *SrcScalarTy = I->getOperand(0)->getType(); 5851 Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); 5852 if (VF > 1 && MinBWs.count(I)) { 5853 // This cast is going to be shrunk. This may remove the cast or it might 5854 // turn it into slightly different cast. For example, if MinBW == 16, 5855 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 5856 // 5857 // Calculate the modified src and dest types. 5858 Type *MinVecTy = VectorTy; 5859 if (I->getOpcode() == Instruction::Trunc) { 5860 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 5861 VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), 5862 MinVecTy); 5863 } else if (I->getOpcode() == Instruction::ZExt || 5864 I->getOpcode() == Instruction::SExt) { 5865 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 5866 VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), 5867 MinVecTy); 5868 } 5869 } 5870 5871 return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 5872 } 5873 case Instruction::Call: { 5874 bool NeedToScalarize; 5875 CallInst *CI = cast<CallInst>(I); 5876 unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); 5877 if (getIntrinsicIDForCall(CI, TLI)) 5878 return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); 5879 return CallCost; 5880 } 5881 default: { 5882 // We are scalarizing the instruction. Return the cost of the scalar 5883 // instruction, plus the cost of insert and extract into vector 5884 // elements, times the vector width. 5885 unsigned Cost = 0; 5886 5887 if (!RetTy->isVoidTy() && VF != 1) { 5888 unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, 5889 VectorTy); 5890 unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, 5891 VectorTy); 5892 5893 // The cost of inserting the results plus extracting each one of the 5894 // operands. 
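// E.g. for VF = 4 and a two-operand instruction this adds 4 result inserts
// and 4 * 2 = 8 operand extracts on top of the four scalar copies of the
// operation costed below (using 'mul' as a stand-in for the unknown opcode).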
5895 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 5896 } 5897 5898 // The cost of executing VF copies of the scalar instruction. This opcode 5899 // is unknown. Assume that it is the same as 'mul'. 5900 Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 5901 return Cost; 5902 } 5903 }// end of switch. 5904 } 5905 5906 char LoopVectorize::ID = 0; 5907 static const char lv_name[] = "Loop Vectorization"; 5908 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 5909 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 5910 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 5911 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 5912 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 5913 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 5914 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 5915 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5916 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 5917 INITIALIZE_PASS_DEPENDENCY(LCSSA) 5918 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 5919 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 5920 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 5921 INITIALIZE_PASS_DEPENDENCY(DemandedBits) 5922 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 5923 5924 namespace llvm { 5925 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { 5926 return new LoopVectorize(NoUnrolling, AlwaysVectorize); 5927 } 5928 } 5929 5930 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 5931 // Check for a store. 5932 if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) 5933 return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; 5934 5935 // Check for a load. 5936 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 5937 return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; 5938 5939 return false; 5940 } 5941 5942 5943 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, 5944 bool IfPredicateStore) { 5945 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 5946 // Holds vector parameters or scalars, in case of uniform vals. 5947 SmallVector<VectorParts, 4> Params; 5948 5949 setDebugLocFromInst(Builder, Instr); 5950 5951 // Find all of the vectorized parameters. 5952 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5953 Value *SrcOp = Instr->getOperand(op); 5954 5955 // If we are accessing the old induction variable, use the new one. 5956 if (SrcOp == OldInduction) { 5957 Params.push_back(getVectorValue(SrcOp)); 5958 continue; 5959 } 5960 5961 // Try using previously calculated values. 5962 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 5963 5964 // If the src is an instruction that appeared earlier in the basic block 5965 // then it should already be vectorized. 5966 if (SrcInst && OrigLoop->contains(SrcInst)) { 5967 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 5968 // The parameter is a vector value from earlier. 5969 Params.push_back(WidenMap.get(SrcInst)); 5970 } else { 5971 // The parameter is a scalar from outside the loop. Maybe even a constant. 5972 VectorParts Scalars; 5973 Scalars.append(UF, SrcOp); 5974 Params.push_back(Scalars); 5975 } 5976 } 5977 5978 assert(Params.size() == Instr->getNumOperands() && 5979 "Invalid number of operands"); 5980 5981 // Does this instruction return a value ? 5982 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 5983 5984 Value *UndefVec = IsVoidRetTy ? 
nullptr : 5985 UndefValue::get(Instr->getType()); 5986 // Create a new entry in the WidenMap and initialize it to Undef or Null. 5987 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 5988 5989 VectorParts Cond; 5990 if (IfPredicateStore) { 5991 assert(Instr->getParent()->getSinglePredecessor() && 5992 "Only support single predecessor blocks"); 5993 Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), 5994 Instr->getParent()); 5995 } 5996 5997 // For each vector unroll 'part': 5998 for (unsigned Part = 0; Part < UF; ++Part) { 5999 // For each scalar that we create: 6000 6001 // Start an "if (pred) a[i] = ..." block. 6002 Value *Cmp = nullptr; 6003 if (IfPredicateStore) { 6004 if (Cond[Part]->getType()->isVectorTy()) 6005 Cond[Part] = 6006 Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); 6007 Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], 6008 ConstantInt::get(Cond[Part]->getType(), 1)); 6009 } 6010 6011 Instruction *Cloned = Instr->clone(); 6012 if (!IsVoidRetTy) 6013 Cloned->setName(Instr->getName() + ".cloned"); 6014 // Replace the operands of the cloned instructions with extracted scalars. 6015 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 6016 Value *Op = Params[op][Part]; 6017 Cloned->setOperand(op, Op); 6018 } 6019 6020 // Place the cloned scalar in the new loop. 6021 Builder.Insert(Cloned); 6022 6023 // If the original scalar returns a value we need to place it in a vector 6024 // so that future users will be able to use it. 6025 if (!IsVoidRetTy) 6026 VecResults[Part] = Cloned; 6027 6028 // End if-block. 6029 if (IfPredicateStore) 6030 PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), 6031 Cmp)); 6032 } 6033 } 6034 6035 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { 6036 StoreInst *SI = dyn_cast<StoreInst>(Instr); 6037 bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); 6038 6039 return scalarizeInstruction(Instr, IfPredicateStore); 6040 } 6041 6042 Value *InnerLoopUnroller::reverseVector(Value *Vec) { 6043 return Vec; 6044 } 6045 6046 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { 6047 return V; 6048 } 6049 6050 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { 6051 // When unrolling and the VF is 1, we only need to add a simple scalar. 6052 Type *ITy = Val->getType(); 6053 assert(!ITy->isVectorTy() && "Val must be a scalar"); 6054 Constant *C = ConstantInt::get(ITy, StartIdx); 6055 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6056 } 6057
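// E.g. getStepVector(%iv, 2, %step) with a scalar %iv simply returns
// "%iv + 2 * %step": with VF == 1 each unrolled copy of the body only needs
// the scalar induction value offset by its part number times the step.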