//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
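//
// As a small illustration of the widening described above (a sketch, not code
// generated by this file): with a vectorization factor of 4, a loop such as
//
//   for (i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// is conceptually rewritten into
//
//   for (i = 0; i + 3 < n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3];   // one wide iteration per 4 scalar ones
//
// with a scalar epilogue loop handling the remaining iterations when n is not
// a multiple of 4.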
46 // 47 //===----------------------------------------------------------------------===// 48 49 #include "llvm/Transforms/Vectorize.h" 50 #include "llvm/ADT/DenseMap.h" 51 #include "llvm/ADT/EquivalenceClasses.h" 52 #include "llvm/ADT/Hashing.h" 53 #include "llvm/ADT/MapVector.h" 54 #include "llvm/ADT/SetVector.h" 55 #include "llvm/ADT/SmallPtrSet.h" 56 #include "llvm/ADT/SmallSet.h" 57 #include "llvm/ADT/SmallVector.h" 58 #include "llvm/ADT/Statistic.h" 59 #include "llvm/ADT/StringExtras.h" 60 #include "llvm/Analysis/AliasAnalysis.h" 61 #include "llvm/Analysis/AliasSetTracker.h" 62 #include "llvm/Analysis/AssumptionCache.h" 63 #include "llvm/Analysis/BlockFrequencyInfo.h" 64 #include "llvm/Analysis/CodeMetrics.h" 65 #include "llvm/Analysis/LoopAccessAnalysis.h" 66 #include "llvm/Analysis/LoopInfo.h" 67 #include "llvm/Analysis/LoopIterator.h" 68 #include "llvm/Analysis/LoopPass.h" 69 #include "llvm/Analysis/ScalarEvolution.h" 70 #include "llvm/Analysis/ScalarEvolutionExpander.h" 71 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 72 #include "llvm/Analysis/TargetTransformInfo.h" 73 #include "llvm/Analysis/ValueTracking.h" 74 #include "llvm/IR/Constants.h" 75 #include "llvm/IR/DataLayout.h" 76 #include "llvm/IR/DebugInfo.h" 77 #include "llvm/IR/DerivedTypes.h" 78 #include "llvm/IR/DiagnosticInfo.h" 79 #include "llvm/IR/Dominators.h" 80 #include "llvm/IR/Function.h" 81 #include "llvm/IR/IRBuilder.h" 82 #include "llvm/IR/Instructions.h" 83 #include "llvm/IR/IntrinsicInst.h" 84 #include "llvm/IR/LLVMContext.h" 85 #include "llvm/IR/Module.h" 86 #include "llvm/IR/PatternMatch.h" 87 #include "llvm/IR/Type.h" 88 #include "llvm/IR/Value.h" 89 #include "llvm/IR/ValueHandle.h" 90 #include "llvm/IR/Verifier.h" 91 #include "llvm/Pass.h" 92 #include "llvm/Support/BranchProbability.h" 93 #include "llvm/Support/CommandLine.h" 94 #include "llvm/Support/Debug.h" 95 #include "llvm/Support/raw_ostream.h" 96 #include "llvm/Transforms/Scalar.h" 97 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 98 #include "llvm/Transforms/Utils/Local.h" 99 #include "llvm/Analysis/VectorUtils.h" 100 #include "llvm/Transforms/Utils/LoopUtils.h" 101 #include <algorithm> 102 #include <map> 103 #include <tuple> 104 105 using namespace llvm; 106 using namespace llvm::PatternMatch; 107 108 #define LV_NAME "loop-vectorize" 109 #define DEBUG_TYPE LV_NAME 110 111 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 112 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 113 114 static cl::opt<bool> 115 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 116 cl::desc("Enable if-conversion during vectorization.")); 117 118 /// We don't vectorize loops with a known constant trip count below this number. 119 static cl::opt<unsigned> 120 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), 121 cl::Hidden, 122 cl::desc("Don't vectorize loops with a constant " 123 "trip count that is smaller than this " 124 "value.")); 125 126 /// This enables versioning on the strides of symbolically striding memory 127 /// accesses in code like the following. 128 /// for (i = 0; i < N; ++i) 129 /// A[i * Stride1] += B[i * Stride2] ... 130 /// 131 /// Will be roughly translated to 132 /// if (Stride1 == 1 && Stride2 == 1) { 133 /// for (i = 0; i < N; i+=4) 134 /// A[i:i+3] += ... 135 /// } else 136 /// ... 
137 static cl::opt<bool> EnableMemAccessVersioning( 138 "enable-mem-access-versioning", cl::init(true), cl::Hidden, 139 cl::desc("Enable symblic stride memory access versioning")); 140 141 static cl::opt<bool> EnableInterleavedMemAccesses( 142 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 143 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 144 145 /// Maximum factor for an interleaved memory access. 146 static cl::opt<unsigned> MaxInterleaveGroupFactor( 147 "max-interleave-group-factor", cl::Hidden, 148 cl::desc("Maximum factor for an interleaved access group (default = 8)"), 149 cl::init(8)); 150 151 /// We don't interleave loops with a known constant trip count below this 152 /// number. 153 static const unsigned TinyTripCountInterleaveThreshold = 128; 154 155 static cl::opt<unsigned> ForceTargetNumScalarRegs( 156 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 157 cl::desc("A flag that overrides the target's number of scalar registers.")); 158 159 static cl::opt<unsigned> ForceTargetNumVectorRegs( 160 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 161 cl::desc("A flag that overrides the target's number of vector registers.")); 162 163 /// Maximum vectorization interleave count. 164 static const unsigned MaxInterleaveFactor = 16; 165 166 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 167 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 168 cl::desc("A flag that overrides the target's max interleave factor for " 169 "scalar loops.")); 170 171 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 172 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 173 cl::desc("A flag that overrides the target's max interleave factor for " 174 "vectorized loops.")); 175 176 static cl::opt<unsigned> ForceTargetInstructionCost( 177 "force-target-instruction-cost", cl::init(0), cl::Hidden, 178 cl::desc("A flag that overrides the target's expected cost for " 179 "an instruction to a single constant value. Mostly " 180 "useful for getting consistent testing.")); 181 182 static cl::opt<unsigned> SmallLoopCost( 183 "small-loop-cost", cl::init(20), cl::Hidden, 184 cl::desc( 185 "The cost of a loop that is considered 'small' by the interleaver.")); 186 187 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 188 "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, 189 cl::desc("Enable the use of the block frequency analysis to access PGO " 190 "heuristics minimizing code growth in cold regions and being more " 191 "aggressive in hot regions.")); 192 193 // Runtime interleave loops for load/store throughput. 194 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 195 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 196 cl::desc( 197 "Enable runtime interleaving until load/store ports are saturated")); 198 199 /// The number of stores in a loop that are allowed to need predication. 
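///
/// For example (sketch), after if-conversion the store below may only execute
/// under its original condition, so it has to be predicated when vectorized:
///
///   for (i = 0; i < n; ++i)
///     if (cond[i])
///       A[i] = x;   // conditional store; needs predication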
200 static cl::opt<unsigned> NumberOfStoresToPredicate( 201 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 202 cl::desc("Max number of stores to be predicated behind an if.")); 203 204 static cl::opt<bool> EnableIndVarRegisterHeur( 205 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 206 cl::desc("Count the induction variable only once when interleaving")); 207 208 static cl::opt<bool> EnableCondStoresVectorization( 209 "enable-cond-stores-vec", cl::init(false), cl::Hidden, 210 cl::desc("Enable if predication of stores during vectorization.")); 211 212 static cl::opt<unsigned> MaxNestedScalarReductionIC( 213 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 214 cl::desc("The maximum interleave count to use when interleaving a scalar " 215 "reduction in a nested loop.")); 216 217 namespace { 218 219 // Forward declarations. 220 class LoopVectorizeHints; 221 class LoopVectorizationLegality; 222 class LoopVectorizationCostModel; 223 class LoopVectorizationRequirements; 224 225 /// \brief This modifies LoopAccessReport to initialize message with 226 /// loop-vectorizer-specific part. 227 class VectorizationReport : public LoopAccessReport { 228 public: 229 VectorizationReport(Instruction *I = nullptr) 230 : LoopAccessReport("loop not vectorized: ", I) {} 231 232 /// \brief This allows promotion of the loop-access analysis report into the 233 /// loop-vectorizer report. It modifies the message to add the 234 /// loop-vectorizer-specific part of the message. 235 explicit VectorizationReport(const LoopAccessReport &R) 236 : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), 237 R.getInstr()) {} 238 }; 239 240 /// A helper function for converting Scalar types to vector types. 241 /// If the incoming type is void, we return void. If the VF is 1, we return 242 /// the scalar type. 243 static Type* ToVectorTy(Type *Scalar, unsigned VF) { 244 if (Scalar->isVoidTy() || VF == 1) 245 return Scalar; 246 return VectorType::get(Scalar, VF); 247 } 248 249 /// InnerLoopVectorizer vectorizes loops which contain only one basic 250 /// block to a specified vectorization factor (VF). 251 /// This class performs the widening of scalars into vectors, or multiple 252 /// scalars. This class also implements the following features: 253 /// * It inserts an epilogue loop for handling loops that don't have iteration 254 /// counts that are known to be a multiple of the vectorization factor. 255 /// * It handles the code generation for reduction variables. 256 /// * Scalarization (implementation using scalars) of un-vectorizable 257 /// instructions. 258 /// InnerLoopVectorizer does not perform any vectorization-legality 259 /// checks, and relies on the caller to check for the different legality 260 /// aspects. The InnerLoopVectorizer relies on the 261 /// LoopVectorizationLegality class to provide information about the induction 262 /// and reduction variables that were found to a given vectorization factor. 263 class InnerLoopVectorizer { 264 public: 265 InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, 266 DominatorTree *DT, const TargetLibraryInfo *TLI, 267 const TargetTransformInfo *TTI, unsigned VecWidth, 268 unsigned UnrollFactor) 269 : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 270 VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), 271 Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), 272 Legal(nullptr), AddedSafetyChecks(false) {} 273 274 // Perform the actual loop widening (vectorization). 
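  //
  // The driver code in processLoop (further down in this file) uses this class
  // roughly as follows (illustrative sketch only):
  //
  //   InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
  //   LB.vectorize(&LVL);   // LVL is the LoopVectorizationLegality for L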
275 void vectorize(LoopVectorizationLegality *L) { 276 Legal = L; 277 // Create a new empty loop. Unlink the old loop and connect the new one. 278 createEmptyLoop(); 279 // Widen each instruction in the old loop to a new one in the new loop. 280 // Use the Legality module to find the induction and reduction variables. 281 vectorizeLoop(); 282 // Register the new loop and update the analysis passes. 283 updateAnalysis(); 284 } 285 286 // Return true if any runtime check is added. 287 bool IsSafetyChecksAdded() { 288 return AddedSafetyChecks; 289 } 290 291 virtual ~InnerLoopVectorizer() {} 292 293 protected: 294 /// A small list of PHINodes. 295 typedef SmallVector<PHINode*, 4> PhiVector; 296 /// When we unroll loops we have multiple vector values for each scalar. 297 /// This data structure holds the unrolled and vectorized values that 298 /// originated from one scalar instruction. 299 typedef SmallVector<Value*, 2> VectorParts; 300 301 // When we if-convert we need to create edge masks. We have to cache values 302 // so that we don't end up with exponential recursion/IR. 303 typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, 304 VectorParts> EdgeMaskCache; 305 306 /// \brief Add checks for strides that were assumed to be 1. 307 /// 308 /// Returns the last check instruction and the first check instruction in the 309 /// pair as (first, last). 310 std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); 311 312 /// Create an empty loop, based on the loop ranges of the old loop. 313 void createEmptyLoop(); 314 /// Copy and widen the instructions from the old loop. 315 virtual void vectorizeLoop(); 316 317 /// \brief The Loop exit block may have single value PHI nodes where the 318 /// incoming value is 'Undef'. While vectorizing we only handled real values 319 /// that were defined inside the loop. Here we fix the 'undef case'. 320 /// See PR14725. 321 void fixLCSSAPHIs(); 322 323 /// A helper function that computes the predicate of the block BB, assuming 324 /// that the header block of the loop is set to True. It returns the *entry* 325 /// mask for the block BB. 326 VectorParts createBlockInMask(BasicBlock *BB); 327 /// A helper function that computes the predicate of the edge between SRC 328 /// and DST. 329 VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 330 331 /// A helper function to vectorize a single BB within the innermost loop. 332 void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); 333 334 /// Vectorize a single PHINode in a block. This method handles the induction 335 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 336 /// arbitrary length vectors. 337 void widenPHIInstruction(Instruction *PN, VectorParts &Entry, 338 unsigned UF, unsigned VF, PhiVector *PV); 339 340 /// Insert the new loop to the loop hierarchy and pass manager 341 /// and update the analysis passes. 342 void updateAnalysis(); 343 344 /// This instruction is un-vectorizable. Implement it as a sequence 345 /// of scalars. If \p IfPredicateStore is true we need to 'hide' each 346 /// scalarized instruction behind an if block predicated on the control 347 /// dependence of the instruction. 348 virtual void scalarizeInstruction(Instruction *Instr, 349 bool IfPredicateStore=false); 350 351 /// Vectorize Load and Store instructions, 352 virtual void vectorizeMemoryInstruction(Instruction *Instr); 353 354 /// Create a broadcast instruction. 
This method generates a broadcast 355 /// instruction (shuffle) for loop invariant values and for the induction 356 /// value. If this is the induction variable then we extend it to N, N+1, ... 357 /// this is needed because each iteration in the loop corresponds to a SIMD 358 /// element. 359 virtual Value *getBroadcastInstrs(Value *V); 360 361 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 362 /// to each vector element of Val. The sequence starts at StartIndex. 363 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); 364 365 /// When we go over instructions in the basic block we rely on previous 366 /// values within the current basic block or on loop invariant values. 367 /// When we widen (vectorize) values we place them in the map. If the values 368 /// are not within the map, they have to be loop invariant, so we simply 369 /// broadcast them into a vector. 370 VectorParts &getVectorValue(Value *V); 371 372 /// Try to vectorize the interleaved access group that \p Instr belongs to. 373 void vectorizeInterleaveGroup(Instruction *Instr); 374 375 /// Generate a shuffle sequence that will reverse the vector Vec. 376 virtual Value *reverseVector(Value *Vec); 377 378 /// This is a helper class that holds the vectorizer state. It maps scalar 379 /// instructions to vector instructions. When the code is 'unrolled' then 380 /// then a single scalar value is mapped to multiple vector parts. The parts 381 /// are stored in the VectorPart type. 382 struct ValueMap { 383 /// C'tor. UnrollFactor controls the number of vectors ('parts') that 384 /// are mapped. 385 ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} 386 387 /// \return True if 'Key' is saved in the Value Map. 388 bool has(Value *Key) const { return MapStorage.count(Key); } 389 390 /// Initializes a new entry in the map. Sets all of the vector parts to the 391 /// save value in 'Val'. 392 /// \return A reference to a vector with splat values. 393 VectorParts &splat(Value *Key, Value *Val) { 394 VectorParts &Entry = MapStorage[Key]; 395 Entry.assign(UF, Val); 396 return Entry; 397 } 398 399 ///\return A reference to the value that is stored at 'Key'. 400 VectorParts &get(Value *Key) { 401 VectorParts &Entry = MapStorage[Key]; 402 if (Entry.empty()) 403 Entry.resize(UF); 404 assert(Entry.size() == UF); 405 return Entry; 406 } 407 408 private: 409 /// The unroll factor. Each entry in the map stores this number of vector 410 /// elements. 411 unsigned UF; 412 413 /// Map storage. We use std::map and not DenseMap because insertions to a 414 /// dense map invalidates its iterators. 415 std::map<Value *, VectorParts> MapStorage; 416 }; 417 418 /// The original loop. 419 Loop *OrigLoop; 420 /// Scev analysis to use. 421 ScalarEvolution *SE; 422 /// Loop Info. 423 LoopInfo *LI; 424 /// Dominator Tree. 425 DominatorTree *DT; 426 /// Alias Analysis. 427 AliasAnalysis *AA; 428 /// Target Library Info. 429 const TargetLibraryInfo *TLI; 430 /// Target Transform Info. 431 const TargetTransformInfo *TTI; 432 433 /// The vectorization SIMD factor to use. Each vector will have this many 434 /// vector elements. 435 unsigned VF; 436 437 protected: 438 /// The vectorization unroll factor to use. Each scalar is vectorized to this 439 /// many different vector instructions. 440 unsigned UF; 441 442 /// The builder that we use 443 IRBuilder<> Builder; 444 445 // --- Vectorization state --- 446 447 /// The vector-loop preheader. 448 BasicBlock *LoopVectorPreHeader; 449 /// The scalar-loop preheader. 
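  ///
  /// Together with the blocks below, createEmptyLoop builds roughly the
  /// following structure (illustrative sketch; runtime-check bypass blocks
  /// are omitted):
  ///
  ///   vector preheader -> vector body -> middle block -> scalar preheader
  ///   scalar preheader -> old scalar loop -> exit block
  ///
  /// The middle block branches directly to the exit block when no scalar
  /// iterations remain.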
450 BasicBlock *LoopScalarPreHeader; 451 /// Middle Block between the vector and the scalar. 452 BasicBlock *LoopMiddleBlock; 453 ///The ExitBlock of the scalar loop. 454 BasicBlock *LoopExitBlock; 455 ///The vector loop body. 456 SmallVector<BasicBlock *, 4> LoopVectorBody; 457 ///The scalar loop body. 458 BasicBlock *LoopScalarBody; 459 /// A list of all bypass blocks. The first block is the entry of the loop. 460 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 461 462 /// The new Induction variable which was added to the new block. 463 PHINode *Induction; 464 /// The induction variable of the old basic block. 465 PHINode *OldInduction; 466 /// Holds the extended (to the widest induction type) start index. 467 Value *ExtendedIdx; 468 /// Maps scalars to widened vectors. 469 ValueMap WidenMap; 470 EdgeMaskCache MaskCache; 471 472 LoopVectorizationLegality *Legal; 473 474 // Record whether runtime check is added. 475 bool AddedSafetyChecks; 476 }; 477 478 class InnerLoopUnroller : public InnerLoopVectorizer { 479 public: 480 InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, 481 DominatorTree *DT, const TargetLibraryInfo *TLI, 482 const TargetTransformInfo *TTI, unsigned UnrollFactor) 483 : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} 484 485 private: 486 void scalarizeInstruction(Instruction *Instr, 487 bool IfPredicateStore = false) override; 488 void vectorizeMemoryInstruction(Instruction *Instr) override; 489 Value *getBroadcastInstrs(Value *V) override; 490 Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; 491 Value *reverseVector(Value *Vec) override; 492 }; 493 494 /// \brief Look for a meaningful debug location on the instruction or it's 495 /// operands. 496 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 497 if (!I) 498 return I; 499 500 DebugLoc Empty; 501 if (I->getDebugLoc() != Empty) 502 return I; 503 504 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 505 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 506 if (OpInst->getDebugLoc() != Empty) 507 return OpInst; 508 } 509 510 return I; 511 } 512 513 /// \brief Set the debug location in the builder using the debug location in the 514 /// instruction. 515 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 516 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) 517 B.SetCurrentDebugLocation(Inst->getDebugLoc()); 518 else 519 B.SetCurrentDebugLocation(DebugLoc()); 520 } 521 522 #ifndef NDEBUG 523 /// \return string containing a file name and a line # for the given loop. 524 static std::string getDebugLocString(const Loop *L) { 525 std::string Result; 526 if (L) { 527 raw_string_ostream OS(Result); 528 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 529 LoopDbgLoc.print(OS); 530 else 531 // Just print the module name. 532 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 533 OS.flush(); 534 } 535 return Result; 536 } 537 #endif 538 539 /// \brief Propagate known metadata from one instruction to another. 
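///
/// For example (sketch), a widened load copies the metadata kinds listed in
/// the body below from its scalar original:
///
///   %a    = load i32, i32* %p, !tbaa !5              ; scalar access
///   %wide = load <4 x i32>, <4 x i32>* %q, !tbaa !5  ; vector copy keeps !tbaa
///
/// Kinds outside that list (e.g. !range) are intentionally not copied here.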
540 static void propagateMetadata(Instruction *To, const Instruction *From) { 541 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 542 From->getAllMetadataOtherThanDebugLoc(Metadata); 543 544 for (auto M : Metadata) { 545 unsigned Kind = M.first; 546 547 // These are safe to transfer (this is safe for TBAA, even when we 548 // if-convert, because should that metadata have had a control dependency 549 // on the condition, and thus actually aliased with some other 550 // non-speculated memory access when the condition was false, this would be 551 // caught by the runtime overlap checks). 552 if (Kind != LLVMContext::MD_tbaa && 553 Kind != LLVMContext::MD_alias_scope && 554 Kind != LLVMContext::MD_noalias && 555 Kind != LLVMContext::MD_fpmath) 556 continue; 557 558 To->setMetadata(Kind, M.second); 559 } 560 } 561 562 /// \brief Propagate known metadata from one instruction to a vector of others. 563 static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { 564 for (Value *V : To) 565 if (Instruction *I = dyn_cast<Instruction>(V)) 566 propagateMetadata(I, From); 567 } 568 569 /// \brief The group of interleaved loads/stores sharing the same stride and 570 /// close to each other. 571 /// 572 /// Each member in this group has an index starting from 0, and the largest 573 /// index should be less than interleaved factor, which is equal to the absolute 574 /// value of the access's stride. 575 /// 576 /// E.g. An interleaved load group of factor 4: 577 /// for (unsigned i = 0; i < 1024; i+=4) { 578 /// a = A[i]; // Member of index 0 579 /// b = A[i+1]; // Member of index 1 580 /// d = A[i+3]; // Member of index 3 581 /// ... 582 /// } 583 /// 584 /// An interleaved store group of factor 4: 585 /// for (unsigned i = 0; i < 1024; i+=4) { 586 /// ... 587 /// A[i] = a; // Member of index 0 588 /// A[i+1] = b; // Member of index 1 589 /// A[i+2] = c; // Member of index 2 590 /// A[i+3] = d; // Member of index 3 591 /// } 592 /// 593 /// Note: the interleaved load group could have gaps (missing members), but 594 /// the interleaved store group doesn't allow gaps. 595 class InterleaveGroup { 596 public: 597 InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) 598 : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { 599 assert(Align && "The alignment should be non-zero"); 600 601 Factor = std::abs(Stride); 602 assert(Factor > 1 && "Invalid interleave factor"); 603 604 Reverse = Stride < 0; 605 Members[0] = Instr; 606 } 607 608 bool isReverse() const { return Reverse; } 609 unsigned getFactor() const { return Factor; } 610 unsigned getAlignment() const { return Align; } 611 unsigned getNumMembers() const { return Members.size(); } 612 613 /// \brief Try to insert a new member \p Instr with index \p Index and 614 /// alignment \p NewAlign. The index is related to the leader and it could be 615 /// negative if it is the new leader. 616 /// 617 /// \returns false if the instruction doesn't belong to the group. 618 bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { 619 assert(NewAlign && "The new member's alignment should be non-zero"); 620 621 int Key = Index + SmallestKey; 622 623 // Skip if there is already a member with the same index. 624 if (Members.count(Key)) 625 return false; 626 627 if (Key > LargestKey) { 628 // The largest index is always less than the interleave factor. 
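      // Worked example (sketch): in a factor-4 group currently holding keys
      // {0, 1} (SmallestKey == 0), a candidate with Index == 4 computes
      // Key == 4; member indices may only range over 0..Factor-1, so the
      // check below rejects it.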
629 if (Index >= static_cast<int>(Factor)) 630 return false; 631 632 LargestKey = Key; 633 } else if (Key < SmallestKey) { 634 // The largest index is always less than the interleave factor. 635 if (LargestKey - Key >= static_cast<int>(Factor)) 636 return false; 637 638 SmallestKey = Key; 639 } 640 641 // It's always safe to select the minimum alignment. 642 Align = std::min(Align, NewAlign); 643 Members[Key] = Instr; 644 return true; 645 } 646 647 /// \brief Get the member with the given index \p Index 648 /// 649 /// \returns nullptr if contains no such member. 650 Instruction *getMember(unsigned Index) const { 651 int Key = SmallestKey + Index; 652 if (!Members.count(Key)) 653 return nullptr; 654 655 return Members.find(Key)->second; 656 } 657 658 /// \brief Get the index for the given member. Unlike the key in the member 659 /// map, the index starts from 0. 660 unsigned getIndex(Instruction *Instr) const { 661 for (auto I : Members) 662 if (I.second == Instr) 663 return I.first - SmallestKey; 664 665 llvm_unreachable("InterleaveGroup contains no such member"); 666 } 667 668 Instruction *getInsertPos() const { return InsertPos; } 669 void setInsertPos(Instruction *Inst) { InsertPos = Inst; } 670 671 private: 672 unsigned Factor; // Interleave Factor. 673 bool Reverse; 674 unsigned Align; 675 DenseMap<int, Instruction *> Members; 676 int SmallestKey; 677 int LargestKey; 678 679 // To avoid breaking dependences, vectorized instructions of an interleave 680 // group should be inserted at either the first load or the last store in 681 // program order. 682 // 683 // E.g. %even = load i32 // Insert Position 684 // %add = add i32 %even // Use of %even 685 // %odd = load i32 686 // 687 // store i32 %even 688 // %odd = add i32 // Def of %odd 689 // store i32 %odd // Insert Position 690 Instruction *InsertPos; 691 }; 692 693 /// \brief Drive the analysis of interleaved memory accesses in the loop. 694 /// 695 /// Use this class to analyze interleaved accesses only when we can vectorize 696 /// a loop. Otherwise it's meaningless to do analysis as the vectorization 697 /// on interleaved accesses is unsafe. 698 /// 699 /// The analysis collects interleave groups and records the relationships 700 /// between the member and the group in a map. 701 class InterleavedAccessInfo { 702 public: 703 InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) 704 : SE(SE), TheLoop(L), DT(DT) {} 705 706 ~InterleavedAccessInfo() { 707 SmallSet<InterleaveGroup *, 4> DelSet; 708 // Avoid releasing a pointer twice. 709 for (auto &I : InterleaveGroupMap) 710 DelSet.insert(I.second); 711 for (auto *Ptr : DelSet) 712 delete Ptr; 713 } 714 715 /// \brief Analyze the interleaved accesses and collect them in interleave 716 /// groups. Substitute symbolic strides using \p Strides. 717 void analyzeInterleaving(const ValueToValueMap &Strides); 718 719 /// \brief Check if \p Instr belongs to any interleave group. 720 bool isInterleaved(Instruction *Instr) const { 721 return InterleaveGroupMap.count(Instr); 722 } 723 724 /// \brief Get the interleave group that \p Instr belongs to. 725 /// 726 /// \returns nullptr if doesn't have such group. 727 InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { 728 if (InterleaveGroupMap.count(Instr)) 729 return InterleaveGroupMap.find(Instr)->second; 730 return nullptr; 731 } 732 733 private: 734 ScalarEvolution *SE; 735 Loop *TheLoop; 736 DominatorTree *DT; 737 738 /// Holds the relationships between the members and the interleave group. 
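  ///
  /// For example (sketch), with a factor-2 pattern such as
  ///   a = A[2*i]; b = A[2*i+1];
  /// both loads appear as keys in this map and point to the same
  /// InterleaveGroup object.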
739 DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; 740 741 /// \brief The descriptor for a strided memory access. 742 struct StrideDescriptor { 743 StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, 744 unsigned Align) 745 : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} 746 747 StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} 748 749 int Stride; // The access's stride. It is negative for a reverse access. 750 const SCEV *Scev; // The scalar expression of this access 751 unsigned Size; // The size of the memory object. 752 unsigned Align; // The alignment of this access. 753 }; 754 755 /// \brief Create a new interleave group with the given instruction \p Instr, 756 /// stride \p Stride and alignment \p Align. 757 /// 758 /// \returns the newly created interleave group. 759 InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, 760 unsigned Align) { 761 assert(!InterleaveGroupMap.count(Instr) && 762 "Already in an interleaved access group"); 763 InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); 764 return InterleaveGroupMap[Instr]; 765 } 766 767 /// \brief Release the group and remove all the relationships. 768 void releaseGroup(InterleaveGroup *Group) { 769 for (unsigned i = 0; i < Group->getFactor(); i++) 770 if (Instruction *Member = Group->getMember(i)) 771 InterleaveGroupMap.erase(Member); 772 773 delete Group; 774 } 775 776 /// \brief Collect all the accesses with a constant stride in program order. 777 void collectConstStridedAccesses( 778 MapVector<Instruction *, StrideDescriptor> &StrideAccesses, 779 const ValueToValueMap &Strides); 780 }; 781 782 /// Utility class for getting and setting loop vectorizer hints in the form 783 /// of loop metadata. 784 /// This class keeps a number of loop annotations locally (as member variables) 785 /// and can, upon request, write them back as metadata on the loop. It will 786 /// initially scan the loop for existing metadata, and will update the local 787 /// values based on information in the loop. 788 /// We cannot write all values to metadata, as the mere presence of some info, 789 /// for example 'force', means a decision has been made. So, we need to be 790 /// careful NOT to add them if the user hasn't specifically asked so. 791 class LoopVectorizeHints { 792 enum HintKind { 793 HK_WIDTH, 794 HK_UNROLL, 795 HK_FORCE 796 }; 797 798 /// Hint - associates name and validation with the hint value. 799 struct Hint { 800 const char * Name; 801 unsigned Value; // This may have to change for non-numeric values. 802 HintKind Kind; 803 804 Hint(const char * Name, unsigned Value, HintKind Kind) 805 : Name(Name), Value(Value), Kind(Kind) { } 806 807 bool validate(unsigned Val) { 808 switch (Kind) { 809 case HK_WIDTH: 810 return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; 811 case HK_UNROLL: 812 return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; 813 case HK_FORCE: 814 return (Val <= 1); 815 } 816 return false; 817 } 818 }; 819 820 /// Vectorization width. 821 Hint Width; 822 /// Vectorization interleave factor. 823 Hint Interleave; 824 /// Vectorization forced 825 Hint Force; 826 827 /// Return the loop metadata prefix. 828 static StringRef Prefix() { return "llvm.loop."; } 829 830 public: 831 enum ForceKind { 832 FK_Undefined = -1, ///< Not selected. 833 FK_Disabled = 0, ///< Forcing disabled. 834 FK_Enabled = 1, ///< Forcing enabled. 
835 }; 836 837 LoopVectorizeHints(const Loop *L, bool DisableInterleaving) 838 : Width("vectorize.width", VectorizerParams::VectorizationFactor, 839 HK_WIDTH), 840 Interleave("interleave.count", DisableInterleaving, HK_UNROLL), 841 Force("vectorize.enable", FK_Undefined, HK_FORCE), 842 TheLoop(L) { 843 // Populate values with existing loop metadata. 844 getHintsFromMetadata(); 845 846 // force-vector-interleave overrides DisableInterleaving. 847 if (VectorizerParams::isInterleaveForced()) 848 Interleave.Value = VectorizerParams::VectorizationInterleave; 849 850 DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() 851 << "LV: Interleaving disabled by the pass manager\n"); 852 } 853 854 /// Mark the loop L as already vectorized by setting the width to 1. 855 void setAlreadyVectorized() { 856 Width.Value = Interleave.Value = 1; 857 Hint Hints[] = {Width, Interleave}; 858 writeHintsToMetadata(Hints); 859 } 860 861 bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { 862 if (getForce() == LoopVectorizeHints::FK_Disabled) { 863 DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); 864 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 865 L->getStartLoc(), emitRemark()); 866 return false; 867 } 868 869 if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { 870 DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); 871 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 872 L->getStartLoc(), emitRemark()); 873 return false; 874 } 875 876 if (getWidth() == 1 && getInterleave() == 1) { 877 // FIXME: Add a separate metadata to indicate when the loop has already 878 // been vectorized instead of setting width and count to 1. 879 DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); 880 // FIXME: Add interleave.disable metadata. This will allow 881 // vectorize.disable to be used without disabling the pass and errors 882 // to differentiate between disabled vectorization and a width of 1. 883 emitOptimizationRemarkAnalysis( 884 F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), 885 "loop not vectorized: vectorization and interleaving are explicitly " 886 "disabled, or vectorize width and interleave count are both set to " 887 "1"); 888 return false; 889 } 890 891 return true; 892 } 893 894 /// Dumps all the hint information. 895 std::string emitRemark() const { 896 VectorizationReport R; 897 if (Force.Value == LoopVectorizeHints::FK_Disabled) 898 R << "vectorization is explicitly disabled"; 899 else { 900 R << "use -Rpass-analysis=loop-vectorize for more info"; 901 if (Force.Value == LoopVectorizeHints::FK_Enabled) { 902 R << " (Force=true"; 903 if (Width.Value != 0) 904 R << ", Vector Width=" << Width.Value; 905 if (Interleave.Value != 0) 906 R << ", Interleave Count=" << Interleave.Value; 907 R << ")"; 908 } 909 } 910 911 return R.str(); 912 } 913 914 unsigned getWidth() const { return Width.Value; } 915 unsigned getInterleave() const { return Interleave.Value; } 916 enum ForceKind getForce() const { return (ForceKind)Force.Value; } 917 bool isForced() const { 918 return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1 || 919 getInterleave() > 1; 920 } 921 922 private: 923 /// Find hints specified in the loop metadata and update local values. 924 void getHintsFromMetadata() { 925 MDNode *LoopID = TheLoop->getLoopID(); 926 if (!LoopID) 927 return; 928 929 // First operand should refer to the loop id itself. 
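    // For example (sketch), "#pragma clang loop vectorize_width(4)" typically
    // yields loop metadata of the form:
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.vectorize.width", i32 4}
    // where operand 0 of !0 is the self-reference checked below.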
930 assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); 931 assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); 932 933 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 934 const MDString *S = nullptr; 935 SmallVector<Metadata *, 4> Args; 936 937 // The expected hint is either a MDString or a MDNode with the first 938 // operand a MDString. 939 if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { 940 if (!MD || MD->getNumOperands() == 0) 941 continue; 942 S = dyn_cast<MDString>(MD->getOperand(0)); 943 for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) 944 Args.push_back(MD->getOperand(i)); 945 } else { 946 S = dyn_cast<MDString>(LoopID->getOperand(i)); 947 assert(Args.size() == 0 && "too many arguments for MDString"); 948 } 949 950 if (!S) 951 continue; 952 953 // Check if the hint starts with the loop metadata prefix. 954 StringRef Name = S->getString(); 955 if (Args.size() == 1) 956 setHint(Name, Args[0]); 957 } 958 } 959 960 /// Checks string hint with one operand and set value if valid. 961 void setHint(StringRef Name, Metadata *Arg) { 962 if (!Name.startswith(Prefix())) 963 return; 964 Name = Name.substr(Prefix().size(), StringRef::npos); 965 966 const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); 967 if (!C) return; 968 unsigned Val = C->getZExtValue(); 969 970 Hint *Hints[] = {&Width, &Interleave, &Force}; 971 for (auto H : Hints) { 972 if (Name == H->Name) { 973 if (H->validate(Val)) 974 H->Value = Val; 975 else 976 DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); 977 break; 978 } 979 } 980 } 981 982 /// Create a new hint from name / value pair. 983 MDNode *createHintMetadata(StringRef Name, unsigned V) const { 984 LLVMContext &Context = TheLoop->getHeader()->getContext(); 985 Metadata *MDs[] = {MDString::get(Context, Name), 986 ConstantAsMetadata::get( 987 ConstantInt::get(Type::getInt32Ty(Context), V))}; 988 return MDNode::get(Context, MDs); 989 } 990 991 /// Matches metadata with hint name. 992 bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { 993 MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); 994 if (!Name) 995 return false; 996 997 for (auto H : HintTypes) 998 if (Name->getString().endswith(H.Name)) 999 return true; 1000 return false; 1001 } 1002 1003 /// Sets current hints into loop metadata, keeping other values intact. 1004 void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { 1005 if (HintTypes.size() == 0) 1006 return; 1007 1008 // Reserve the first element to LoopID (see below). 1009 SmallVector<Metadata *, 4> MDs(1); 1010 // If the loop already has metadata, then ignore the existing operands. 1011 MDNode *LoopID = TheLoop->getLoopID(); 1012 if (LoopID) { 1013 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 1014 MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); 1015 // If node in update list, ignore old value. 1016 if (!matchesHintMetadataName(Node, HintTypes)) 1017 MDs.push_back(Node); 1018 } 1019 } 1020 1021 // Now, add the missing hints. 1022 for (auto H : HintTypes) 1023 MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); 1024 1025 // Replace current metadata node with new one. 1026 LLVMContext &Context = TheLoop->getHeader()->getContext(); 1027 MDNode *NewLoopID = MDNode::get(Context, MDs); 1028 // Set operand 0 to refer to the loop id itself. 1029 NewLoopID->replaceOperandWith(0, NewLoopID); 1030 1031 TheLoop->setLoopID(NewLoopID); 1032 } 1033 1034 /// The loop these hints belong to. 
1035 const Loop *TheLoop; 1036 }; 1037 1038 static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, 1039 const LoopVectorizeHints &Hints, 1040 const LoopAccessReport &Message) { 1041 // If a loop hint is provided the diagnostic is always produced. 1042 const char *Name = Hints.isForced() ? DiagnosticInfo::AlwaysPrint : LV_NAME; 1043 LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); 1044 } 1045 1046 static void emitMissedWarning(Function *F, Loop *L, 1047 const LoopVectorizeHints &LH) { 1048 emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, 1049 L->getStartLoc(), LH.emitRemark()); 1050 1051 if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { 1052 if (LH.getWidth() != 1) 1053 emitLoopVectorizeWarning( 1054 F->getContext(), *F, L->getStartLoc(), 1055 "failed explicitly specified loop vectorization"); 1056 else if (LH.getInterleave() != 1) 1057 emitLoopInterleaveWarning( 1058 F->getContext(), *F, L->getStartLoc(), 1059 "failed explicitly specified loop interleaving"); 1060 } 1061 } 1062 1063 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 1064 /// to what vectorization factor. 1065 /// This class does not look at the profitability of vectorization, only the 1066 /// legality. This class has two main kinds of checks: 1067 /// * Memory checks - The code in canVectorizeMemory checks if vectorization 1068 /// will change the order of memory accesses in a way that will change the 1069 /// correctness of the program. 1070 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory 1071 /// checks for a number of different conditions, such as the availability of a 1072 /// single induction variable, that all types are supported and vectorize-able, 1073 /// etc. This code reflects the capabilities of InnerLoopVectorizer. 1074 /// This class is also used by InnerLoopVectorizer for identifying 1075 /// induction variable and the different reduction variables. 1076 class LoopVectorizationLegality { 1077 public: 1078 LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, 1079 TargetLibraryInfo *TLI, AliasAnalysis *AA, 1080 Function *F, const TargetTransformInfo *TTI, 1081 LoopAccessAnalysis *LAA, 1082 LoopVectorizationRequirements *R, 1083 const LoopVectorizeHints *H) 1084 : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), 1085 TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), 1086 Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), 1087 Requirements(R), Hints(H) {} 1088 1089 /// This enum represents the kinds of inductions that we support. 1090 enum InductionKind { 1091 IK_NoInduction, ///< Not an induction variable. 1092 IK_IntInduction, ///< Integer induction variable. Step = C. 1093 IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). 1094 }; 1095 1096 /// A struct for saving information about induction variables. 
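  ///
  /// For example (sketch):
  ///   for (int i = 0; i != n; ++i)  ... // IK_IntInduction: Start = 0, Step = 1
  ///   for (T *p = b; p != e; ++p)   ... // IK_PtrInduction: Start = b, Step = 1
  ///                                     //   (step counted in elements of T)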
1097 struct InductionInfo { 1098 InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) 1099 : StartValue(Start), IK(K), StepValue(Step) { 1100 assert(IK != IK_NoInduction && "Not an induction"); 1101 assert(StartValue && "StartValue is null"); 1102 assert(StepValue && !StepValue->isZero() && "StepValue is zero"); 1103 assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && 1104 "StartValue is not a pointer for pointer induction"); 1105 assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && 1106 "StartValue is not an integer for integer induction"); 1107 assert(StepValue->getType()->isIntegerTy() && 1108 "StepValue is not an integer"); 1109 } 1110 InductionInfo() 1111 : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} 1112 1113 /// Get the consecutive direction. Returns: 1114 /// 0 - unknown or non-consecutive. 1115 /// 1 - consecutive and increasing. 1116 /// -1 - consecutive and decreasing. 1117 int getConsecutiveDirection() const { 1118 if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) 1119 return StepValue->getSExtValue(); 1120 return 0; 1121 } 1122 1123 /// Compute the transformed value of Index at offset StartValue using step 1124 /// StepValue. 1125 /// For integer induction, returns StartValue + Index * StepValue. 1126 /// For pointer induction, returns StartValue[Index * StepValue]. 1127 /// FIXME: The newly created binary instructions should contain nsw/nuw 1128 /// flags, which can be found from the original scalar operations. 1129 Value *transform(IRBuilder<> &B, Value *Index) const { 1130 switch (IK) { 1131 case IK_IntInduction: 1132 assert(Index->getType() == StartValue->getType() && 1133 "Index type does not match StartValue type"); 1134 if (StepValue->isMinusOne()) 1135 return B.CreateSub(StartValue, Index); 1136 if (!StepValue->isOne()) 1137 Index = B.CreateMul(Index, StepValue); 1138 return B.CreateAdd(StartValue, Index); 1139 1140 case IK_PtrInduction: 1141 assert(Index->getType() == StepValue->getType() && 1142 "Index type does not match StepValue type"); 1143 if (StepValue->isMinusOne()) 1144 Index = B.CreateNeg(Index); 1145 else if (!StepValue->isOne()) 1146 Index = B.CreateMul(Index, StepValue); 1147 return B.CreateGEP(nullptr, StartValue, Index); 1148 1149 case IK_NoInduction: 1150 return nullptr; 1151 } 1152 llvm_unreachable("invalid enum"); 1153 } 1154 1155 /// Start value. 1156 TrackingVH<Value> StartValue; 1157 /// Induction kind. 1158 InductionKind IK; 1159 /// Step value. 1160 ConstantInt *StepValue; 1161 }; 1162 1163 /// ReductionList contains the reduction descriptors for all 1164 /// of the reductions that were found in the loop. 1165 typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList; 1166 1167 /// InductionList saves induction variables and maps them to the 1168 /// induction descriptor. 1169 typedef MapVector<PHINode*, InductionInfo> InductionList; 1170 1171 /// Returns true if it is legal to vectorize this loop. 1172 /// This does not mean that it is profitable to vectorize this 1173 /// loop, only that it is legal to do so. 1174 bool canVectorize(); 1175 1176 /// Returns the Induction variable. 1177 PHINode *getInduction() { return Induction; } 1178 1179 /// Returns the reduction variables found in the loop. 1180 ReductionList *getReductionVars() { return &Reductions; } 1181 1182 /// Returns the induction variables found in the loop. 1183 InductionList *getInductionVars() { return &Inductions; } 1184 1185 /// Returns the widest induction type. 
1186 Type *getWidestInductionType() { return WidestIndTy; } 1187 1188 /// Returns True if V is an induction variable in this loop. 1189 bool isInductionVariable(const Value *V); 1190 1191 /// Return true if the block BB needs to be predicated in order for the loop 1192 /// to be vectorized. 1193 bool blockNeedsPredication(BasicBlock *BB); 1194 1195 /// Check if this pointer is consecutive when vectorizing. This happens 1196 /// when the last index of the GEP is the induction variable, or that the 1197 /// pointer itself is an induction variable. 1198 /// This check allows us to vectorize A[idx] into a wide load/store. 1199 /// Returns: 1200 /// 0 - Stride is unknown or non-consecutive. 1201 /// 1 - Address is consecutive. 1202 /// -1 - Address is consecutive, and decreasing. 1203 int isConsecutivePtr(Value *Ptr); 1204 1205 /// Returns true if the value V is uniform within the loop. 1206 bool isUniform(Value *V); 1207 1208 /// Returns true if this instruction will remain scalar after vectorization. 1209 bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } 1210 1211 /// Returns the information that we collected about runtime memory check. 1212 const RuntimePointerChecking *getRuntimePointerChecking() const { 1213 return LAI->getRuntimePointerChecking(); 1214 } 1215 1216 const LoopAccessInfo *getLAI() const { 1217 return LAI; 1218 } 1219 1220 /// \brief Check if \p Instr belongs to any interleaved access group. 1221 bool isAccessInterleaved(Instruction *Instr) { 1222 return InterleaveInfo.isInterleaved(Instr); 1223 } 1224 1225 /// \brief Get the interleaved access group that \p Instr belongs to. 1226 const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { 1227 return InterleaveInfo.getInterleaveGroup(Instr); 1228 } 1229 1230 unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } 1231 1232 bool hasStride(Value *V) { return StrideSet.count(V); } 1233 bool mustCheckStrides() { return !StrideSet.empty(); } 1234 SmallPtrSet<Value *, 8>::iterator strides_begin() { 1235 return StrideSet.begin(); 1236 } 1237 SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } 1238 1239 /// Returns true if the target machine supports masked store operation 1240 /// for the given \p DataType and kind of access to \p Ptr. 1241 bool isLegalMaskedStore(Type *DataType, Value *Ptr) { 1242 return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); 1243 } 1244 /// Returns true if the target machine supports masked load operation 1245 /// for the given \p DataType and kind of access to \p Ptr. 1246 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { 1247 return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); 1248 } 1249 /// Returns true if vector representation of the instruction \p I 1250 /// requires mask. 1251 bool isMaskRequired(const Instruction* I) { 1252 return (MaskedOp.count(I) != 0); 1253 } 1254 unsigned getNumStores() const { 1255 return LAI->getNumStores(); 1256 } 1257 unsigned getNumLoads() const { 1258 return LAI->getNumLoads(); 1259 } 1260 unsigned getNumPredStores() const { 1261 return NumPredStores; 1262 } 1263 private: 1264 /// Check if a single basic block loop is vectorizable. 1265 /// At this point we know that this is a loop with a constant trip count 1266 /// and we only need to check individual instructions. 1267 bool canVectorizeInstrs(); 1268 1269 /// When we vectorize loops we may change the order in which 1270 /// we read and write from memory. 
This method checks if it is 1271 /// legal to vectorize the code, considering only memory constrains. 1272 /// Returns true if the loop is vectorizable 1273 bool canVectorizeMemory(); 1274 1275 /// Return true if we can vectorize this loop using the IF-conversion 1276 /// transformation. 1277 bool canVectorizeWithIfConvert(); 1278 1279 /// Collect the variables that need to stay uniform after vectorization. 1280 void collectLoopUniforms(); 1281 1282 /// Return true if all of the instructions in the block can be speculatively 1283 /// executed. \p SafePtrs is a list of addresses that are known to be legal 1284 /// and we know that we can read from them without segfault. 1285 bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); 1286 1287 /// Returns the induction kind of Phi and record the step. This function may 1288 /// return NoInduction if the PHI is not an induction variable. 1289 InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); 1290 1291 /// \brief Collect memory access with loop invariant strides. 1292 /// 1293 /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop 1294 /// invariant. 1295 void collectStridedAccess(Value *LoadOrStoreInst); 1296 1297 /// Report an analysis message to assist the user in diagnosing loops that are 1298 /// not vectorized. These are handled as LoopAccessReport rather than 1299 /// VectorizationReport because the << operator of VectorizationReport returns 1300 /// LoopAccessReport. 1301 void emitAnalysis(const LoopAccessReport &Message) const { 1302 emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); 1303 } 1304 1305 unsigned NumPredStores; 1306 1307 /// The loop that we evaluate. 1308 Loop *TheLoop; 1309 /// Scev analysis. 1310 ScalarEvolution *SE; 1311 /// Target Library Info. 1312 TargetLibraryInfo *TLI; 1313 /// Parent function 1314 Function *TheFunction; 1315 /// Target Transform Info 1316 const TargetTransformInfo *TTI; 1317 /// Dominator Tree. 1318 DominatorTree *DT; 1319 // LoopAccess analysis. 1320 LoopAccessAnalysis *LAA; 1321 // And the loop-accesses info corresponding to this loop. This pointer is 1322 // null until canVectorizeMemory sets it up. 1323 const LoopAccessInfo *LAI; 1324 1325 /// The interleave access information contains groups of interleaved accesses 1326 /// with the same stride and close to each other. 1327 InterleavedAccessInfo InterleaveInfo; 1328 1329 // --- vectorization state --- // 1330 1331 /// Holds the integer induction variable. This is the counter of the 1332 /// loop. 1333 PHINode *Induction; 1334 /// Holds the reduction variables. 1335 ReductionList Reductions; 1336 /// Holds all of the induction variables that we found in the loop. 1337 /// Notice that inductions don't need to start at zero and that induction 1338 /// variables can be pointers. 1339 InductionList Inductions; 1340 /// Holds the widest induction type encountered. 1341 Type *WidestIndTy; 1342 1343 /// Allowed outside users. This holds the reduction 1344 /// vars which can be accessed from outside the loop. 1345 SmallPtrSet<Value*, 4> AllowedExit; 1346 /// This set holds the variables which are known to be uniform after 1347 /// vectorization. 1348 SmallPtrSet<Instruction*, 4> Uniforms; 1349 1350 /// Can we assume the absence of NaNs. 1351 bool HasFunNoNaNAttr; 1352 1353 /// Vectorization requirements that will go through late-evaluation. 1354 LoopVectorizationRequirements *Requirements; 1355 1356 /// Used to emit an analysis of any legality issues. 
1357 const LoopVectorizeHints *Hints; 1358 1359 ValueToValueMap Strides; 1360 SmallPtrSet<Value *, 8> StrideSet; 1361 1362 /// While vectorizing these instructions we have to generate a 1363 /// call to the appropriate masked intrinsic 1364 SmallPtrSet<const Instruction*, 8> MaskedOp; 1365 }; 1366 1367 /// LoopVectorizationCostModel - estimates the expected speedups due to 1368 /// vectorization. 1369 /// In many cases vectorization is not profitable. This can happen because of 1370 /// a number of reasons. In this class we mainly attempt to predict the 1371 /// expected speedup/slowdowns due to the supported instruction set. We use the 1372 /// TargetTransformInfo to query the different backends for the cost of 1373 /// different operations. 1374 class LoopVectorizationCostModel { 1375 public: 1376 LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, 1377 LoopVectorizationLegality *Legal, 1378 const TargetTransformInfo &TTI, 1379 const TargetLibraryInfo *TLI, AssumptionCache *AC, 1380 const Function *F, const LoopVectorizeHints *Hints) 1381 : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), 1382 TheFunction(F), Hints(Hints) { 1383 CodeMetrics::collectEphemeralValues(L, AC, EphValues); 1384 } 1385 1386 /// Information about vectorization costs 1387 struct VectorizationFactor { 1388 unsigned Width; // Vector width with best cost 1389 unsigned Cost; // Cost of the loop with that width 1390 }; 1391 /// \return The most profitable vectorization factor and the cost of that VF. 1392 /// This method checks every power of two up to VF. If UserVF is not ZERO 1393 /// then this vectorization factor will be selected if vectorization is 1394 /// possible. 1395 VectorizationFactor selectVectorizationFactor(bool OptForSize); 1396 1397 /// \return The size (in bits) of the widest type in the code that 1398 /// needs to be vectorized. We ignore values that remain scalar such as 1399 /// 64 bit loop indices. 1400 unsigned getWidestType(); 1401 1402 /// \return The desired interleave count. 1403 /// If interleave count has been specified by metadata it will be returned. 1404 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1405 /// are the selected vectorization factor and the cost of the selected VF. 1406 unsigned selectInterleaveCount(bool OptForSize, unsigned VF, 1407 unsigned LoopCost); 1408 1409 /// \return The most profitable unroll factor. 1410 /// This method finds the best unroll-factor based on register pressure and 1411 /// other parameters. VF and LoopCost are the selected vectorization factor 1412 /// and the cost of the selected VF. 1413 unsigned computeInterleaveCount(bool OptForSize, unsigned VF, 1414 unsigned LoopCost); 1415 1416 /// \brief A struct that represents some properties of the register usage 1417 /// of a loop. 1418 struct RegisterUsage { 1419 /// Holds the number of loop invariant values that are used in the loop. 1420 unsigned LoopInvariantRegs; 1421 /// Holds the maximum number of concurrent live intervals in the loop. 1422 unsigned MaxLocalUsers; 1423 /// Holds the number of instructions in the loop. 1424 unsigned NumInstructions; 1425 }; 1426 1427 /// \return information about the register usage of the loop. 1428 RegisterUsage calculateRegisterUsage(); 1429 1430 private: 1431 /// Returns the expected execution cost. The unit of the cost does 1432 /// not matter because we use the 'cost' units to compare different 1433 /// vector widths. The cost that is returned is *not* normalized by 1434 /// the factor width. 
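  ///
  /// For example (illustrative numbers only): if expectedCost(1) returns 20
  /// and expectedCost(4) returns 48, the per-lane costs are 20 and 12, so the
  /// cost model would prefer VF = 4.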
1435 unsigned expectedCost(unsigned VF); 1436 1437 /// Returns the execution time cost of an instruction for a given vector 1438 /// width. Vector width of one means scalar. 1439 unsigned getInstructionCost(Instruction *I, unsigned VF); 1440 1441 /// Returns whether the instruction is a load or store and will be a emitted 1442 /// as a vector operation. 1443 bool isConsecutiveLoadOrStore(Instruction *I); 1444 1445 /// Report an analysis message to assist the user in diagnosing loops that are 1446 /// not vectorized. These are handled as LoopAccessReport rather than 1447 /// VectorizationReport because the << operator of VectorizationReport returns 1448 /// LoopAccessReport. 1449 void emitAnalysis(const LoopAccessReport &Message) const { 1450 emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); 1451 } 1452 1453 /// Values used only by @llvm.assume calls. 1454 SmallPtrSet<const Value *, 32> EphValues; 1455 1456 /// The loop that we evaluate. 1457 Loop *TheLoop; 1458 /// Scev analysis. 1459 ScalarEvolution *SE; 1460 /// Loop Info analysis. 1461 LoopInfo *LI; 1462 /// Vectorization legality. 1463 LoopVectorizationLegality *Legal; 1464 /// Vector target information. 1465 const TargetTransformInfo &TTI; 1466 /// Target Library Info. 1467 const TargetLibraryInfo *TLI; 1468 const Function *TheFunction; 1469 // Loop Vectorize Hint. 1470 const LoopVectorizeHints *Hints; 1471 }; 1472 1473 /// \brief This holds vectorization requirements that must be verified late in 1474 /// the process. The requirements are set by legalize and costmodel. Once 1475 /// vectorization has been determined to be possible and profitable the 1476 /// requirements can be verified by looking for metadata or compiler options. 1477 /// For example, some loops require FP commutativity which is only allowed if 1478 /// vectorization is explicitly specified or if the fast-math compiler option 1479 /// has been provided. 1480 /// Late evaluation of these requirements allows helpful diagnostics to be 1481 /// composed that tells the user what need to be done to vectorize the loop. For 1482 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late 1483 /// evaluation should be used only when diagnostics can generated that can be 1484 /// followed by a non-expert user. 1485 class LoopVectorizationRequirements { 1486 public: 1487 LoopVectorizationRequirements() 1488 : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} 1489 1490 void addUnsafeAlgebraInst(Instruction *I) { 1491 // First unsafe algebra instruction. 1492 if (!UnsafeAlgebraInst) 1493 UnsafeAlgebraInst = I; 1494 } 1495 1496 void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } 1497 1498 bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { 1499 // If a loop hint is provided the diagnostic is always produced. 1500 const char *Name = Hints.isForced() ? 
DiagnosticInfo::AlwaysPrint : LV_NAME; 1501 bool failed = false; 1502 if (UnsafeAlgebraInst && 1503 Hints.getForce() == LoopVectorizeHints::FK_Undefined && 1504 Hints.getWidth() == 0) { 1505 emitOptimizationRemarkAnalysisFPCommute( 1506 F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), 1507 VectorizationReport() << "vectorization requires changes in the " 1508 "order of operations, however IEEE 754 " 1509 "floating-point operations are not " 1510 "commutative"); 1511 failed = true; 1512 } 1513 1514 if (NumRuntimePointerChecks > 1515 VectorizerParams::RuntimeMemoryCheckThreshold) { 1516 emitOptimizationRemarkAnalysisAliasing( 1517 F->getContext(), Name, *F, L->getStartLoc(), 1518 VectorizationReport() 1519 << "cannot prove pointers refer to independent arrays in memory. " 1520 "The loop requires " 1521 << NumRuntimePointerChecks 1522 << " runtime independence checks to vectorize the loop, but that " 1523 "would exceed the limit of " 1524 << VectorizerParams::RuntimeMemoryCheckThreshold << " checks"); 1525 DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 1526 failed = true; 1527 } 1528 1529 return failed; 1530 } 1531 1532 private: 1533 unsigned NumRuntimePointerChecks; 1534 Instruction *UnsafeAlgebraInst; 1535 }; 1536 1537 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { 1538 if (L.empty()) 1539 return V.push_back(&L); 1540 1541 for (Loop *InnerL : L) 1542 addInnerLoop(*InnerL, V); 1543 } 1544 1545 /// The LoopVectorize Pass. 1546 struct LoopVectorize : public FunctionPass { 1547 /// Pass identification, replacement for typeid 1548 static char ID; 1549 1550 explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) 1551 : FunctionPass(ID), 1552 DisableUnrolling(NoUnrolling), 1553 AlwaysVectorize(AlwaysVectorize) { 1554 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1555 } 1556 1557 ScalarEvolution *SE; 1558 LoopInfo *LI; 1559 TargetTransformInfo *TTI; 1560 DominatorTree *DT; 1561 BlockFrequencyInfo *BFI; 1562 TargetLibraryInfo *TLI; 1563 AliasAnalysis *AA; 1564 AssumptionCache *AC; 1565 LoopAccessAnalysis *LAA; 1566 bool DisableUnrolling; 1567 bool AlwaysVectorize; 1568 1569 BlockFrequency ColdEntryFreq; 1570 1571 bool runOnFunction(Function &F) override { 1572 SE = &getAnalysis<ScalarEvolution>(); 1573 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1574 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1575 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1576 BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1577 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1578 TLI = TLIP ? &TLIP->getTLI() : nullptr; 1579 AA = &getAnalysis<AliasAnalysis>(); 1580 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1581 LAA = &getAnalysis<LoopAccessAnalysis>(); 1582 1583 // Compute some weights outside of the loop over the loops. Compute this 1584 // using a BranchProbability to re-use its scaling math. 1585 const BranchProbability ColdProb(1, 5); // 20% 1586 ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; 1587 1588 // Don't attempt if 1589 // 1. the target claims to have no vector registers, and 1590 // 2. interleaving won't help ILP. 1591 // 1592 // The second condition is necessary because, even if the target has no 1593 // vector registers, loop vectorization may still enable scalar 1594 // interleaving. 
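    // For illustration (hypothetical target): a machine that reports zero
    // vector registers but a max interleave factor of 2 still gets past this
    // check, because interleaving scalar iterations can improve ILP even when
    // no wide instructions are ever emitted.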
1595 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) 1596 return false; 1597 1598 // Build up a worklist of inner-loops to vectorize. This is necessary as 1599 // the act of vectorizing or partially unrolling a loop creates new loops 1600 // and can invalidate iterators across the loops. 1601 SmallVector<Loop *, 8> Worklist; 1602 1603 for (Loop *L : *LI) 1604 addInnerLoop(*L, Worklist); 1605 1606 LoopsAnalyzed += Worklist.size(); 1607 1608 // Now walk the identified inner loops. 1609 bool Changed = false; 1610 while (!Worklist.empty()) 1611 Changed |= processLoop(Worklist.pop_back_val()); 1612 1613 // Process each loop nest in the function. 1614 return Changed; 1615 } 1616 1617 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 1618 SmallVector<Metadata *, 4> MDs; 1619 // Reserve first location for self reference to the LoopID metadata node. 1620 MDs.push_back(nullptr); 1621 bool IsUnrollMetadata = false; 1622 MDNode *LoopID = L->getLoopID(); 1623 if (LoopID) { 1624 // First find existing loop unrolling disable metadata. 1625 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 1626 MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 1627 if (MD) { 1628 const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); 1629 IsUnrollMetadata = 1630 S && S->getString().startswith("llvm.loop.unroll.disable"); 1631 } 1632 MDs.push_back(LoopID->getOperand(i)); 1633 } 1634 } 1635 1636 if (!IsUnrollMetadata) { 1637 // Add runtime unroll disable metadata. 1638 LLVMContext &Context = L->getHeader()->getContext(); 1639 SmallVector<Metadata *, 1> DisableOperands; 1640 DisableOperands.push_back( 1641 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 1642 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 1643 MDs.push_back(DisableNode); 1644 MDNode *NewLoopID = MDNode::get(Context, MDs); 1645 // Set operand 0 to refer to the loop id itself. 1646 NewLoopID->replaceOperandWith(0, NewLoopID); 1647 L->setLoopID(NewLoopID); 1648 } 1649 } 1650 1651 bool processLoop(Loop *L) { 1652 assert(L->empty() && "Only process inner loops."); 1653 1654 #ifndef NDEBUG 1655 const std::string DebugLocStr = getDebugLocString(L); 1656 #endif /* NDEBUG */ 1657 1658 DEBUG(dbgs() << "\nLV: Checking a loop in \"" 1659 << L->getHeader()->getParent()->getName() << "\" from " 1660 << DebugLocStr << "\n"); 1661 1662 LoopVectorizeHints Hints(L, DisableUnrolling); 1663 1664 DEBUG(dbgs() << "LV: Loop hints:" 1665 << " force=" 1666 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 1667 ? "disabled" 1668 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 1669 ? "enabled" 1670 : "?")) << " width=" << Hints.getWidth() 1671 << " unroll=" << Hints.getInterleave() << "\n"); 1672 1673 // Function containing loop 1674 Function *F = L->getHeader()->getParent(); 1675 1676 // Looking at the diagnostic output is the only way to determine if a loop 1677 // was vectorized (other than looking at the IR or machine code), so it 1678 // is important to generate an optimization remark for each loop. Most of 1679 // these messages are generated by emitOptimizationRemarkAnalysis. Remarks 1680 // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are 1681 // less verbose reporting vectorized loops and unvectorized loops that may 1682 // benefit from vectorization, respectively. 
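    // Usage note (not specific to this function): these remarks are normally
    // surfaced by compiling with -Rpass=loop-vectorize,
    // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize.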
1683 1684 if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { 1685 DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 1686 return false; 1687 } 1688 1689 // Check the loop for a trip count threshold: 1690 // do not vectorize loops with a tiny trip count. 1691 const unsigned TC = SE->getSmallConstantTripCount(L); 1692 if (TC > 0u && TC < TinyTripCountVectorThreshold) { 1693 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 1694 << "This loop is not worth vectorizing."); 1695 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 1696 DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 1697 else { 1698 DEBUG(dbgs() << "\n"); 1699 emitAnalysisDiag(F, L, Hints, VectorizationReport() 1700 << "vectorization is not beneficial " 1701 "and is not explicitly forced"); 1702 return false; 1703 } 1704 } 1705 1706 // Check if it is legal to vectorize the loop. 1707 LoopVectorizationRequirements Requirements; 1708 LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, 1709 &Requirements, &Hints); 1710 if (!LVL.canVectorize()) { 1711 DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 1712 emitMissedWarning(F, L, Hints); 1713 return false; 1714 } 1715 1716 // Use the cost model. 1717 LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); 1718 1719 // Check the function attributes to find out if this function should be 1720 // optimized for size. 1721 bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && 1722 F->optForSize(); 1723 1724 // Compute the weighted frequency of this loop being executed and see if it 1725 // is less than 20% of the function entry baseline frequency. Note that we 1726 // always have a canonical loop here because we think we *can* vectorize. 1727 // FIXME: This is hidden behind a flag due to pervasive problems with 1728 // exactly what block frequency models. 1729 if (LoopVectorizeWithBlockFrequency) { 1730 BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); 1731 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 1732 LoopEntryFreq < ColdEntryFreq) 1733 OptForSize = true; 1734 } 1735 1736 // Check the function attributes to see if implicit floats are allowed. 1737 // FIXME: This check doesn't seem possibly correct -- what if the loop is 1738 // an integer loop and the vector instructions selected are purely integer 1739 // vector instructions? 1740 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 1741 DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" 1742 "attribute is used.\n"); 1743 emitAnalysisDiag( 1744 F, L, Hints, 1745 VectorizationReport() 1746 << "loop not vectorized due to NoImplicitFloat attribute"); 1747 emitMissedWarning(F, L, Hints); 1748 return false; 1749 } 1750 1751 // Select the optimal vectorization factor. 1752 const LoopVectorizationCostModel::VectorizationFactor VF = 1753 CM.selectVectorizationFactor(OptForSize); 1754 1755 // Select the interleave count. 1756 unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); 1757 1758 // Get user interleave count. 1759 unsigned UserIC = Hints.getInterleave(); 1760 1761 // Identify the diagnostic messages that should be produced. 
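    // In outline, the cases handled below are: neither transformation is
    // worthwhile (emit both analysis remarks and bail out), interleave only
    // (VF == 1), vectorize without extra interleaving, and vectorize plus
    // interleave.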
1762 std::string VecDiagMsg, IntDiagMsg; 1763 bool VectorizeLoop = true, InterleaveLoop = true; 1764 1765 if (Requirements.doesNotMeet(F, L, Hints)) { 1766 DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 1767 "requirements.\n"); 1768 emitMissedWarning(F, L, Hints); 1769 return false; 1770 } 1771 1772 if (VF.Width == 1) { 1773 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 1774 VecDiagMsg = 1775 "the cost-model indicates that vectorization is not beneficial"; 1776 VectorizeLoop = false; 1777 } 1778 1779 if (IC == 1 && UserIC <= 1) { 1780 // Tell the user interleaving is not beneficial. 1781 DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 1782 IntDiagMsg = 1783 "the cost-model indicates that interleaving is not beneficial"; 1784 InterleaveLoop = false; 1785 if (UserIC == 1) 1786 IntDiagMsg += 1787 " and is explicitly disabled or interleave count is set to 1"; 1788 } else if (IC > 1 && UserIC == 1) { 1789 // Tell the user interleaving is beneficial, but it is explicitly disabled. 1790 DEBUG(dbgs() 1791 << "LV: Interleaving is beneficial but is explicitly disabled."); 1792 IntDiagMsg = "the cost-model indicates that interleaving is beneficial " 1793 "but is explicitly disabled or interleave count is set to 1"; 1794 InterleaveLoop = false; 1795 } 1796 1797 // Override IC if user provided an interleave count. 1798 IC = UserIC > 0 ? UserIC : IC; 1799 1800 // Emit diagnostic messages, if any. 1801 if (!VectorizeLoop && !InterleaveLoop) { 1802 // Do not vectorize or interleave the loop. 1803 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 1804 L->getStartLoc(), VecDiagMsg); 1805 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 1806 L->getStartLoc(), IntDiagMsg); 1807 return false; 1808 } else if (!VectorizeLoop && InterleaveLoop) { 1809 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 1810 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 1811 L->getStartLoc(), VecDiagMsg); 1812 } else if (VectorizeLoop && !InterleaveLoop) { 1813 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " 1814 << DebugLocStr << '\n'); 1815 emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, 1816 L->getStartLoc(), IntDiagMsg); 1817 } else if (VectorizeLoop && InterleaveLoop) { 1818 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " 1819 << DebugLocStr << '\n'); 1820 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 1821 } 1822 1823 if (!VectorizeLoop) { 1824 assert(IC > 1 && "interleave count should not be 1 or 0"); 1825 // If we decided that it is not profitable to vectorize the loop, then 1826 // interleave it. 1827 InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); 1828 Unroller.vectorize(&LVL); 1829 1830 emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), 1831 Twine("interleaved loop (interleaved count: ") + 1832 Twine(IC) + ")"); 1833 } else { 1834 // If we decided that it is *profitable* to vectorize the loop, then do it. 1835 InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); 1836 LB.vectorize(&LVL); 1837 ++LoopsVectorized; 1838 1839 // Add metadata to disable runtime unrolling of the scalar loop when there 1840 // are no runtime checks for strides and memory. In that situation the 1841 // scalar loop is rarely executed and is not worth unrolling. 1842 if (!LB.IsSafetyChecksAdded()) 1843 AddRuntimeUnrollDisableMetaData(L); 1844 1845 // Report the vectorization decision.
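      // For illustration (hypothetical values): with VF.Width == 4 and
      // IC == 2 the remark below reads "vectorized loop (vectorization
      // width: 4, interleaved count: 2)".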
1846 emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), 1847 Twine("vectorized loop (vectorization width: ") + 1848 Twine(VF.Width) + ", interleaved count: " + 1849 Twine(IC) + ")"); 1850 } 1851 1852 // Mark the loop as already vectorized to avoid vectorizing again. 1853 Hints.setAlreadyVectorized(); 1854 1855 DEBUG(verifyFunction(*L->getHeader()->getParent())); 1856 return true; 1857 } 1858 1859 void getAnalysisUsage(AnalysisUsage &AU) const override { 1860 AU.addRequired<AssumptionCacheTracker>(); 1861 AU.addRequiredID(LoopSimplifyID); 1862 AU.addRequiredID(LCSSAID); 1863 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1864 AU.addRequired<DominatorTreeWrapperPass>(); 1865 AU.addRequired<LoopInfoWrapperPass>(); 1866 AU.addRequired<ScalarEvolution>(); 1867 AU.addRequired<TargetTransformInfoWrapperPass>(); 1868 AU.addRequired<AliasAnalysis>(); 1869 AU.addRequired<LoopAccessAnalysis>(); 1870 AU.addPreserved<LoopInfoWrapperPass>(); 1871 AU.addPreserved<DominatorTreeWrapperPass>(); 1872 AU.addPreserved<AliasAnalysis>(); 1873 } 1874 1875 }; 1876 1877 } // end anonymous namespace 1878 1879 //===----------------------------------------------------------------------===// 1880 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1881 // LoopVectorizationCostModel. 1882 //===----------------------------------------------------------------------===// 1883 1884 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1885 // We need to place the broadcast of invariant variables outside the loop. 1886 Instruction *Instr = dyn_cast<Instruction>(V); 1887 bool NewInstr = 1888 (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), 1889 Instr->getParent()) != LoopVectorBody.end()); 1890 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 1891 1892 // Place the code for broadcasting invariant variables in the new preheader. 1893 IRBuilder<>::InsertPointGuard Guard(Builder); 1894 if (Invariant) 1895 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1896 1897 // Broadcast the scalar into all locations in the vector. 1898 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1899 1900 return Shuf; 1901 } 1902 1903 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, 1904 Value *Step) { 1905 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1906 assert(Val->getType()->getScalarType()->isIntegerTy() && 1907 "Elem must be an integer"); 1908 assert(Step->getType() == Val->getType()->getScalarType() && 1909 "Step has wrong type"); 1910 // Create the types. 1911 Type *ITy = Val->getType()->getScalarType(); 1912 VectorType *Ty = cast<VectorType>(Val->getType()); 1913 int VLen = Ty->getNumElements(); 1914 SmallVector<Constant*, 8> Indices; 1915 1916 // Create a vector of consecutive numbers from zero to VF. 1917 for (int i = 0; i < VLen; ++i) 1918 Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); 1919 1920 // Add the consecutive indices to the vector value. 1921 Constant *Cv = ConstantVector::get(Indices); 1922 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1923 Step = Builder.CreateVectorSplat(VLen, Step); 1924 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1925 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1926 // which can be found from the original scalar operations. 
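  // For illustration (hypothetical values): with VF == 4, StartIdx == 0 and a
  // step of 1, Cv is <0, 1, 2, 3>, the multiply below is a no-op, and the add
  // yields Val + <0, 1, 2, 3>, i.e. the induction value for each lane.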
1927 Step = Builder.CreateMul(Cv, Step); 1928 return Builder.CreateAdd(Val, Step, "induction"); 1929 } 1930 1931 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 1932 assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); 1933 // Make sure that the pointer does not point to structs. 1934 if (Ptr->getType()->getPointerElementType()->isAggregateType()) 1935 return 0; 1936 1937 // If this value is a pointer induction variable we know it is consecutive. 1938 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 1939 if (Phi && Inductions.count(Phi)) { 1940 InductionInfo II = Inductions[Phi]; 1941 return II.getConsecutiveDirection(); 1942 } 1943 1944 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 1945 if (!Gep) 1946 return 0; 1947 1948 unsigned NumOperands = Gep->getNumOperands(); 1949 Value *GpPtr = Gep->getPointerOperand(); 1950 // If this GEP value is a consecutive pointer induction variable and all of 1951 // the indices are constant then we know it is consecutive. We can 1952 Phi = dyn_cast<PHINode>(GpPtr); 1953 if (Phi && Inductions.count(Phi)) { 1954 1955 // Make sure that the pointer does not point to structs. 1956 PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); 1957 if (GepPtrType->getElementType()->isAggregateType()) 1958 return 0; 1959 1960 // Make sure that all of the index operands are loop invariant. 1961 for (unsigned i = 1; i < NumOperands; ++i) 1962 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 1963 return 0; 1964 1965 InductionInfo II = Inductions[Phi]; 1966 return II.getConsecutiveDirection(); 1967 } 1968 1969 unsigned InductionOperand = getGEPInductionOperand(Gep); 1970 1971 // Check that all of the gep indices are uniform except for our induction 1972 // operand. 1973 for (unsigned i = 0; i != NumOperands; ++i) 1974 if (i != InductionOperand && 1975 !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 1976 return 0; 1977 1978 // We can emit wide load/stores only if the last non-zero index is the 1979 // induction variable. 1980 const SCEV *Last = nullptr; 1981 if (!Strides.count(Gep)) 1982 Last = SE->getSCEV(Gep->getOperand(InductionOperand)); 1983 else { 1984 // Because of the multiplication by a stride we can have a s/zext cast. 1985 // We are going to replace this stride by 1 so the cast is safe to ignore. 1986 // 1987 // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 1988 // %0 = trunc i64 %indvars.iv to i32 1989 // %mul = mul i32 %0, %Stride1 1990 // %idxprom = zext i32 %mul to i64 << Safe cast. 1991 // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom 1992 // 1993 Last = replaceSymbolicStrideSCEV(SE, Strides, 1994 Gep->getOperand(InductionOperand), Gep); 1995 if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) 1996 Last = 1997 (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) 1998 ? C->getOperand() 1999 : Last; 2000 } 2001 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 2002 const SCEV *Step = AR->getStepRecurrence(*SE); 2003 2004 // The memory is consecutive because the last index is consecutive 2005 // and all other indices are loop invariant. 
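  // For illustration: an access such as A[i] has an AddRec step of +1 and
  // returns 1 here, A[N - i] has a step of -1 and returns -1, and any other
  // stride falls through to the non-consecutive (return 0) case.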
2006 if (Step->isOne()) 2007 return 1; 2008 if (Step->isAllOnesValue()) 2009 return -1; 2010 } 2011 2012 return 0; 2013 } 2014 2015 bool LoopVectorizationLegality::isUniform(Value *V) { 2016 return LAI->isUniform(V); 2017 } 2018 2019 InnerLoopVectorizer::VectorParts& 2020 InnerLoopVectorizer::getVectorValue(Value *V) { 2021 assert(V != Induction && "The new induction variable should not be used."); 2022 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2023 2024 // If we have a stride that is replaced by one, do it here. 2025 if (Legal->hasStride(V)) 2026 V = ConstantInt::get(V->getType(), 1); 2027 2028 // If we have this scalar in the map, return it. 2029 if (WidenMap.has(V)) 2030 return WidenMap.get(V); 2031 2032 // If this scalar is unknown, assume that it is a constant or that it is 2033 // loop invariant. Broadcast V and save the value for future uses. 2034 Value *B = getBroadcastInstrs(V); 2035 return WidenMap.splat(V, B); 2036 } 2037 2038 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2039 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2040 SmallVector<Constant*, 8> ShuffleMask; 2041 for (unsigned i = 0; i < VF; ++i) 2042 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2043 2044 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2045 ConstantVector::get(ShuffleMask), 2046 "reverse"); 2047 } 2048 2049 // Get a mask to interleave \p NumVec vectors into a wide vector. 2050 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> 2051 // E.g. For 2 interleaved vectors, if VF is 4, the mask is: 2052 // <0, 4, 1, 5, 2, 6, 3, 7> 2053 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, 2054 unsigned NumVec) { 2055 SmallVector<Constant *, 16> Mask; 2056 for (unsigned i = 0; i < VF; i++) 2057 for (unsigned j = 0; j < NumVec; j++) 2058 Mask.push_back(Builder.getInt32(j * VF + i)); 2059 2060 return ConstantVector::get(Mask); 2061 } 2062 2063 // Get the strided mask starting from index \p Start. 2064 // I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)> 2065 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, 2066 unsigned Stride, unsigned VF) { 2067 SmallVector<Constant *, 16> Mask; 2068 for (unsigned i = 0; i < VF; i++) 2069 Mask.push_back(Builder.getInt32(Start + i * Stride)); 2070 2071 return ConstantVector::get(Mask); 2072 } 2073 2074 // Get a mask of two parts: The first part consists of sequential integers 2075 // starting from 0, The second part consists of UNDEFs. 2076 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> 2077 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, 2078 unsigned NumUndef) { 2079 SmallVector<Constant *, 16> Mask; 2080 for (unsigned i = 0; i < NumInt; i++) 2081 Mask.push_back(Builder.getInt32(i)); 2082 2083 Constant *Undef = UndefValue::get(Builder.getInt32Ty()); 2084 for (unsigned i = 0; i < NumUndef; i++) 2085 Mask.push_back(Undef); 2086 2087 return ConstantVector::get(Mask); 2088 } 2089 2090 // Concatenate two vectors with the same element type. The 2nd vector should 2091 // not have more elements than the 1st vector. If the 2nd vector has less 2092 // elements, extend it with UNDEFs. 
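// For illustration: concatenating a <4 x i32> V1 with a <2 x i32> V2 first
// widens V2 to <4 x i32> using the mask <0, 1, undef, undef>, then shuffles
// V1 against the widened V2 with the mask <0, 1, 2, 3, 4, 5>, producing a
// <6 x i32> result that holds V1's lanes followed by V2's lanes.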
2093 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, 2094 Value *V2) { 2095 VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); 2096 VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); 2097 assert(VecTy1 && VecTy2 && 2098 VecTy1->getScalarType() == VecTy2->getScalarType() && 2099 "Expect two vectors with the same element type"); 2100 2101 unsigned NumElts1 = VecTy1->getNumElements(); 2102 unsigned NumElts2 = VecTy2->getNumElements(); 2103 assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); 2104 2105 if (NumElts1 > NumElts2) { 2106 // Extend with UNDEFs. 2107 Constant *ExtMask = 2108 getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); 2109 V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); 2110 } 2111 2112 Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); 2113 return Builder.CreateShuffleVector(V1, V2, Mask); 2114 } 2115 2116 // Concatenate vectors in the given list. All vectors have the same type. 2117 static Value *ConcatenateVectors(IRBuilder<> &Builder, 2118 ArrayRef<Value *> InputList) { 2119 unsigned NumVec = InputList.size(); 2120 assert(NumVec > 1 && "Should be at least two vectors"); 2121 2122 SmallVector<Value *, 8> ResList; 2123 ResList.append(InputList.begin(), InputList.end()); 2124 do { 2125 SmallVector<Value *, 8> TmpList; 2126 for (unsigned i = 0; i < NumVec - 1; i += 2) { 2127 Value *V0 = ResList[i], *V1 = ResList[i + 1]; 2128 assert((V0->getType() == V1->getType() || i == NumVec - 2) && 2129 "Only the last vector may have a different type"); 2130 2131 TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); 2132 } 2133 2134 // Push the last vector if the total number of vectors is odd. 2135 if (NumVec % 2 != 0) 2136 TmpList.push_back(ResList[NumVec - 1]); 2137 2138 ResList = TmpList; 2139 NumVec = ResList.size(); 2140 } while (NumVec > 1); 2141 2142 return ResList[0]; 2143 } 2144 2145 // Try to vectorize the interleave group that \p Instr belongs to. 2146 // 2147 // E.g. Translate following interleaved load group (factor = 3): 2148 // for (i = 0; i < N; i+=3) { 2149 // R = Pic[i]; // Member of index 0 2150 // G = Pic[i+1]; // Member of index 1 2151 // B = Pic[i+2]; // Member of index 2 2152 // ... // do something to R, G, B 2153 // } 2154 // To: 2155 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2156 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2157 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2158 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2159 // 2160 // Or translate following interleaved store group (factor = 3): 2161 // for (i = 0; i < N; i+=3) { 2162 // ... do something to R, G, B 2163 // Pic[i] = R; // Member of index 0 2164 // Pic[i+1] = G; // Member of index 1 2165 // Pic[i+2] = B; // Member of index 2 2166 // } 2167 // To: 2168 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2169 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2170 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2171 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2172 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2173 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { 2174 const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); 2175 assert(Group && "Fail to get an interleaved access group."); 2176 2177 // Skip if current instruction is not the insert position. 
2178 if (Instr != Group->getInsertPos()) 2179 return; 2180 2181 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2182 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2183 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 2184 2185 // Prepare for the vector type of the interleaved load/store. 2186 Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 2187 unsigned InterleaveFactor = Group->getFactor(); 2188 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2189 Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); 2190 2191 // Prepare for the new pointers. 2192 setDebugLocFromInst(Builder, Ptr); 2193 VectorParts &PtrParts = getVectorValue(Ptr); 2194 SmallVector<Value *, 2> NewPtrs; 2195 unsigned Index = Group->getIndex(Instr); 2196 for (unsigned Part = 0; Part < UF; Part++) { 2197 // Extract the pointer for current instruction from the pointer vector. A 2198 // reverse access uses the pointer in the last lane. 2199 Value *NewPtr = Builder.CreateExtractElement( 2200 PtrParts[Part], 2201 Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); 2202 2203 // Notice current instruction could be any index. Need to adjust the address 2204 // to the member of index 0. 2205 // 2206 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2207 // b = A[i]; // Member of index 0 2208 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2209 // 2210 // E.g. A[i+1] = a; // Member of index 1 2211 // A[i] = b; // Member of index 0 2212 // A[i+2] = c; // Member of index 2 (Current instruction) 2213 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2214 NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); 2215 2216 // Cast to the vector pointer type. 2217 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2218 } 2219 2220 setDebugLocFromInst(Builder, Instr); 2221 Value *UndefVec = UndefValue::get(VecTy); 2222 2223 // Vectorize the interleaved load group. 2224 if (LI) { 2225 for (unsigned Part = 0; Part < UF; Part++) { 2226 Instruction *NewLoadInstr = Builder.CreateAlignedLoad( 2227 NewPtrs[Part], Group->getAlignment(), "wide.vec"); 2228 2229 for (unsigned i = 0; i < InterleaveFactor; i++) { 2230 Instruction *Member = Group->getMember(i); 2231 2232 // Skip the gaps in the group. 2233 if (!Member) 2234 continue; 2235 2236 Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); 2237 Value *StridedVec = Builder.CreateShuffleVector( 2238 NewLoadInstr, UndefVec, StrideMask, "strided.vec"); 2239 2240 // If this member has different type, cast the result type. 2241 if (Member->getType() != ScalarTy) { 2242 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2243 StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); 2244 } 2245 2246 VectorParts &Entry = WidenMap.get(Member); 2247 Entry[Part] = 2248 Group->isReverse() ? reverseVector(StridedVec) : StridedVec; 2249 } 2250 2251 propagateMetadata(NewLoadInstr, Instr); 2252 } 2253 return; 2254 } 2255 2256 // The sub vector type for current instruction. 2257 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2258 2259 // Vectorize the interleaved store group. 2260 for (unsigned Part = 0; Part < UF; Part++) { 2261 // Collect the stored vector from each member. 
2262 SmallVector<Value *, 4> StoredVecs; 2263 for (unsigned i = 0; i < InterleaveFactor; i++) { 2264 // Interleaved store group doesn't allow a gap, so each index has a member 2265 Instruction *Member = Group->getMember(i); 2266 assert(Member && "Fail to get a member from an interleaved store group"); 2267 2268 Value *StoredVec = 2269 getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part]; 2270 if (Group->isReverse()) 2271 StoredVec = reverseVector(StoredVec); 2272 2273 // If this member has different type, cast it to an unified type. 2274 if (StoredVec->getType() != SubVT) 2275 StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); 2276 2277 StoredVecs.push_back(StoredVec); 2278 } 2279 2280 // Concatenate all vectors into a wide vector. 2281 Value *WideVec = ConcatenateVectors(Builder, StoredVecs); 2282 2283 // Interleave the elements in the wide vector. 2284 Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); 2285 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2286 "interleaved.vec"); 2287 2288 Instruction *NewStoreInstr = 2289 Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); 2290 propagateMetadata(NewStoreInstr, Instr); 2291 } 2292 } 2293 2294 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { 2295 // Attempt to issue a wide load. 2296 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2297 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2298 2299 assert((LI || SI) && "Invalid Load/Store instruction"); 2300 2301 // Try to vectorize the interleave group if this access is interleaved. 2302 if (Legal->isAccessInterleaved(Instr)) 2303 return vectorizeInterleaveGroup(Instr); 2304 2305 Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 2306 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2307 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 2308 unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); 2309 // An alignment of 0 means target abi alignment. We need to use the scalar's 2310 // target abi alignment in such a case. 2311 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2312 if (!Alignment) 2313 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2314 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2315 unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); 2316 unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; 2317 2318 if (SI && Legal->blockNeedsPredication(SI->getParent()) && 2319 !Legal->isMaskRequired(SI)) 2320 return scalarizeInstruction(Instr, true); 2321 2322 if (ScalarAllocatedSize != VectorElementSize) 2323 return scalarizeInstruction(Instr); 2324 2325 // If the pointer is loop invariant or if it is non-consecutive, 2326 // scalarize the load. 2327 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 2328 bool Reverse = ConsecutiveStride < 0; 2329 bool UniformLoad = LI && Legal->isUniform(Ptr); 2330 if (!ConsecutiveStride || UniformLoad) 2331 return scalarizeInstruction(Instr); 2332 2333 Constant *Zero = Builder.getInt32(0); 2334 VectorParts &Entry = WidenMap.get(Instr); 2335 2336 // Handle consecutive loads/stores. 
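  // For illustration: for a consecutive access such as A[i], the code below
  // rebuilds the GEP so that its induction operand is the scalar index of the
  // first lane of the current vector iteration; each unroll part then issues
  // a single wide load or store covering VF consecutive elements.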
2337 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 2338 if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { 2339 setDebugLocFromInst(Builder, Gep); 2340 Value *PtrOperand = Gep->getPointerOperand(); 2341 Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; 2342 FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); 2343 2344 // Create the new GEP with the new induction variable. 2345 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 2346 Gep2->setOperand(0, FirstBasePtr); 2347 Gep2->setName("gep.indvar.base"); 2348 Ptr = Builder.Insert(Gep2); 2349 } else if (Gep) { 2350 setDebugLocFromInst(Builder, Gep); 2351 assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), 2352 OrigLoop) && "Base ptr must be invariant"); 2353 2354 // The last index does not have to be the induction. It can be 2355 // consecutive and be a function of the index. For example A[I+1]; 2356 unsigned NumOperands = Gep->getNumOperands(); 2357 unsigned InductionOperand = getGEPInductionOperand(Gep); 2358 // Create the new GEP with the new induction variable. 2359 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 2360 2361 for (unsigned i = 0; i < NumOperands; ++i) { 2362 Value *GepOperand = Gep->getOperand(i); 2363 Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); 2364 2365 // Update last index or loop invariant instruction anchored in loop. 2366 if (i == InductionOperand || 2367 (GepOperandInst && OrigLoop->contains(GepOperandInst))) { 2368 assert((i == InductionOperand || 2369 SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && 2370 "Must be last index or loop invariant"); 2371 2372 VectorParts &GEPParts = getVectorValue(GepOperand); 2373 Value *Index = GEPParts[0]; 2374 Index = Builder.CreateExtractElement(Index, Zero); 2375 Gep2->setOperand(i, Index); 2376 Gep2->setName("gep.indvar.idx"); 2377 } 2378 } 2379 Ptr = Builder.Insert(Gep2); 2380 } else { 2381 // Use the induction element ptr. 2382 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 2383 setDebugLocFromInst(Builder, Ptr); 2384 VectorParts &PtrVal = getVectorValue(Ptr); 2385 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 2386 } 2387 2388 VectorParts Mask = createBlockInMask(Instr->getParent()); 2389 // Handle Stores: 2390 if (SI) { 2391 assert(!Legal->isUniform(SI->getPointerOperand()) && 2392 "We do not allow storing to uniform addresses"); 2393 setDebugLocFromInst(Builder, SI); 2394 // We don't want to update the value in the map as it might be used in 2395 // another expression. So don't use a reference type for "StoredVal". 2396 VectorParts StoredVal = getVectorValue(SI->getValueOperand()); 2397 2398 for (unsigned Part = 0; Part < UF; ++Part) { 2399 // Calculate the pointer for the specific unroll-part. 2400 Value *PartPtr = 2401 Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); 2402 2403 if (Reverse) { 2404 // If we store to reverse consecutive memory locations, then we need 2405 // to reverse the order of elements in the stored value. 2406 StoredVal[Part] = reverseVector(StoredVal[Part]); 2407 // If the address is consecutive but reversed, then the 2408 // wide store needs to start at the last vector element. 
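        // For illustration (hypothetical values): with VF == 4 and Part == 0,
        // the two GEPs below offset the pointer by 0 and then by 1 - 4 = -3
        // elements, so the wide store covers the four addresses ending at Ptr
        // and the reversed value vector places each element at the address
        // the scalar loop would have written it to.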
2409 PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); 2410 PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); 2411 Mask[Part] = reverseVector(Mask[Part]); 2412 } 2413 2414 Value *VecPtr = Builder.CreateBitCast(PartPtr, 2415 DataTy->getPointerTo(AddressSpace)); 2416 2417 Instruction *NewSI; 2418 if (Legal->isMaskRequired(SI)) 2419 NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, 2420 Mask[Part]); 2421 else 2422 NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); 2423 propagateMetadata(NewSI, SI); 2424 } 2425 return; 2426 } 2427 2428 // Handle loads. 2429 assert(LI && "Must have a load instruction"); 2430 setDebugLocFromInst(Builder, LI); 2431 for (unsigned Part = 0; Part < UF; ++Part) { 2432 // Calculate the pointer for the specific unroll-part. 2433 Value *PartPtr = 2434 Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); 2435 2436 if (Reverse) { 2437 // If the address is consecutive but reversed, then the 2438 // wide load needs to start at the last vector element. 2439 PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); 2440 PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); 2441 Mask[Part] = reverseVector(Mask[Part]); 2442 } 2443 2444 Instruction* NewLI; 2445 Value *VecPtr = Builder.CreateBitCast(PartPtr, 2446 DataTy->getPointerTo(AddressSpace)); 2447 if (Legal->isMaskRequired(LI)) 2448 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2449 UndefValue::get(DataTy), 2450 "wide.masked.load"); 2451 else 2452 NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); 2453 propagateMetadata(NewLI, LI); 2454 Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; 2455 } 2456 } 2457 2458 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { 2459 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2460 // Holds vector parameters or scalars, in case of uniform vals. 2461 SmallVector<VectorParts, 4> Params; 2462 2463 setDebugLocFromInst(Builder, Instr); 2464 2465 // Find all of the vectorized parameters. 2466 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2467 Value *SrcOp = Instr->getOperand(op); 2468 2469 // If we are accessing the old induction variable, use the new one. 2470 if (SrcOp == OldInduction) { 2471 Params.push_back(getVectorValue(SrcOp)); 2472 continue; 2473 } 2474 2475 // Try using previously calculated values. 2476 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 2477 2478 // If the src is an instruction that appeared earlier in the basic block, 2479 // then it should already be vectorized. 2480 if (SrcInst && OrigLoop->contains(SrcInst)) { 2481 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 2482 // The parameter is a vector value from earlier. 2483 Params.push_back(WidenMap.get(SrcInst)); 2484 } else { 2485 // The parameter is a scalar from outside the loop. Maybe even a constant. 2486 VectorParts Scalars; 2487 Scalars.append(UF, SrcOp); 2488 Params.push_back(Scalars); 2489 } 2490 } 2491 2492 assert(Params.size() == Instr->getNumOperands() && 2493 "Invalid number of operands"); 2494 2495 // Does this instruction return a value ? 2496 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2497 2498 Value *UndefVec = IsVoidRetTy ? nullptr : 2499 UndefValue::get(VectorType::get(Instr->getType(), VF)); 2500 // Create a new entry in the WidenMap and initialize it to Undef or Null. 
2501 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 2502 2503 Instruction *InsertPt = Builder.GetInsertPoint(); 2504 BasicBlock *IfBlock = Builder.GetInsertBlock(); 2505 BasicBlock *CondBlock = nullptr; 2506 2507 VectorParts Cond; 2508 Loop *VectorLp = nullptr; 2509 if (IfPredicateStore) { 2510 assert(Instr->getParent()->getSinglePredecessor() && 2511 "Only support single predecessor blocks"); 2512 Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), 2513 Instr->getParent()); 2514 VectorLp = LI->getLoopFor(IfBlock); 2515 assert(VectorLp && "Must have a loop for this block"); 2516 } 2517 2518 // For each vector unroll 'part': 2519 for (unsigned Part = 0; Part < UF; ++Part) { 2520 // For each scalar that we create: 2521 for (unsigned Width = 0; Width < VF; ++Width) { 2522 2523 // Start if-block. 2524 Value *Cmp = nullptr; 2525 if (IfPredicateStore) { 2526 Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); 2527 Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); 2528 CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); 2529 LoopVectorBody.push_back(CondBlock); 2530 VectorLp->addBasicBlockToLoop(CondBlock, *LI); 2531 // Update Builder with newly created basic block. 2532 Builder.SetInsertPoint(InsertPt); 2533 } 2534 2535 Instruction *Cloned = Instr->clone(); 2536 if (!IsVoidRetTy) 2537 Cloned->setName(Instr->getName() + ".cloned"); 2538 // Replace the operands of the cloned instructions with extracted scalars. 2539 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2540 Value *Op = Params[op][Part]; 2541 // Param is a vector. Need to extract the right lane. 2542 if (Op->getType()->isVectorTy()) 2543 Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); 2544 Cloned->setOperand(op, Op); 2545 } 2546 2547 // Place the cloned scalar in the new loop. 2548 Builder.Insert(Cloned); 2549 2550 // If the original scalar returns a value we need to place it in a vector 2551 // so that future users will be able to use it. 2552 if (!IsVoidRetTy) 2553 VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, 2554 Builder.getInt32(Width)); 2555 // End if-block. 2556 if (IfPredicateStore) { 2557 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); 2558 LoopVectorBody.push_back(NewIfBlock); 2559 VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); 2560 Builder.SetInsertPoint(InsertPt); 2561 ReplaceInstWithInst(IfBlock->getTerminator(), 2562 BranchInst::Create(CondBlock, NewIfBlock, Cmp)); 2563 IfBlock = NewIfBlock; 2564 } 2565 } 2566 } 2567 } 2568 2569 static Instruction *getFirstInst(Instruction *FirstInst, Value *V, 2570 Instruction *Loc) { 2571 if (FirstInst) 2572 return FirstInst; 2573 if (Instruction *I = dyn_cast<Instruction>(V)) 2574 return I->getParent() == Loc->getParent() ? I : nullptr; 2575 return nullptr; 2576 } 2577 2578 std::pair<Instruction *, Instruction *> 2579 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { 2580 Instruction *tnullptr = nullptr; 2581 if (!Legal->mustCheckStrides()) 2582 return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); 2583 2584 IRBuilder<> ChkBuilder(Loc); 2585 2586 // Emit checks. 
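  // For illustration: with two symbolic strides S1 and S2, the code below
  // builds roughly ((S1 != 1) | (S2 != 1)); when that predicate is true at
  // run time, the vector loop is bypassed and the scalar loop runs instead.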
2587 Value *Check = nullptr; 2588 Instruction *FirstInst = nullptr; 2589 for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), 2590 SE = Legal->strides_end(); 2591 SI != SE; ++SI) { 2592 Value *Ptr = stripIntegerCast(*SI); 2593 Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), 2594 "stride.chk"); 2595 // Store the first instruction we create. 2596 FirstInst = getFirstInst(FirstInst, C, Loc); 2597 if (Check) 2598 Check = ChkBuilder.CreateOr(Check, C); 2599 else 2600 Check = C; 2601 } 2602 2603 // We have to do this trickery because the IRBuilder might fold the check to a 2604 // constant expression in which case there is no Instruction anchored in a 2605 // the block. 2606 LLVMContext &Ctx = Loc->getContext(); 2607 Instruction *TheCheck = 2608 BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); 2609 ChkBuilder.Insert(TheCheck, "stride.not.one"); 2610 FirstInst = getFirstInst(FirstInst, TheCheck, Loc); 2611 2612 return std::make_pair(FirstInst, TheCheck); 2613 } 2614 2615 void InnerLoopVectorizer::createEmptyLoop() { 2616 /* 2617 In this function we generate a new loop. The new loop will contain 2618 the vectorized instructions while the old loop will continue to run the 2619 scalar remainder. 2620 2621 [ ] <-- Back-edge taken count overflow check. 2622 / | 2623 / v 2624 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2625 | / | 2626 | / v 2627 || [ ] <-- vector pre header. 2628 || | 2629 || v 2630 || [ ] \ 2631 || [ ]_| <-- vector loop. 2632 || | 2633 | \ v 2634 | >[ ] <--- middle-block. 2635 | / | 2636 | / v 2637 -|- >[ ] <--- new preheader. 2638 | | 2639 | v 2640 | [ ] \ 2641 | [ ]_| <-- old scalar loop to handle remainder. 2642 \ | 2643 \ v 2644 >[ ] <-- exit block. 2645 ... 2646 */ 2647 2648 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2649 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2650 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2651 assert(VectorPH && "Invalid loop structure"); 2652 assert(ExitBlock && "Must have an exit block"); 2653 2654 // Some loops have a single integer induction variable, while other loops 2655 // don't. One example is c++ iterators that often have multiple pointer 2656 // induction variables. In the code below we also support a case where we 2657 // don't have a single induction variable. 2658 OldInduction = Legal->getInduction(); 2659 Type *IdxTy = Legal->getWidestInductionType(); 2660 2661 // Find the loop boundaries. 2662 const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); 2663 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 2664 2665 // The exit count might have the type of i64 while the phi is i32. This can 2666 // happen if we have an induction variable that is sign extended before the 2667 // compare. The only way that we get a backedge taken count is that the 2668 // induction variable was signed and as such will not overflow. In such a case 2669 // truncation is legal. 2670 if (ExitCount->getType()->getPrimitiveSizeInBits() > 2671 IdxTy->getPrimitiveSizeInBits()) 2672 ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); 2673 2674 const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); 2675 // Get the total trip count from the count by adding 1. 2676 ExitCount = SE->getAddExpr(BackedgeTakeCount, 2677 SE->getConstant(BackedgeTakeCount->getType(), 1)); 2678 2679 const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); 2680 2681 // Expand the trip count and place the new instructions in the preheader. 
2682 // Notice that the pre-header does not change, only the loop body. 2683 SCEVExpander Exp(*SE, DL, "induction"); 2684 2685 // We need to test whether the backedge-taken count is uint##_max. Adding one 2686 // to it will cause overflow and an incorrect loop trip count in the vector 2687 // body. In case of overflow we want to directly jump to the scalar remainder 2688 // loop. 2689 Value *BackedgeCount = 2690 Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), 2691 VectorPH->getTerminator()); 2692 if (BackedgeCount->getType()->isPointerTy()) 2693 BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, 2694 "backedge.ptrcnt.to.int", 2695 VectorPH->getTerminator()); 2696 Instruction *CheckBCOverflow = 2697 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, 2698 Constant::getAllOnesValue(BackedgeCount->getType()), 2699 "backedge.overflow", VectorPH->getTerminator()); 2700 2701 // The loop index does not have to start at Zero. Find the original start 2702 // value from the induction PHI node. If we don't have an induction variable 2703 // then we know that it starts at zero. 2704 Builder.SetInsertPoint(VectorPH->getTerminator()); 2705 Value *StartIdx = ExtendedIdx = 2706 OldInduction 2707 ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), 2708 IdxTy) 2709 : ConstantInt::get(IdxTy, 0); 2710 2711 // Count holds the overall loop count (N). 2712 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2713 VectorPH->getTerminator()); 2714 2715 LoopBypassBlocks.push_back(VectorPH); 2716 2717 // Split the single block loop into the two loop structure described above. 2718 BasicBlock *VecBody = 2719 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2720 BasicBlock *MiddleBlock = 2721 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2722 BasicBlock *ScalarPH = 2723 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2724 2725 // Create and register the new vector loop. 2726 Loop* Lp = new Loop(); 2727 Loop *ParentLoop = OrigLoop->getParentLoop(); 2728 2729 // Insert the new loop into the loop nest and register the new basic blocks 2730 // before calling any utilities such as SCEV that require valid LoopInfo. 2731 if (ParentLoop) { 2732 ParentLoop->addChildLoop(Lp); 2733 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2734 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2735 } else { 2736 LI->addTopLevelLoop(Lp); 2737 } 2738 Lp->addBasicBlockToLoop(VecBody, *LI); 2739 2740 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 2741 // inside the loop. 2742 Builder.SetInsertPoint(VecBody->getFirstNonPHI()); 2743 2744 // Generate the induction variable. 2745 setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); 2746 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 2747 // The loop step is equal to the vectorization factor (num of SIMD elements) 2748 // times the unroll factor (num of SIMD instructions). 2749 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2750 2751 // Generate code to check that the loop's trip count that we computed by 2752 // adding one to the backedge-taken count will not overflow. 
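  // For illustration (hypothetical values): if the backedge-taken count were
  // an i8 with value 255, adding one would wrap to 0, so CheckBCOverflow
  // compares the expanded count against the all-ones value and branches
  // straight to the scalar loop when they match.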
2753 BasicBlock *NewVectorPH = 2754 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); 2755 if (ParentLoop) 2756 ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); 2757 ReplaceInstWithInst( 2758 VectorPH->getTerminator(), 2759 BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); 2760 VectorPH = NewVectorPH; 2761 2762 // This is the IR builder that we use to add all of the logic for bypassing 2763 // the new vector loop. 2764 IRBuilder<> BypassBuilder(VectorPH->getTerminator()); 2765 setDebugLocFromInst(BypassBuilder, 2766 getDebugLocFromInstOrOperands(OldInduction)); 2767 2768 // We may need to extend the index in case there is a type mismatch. 2769 // We know that the count starts at zero and does not overflow. 2770 if (Count->getType() != IdxTy) { 2771 // The exit count can be of pointer type. Convert it to the correct 2772 // integer type. 2773 if (ExitCount->getType()->isPointerTy()) 2774 Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); 2775 else 2776 Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); 2777 } 2778 2779 // Add the start index to the loop count to get the new end index. 2780 Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); 2781 2782 // Now we need to generate the expression for N - (N % VF), which is 2783 // the part that the vectorized body will execute. 2784 Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); 2785 Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); 2786 Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, 2787 "end.idx.rnd.down"); 2788 2789 // Now, compare the new count to zero. If it is zero skip the vector loop and 2790 // jump to the scalar loop. 2791 Value *Cmp = 2792 BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); 2793 NewVectorPH = 2794 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); 2795 if (ParentLoop) 2796 ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); 2797 LoopBypassBlocks.push_back(VectorPH); 2798 ReplaceInstWithInst(VectorPH->getTerminator(), 2799 BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); 2800 VectorPH = NewVectorPH; 2801 2802 // Generate the code to check that the strides we assumed to be one are really 2803 // one. We want the new basic block to start at the first instruction in a 2804 // sequence of instructions that form a check. 2805 Instruction *StrideCheck; 2806 Instruction *FirstCheckInst; 2807 std::tie(FirstCheckInst, StrideCheck) = 2808 addStrideCheck(VectorPH->getTerminator()); 2809 if (StrideCheck) { 2810 AddedSafetyChecks = true; 2811 // Create a new block containing the stride check. 2812 VectorPH->setName("vector.stridecheck"); 2813 NewVectorPH = 2814 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); 2815 if (ParentLoop) 2816 ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); 2817 LoopBypassBlocks.push_back(VectorPH); 2818 2819 // Replace the branch into the memory check block with a conditional branch 2820 // for the "few elements case". 2821 ReplaceInstWithInst( 2822 VectorPH->getTerminator(), 2823 BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); 2824 2825 VectorPH = NewVectorPH; 2826 } 2827 2828 // Generate the code that checks in runtime if arrays overlap. We put the 2829 // checks into a separate block to make the more common case of few elements 2830 // faster. 
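  // For illustration: for a loop like "A[i] = B[i] + C[i]" with pointers that
  // may alias, the emitted checks compare the address ranges the loop touches
  // through A, B and C; if any two ranges may overlap, the vector loop is
  // bypassed and the scalar loop runs instead.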
2831 Instruction *MemRuntimeCheck; 2832 std::tie(FirstCheckInst, MemRuntimeCheck) = 2833 Legal->getLAI()->addRuntimeChecks(VectorPH->getTerminator()); 2834 if (MemRuntimeCheck) { 2835 AddedSafetyChecks = true; 2836 // Create a new block containing the memory check. 2837 VectorPH->setName("vector.memcheck"); 2838 NewVectorPH = 2839 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); 2840 if (ParentLoop) 2841 ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); 2842 LoopBypassBlocks.push_back(VectorPH); 2843 2844 // Replace the branch into the memory check block with a conditional branch 2845 // for the "few elements case". 2846 ReplaceInstWithInst( 2847 VectorPH->getTerminator(), 2848 BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); 2849 2850 VectorPH = NewVectorPH; 2851 } 2852 2853 // We are going to resume the execution of the scalar loop. 2854 // Go over all of the induction variables that we found and fix the 2855 // PHIs that are left in the scalar version of the loop. 2856 // The starting values of PHI nodes depend on the counter of the last 2857 // iteration in the vectorized loop. 2858 // If we come from a bypass edge then we need to start from the original 2859 // start value. 2860 2861 // This variable saves the new starting index for the scalar loop. 2862 PHINode *ResumeIndex = nullptr; 2863 LoopVectorizationLegality::InductionList::iterator I, E; 2864 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 2865 // Set builder to point to last bypass block. 2866 BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); 2867 for (I = List->begin(), E = List->end(); I != E; ++I) { 2868 PHINode *OrigPhi = I->first; 2869 LoopVectorizationLegality::InductionInfo II = I->second; 2870 2871 Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); 2872 PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", 2873 MiddleBlock->getTerminator()); 2874 // We might have extended the type of the induction variable but we need a 2875 // truncated version for the scalar loop. 2876 PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? 2877 PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", 2878 MiddleBlock->getTerminator()) : nullptr; 2879 2880 // Create phi nodes to merge from the backedge-taken check block. 2881 PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", 2882 ScalarPH->getTerminator()); 2883 BCResumeVal->addIncoming(ResumeVal, MiddleBlock); 2884 2885 PHINode *BCTruncResumeVal = nullptr; 2886 if (OrigPhi == OldInduction) { 2887 BCTruncResumeVal = 2888 PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", 2889 ScalarPH->getTerminator()); 2890 BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); 2891 } 2892 2893 Value *EndValue = nullptr; 2894 switch (II.IK) { 2895 case LoopVectorizationLegality::IK_NoInduction: 2896 llvm_unreachable("Unknown induction"); 2897 case LoopVectorizationLegality::IK_IntInduction: { 2898 // Handle the integer induction counter. 2899 assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); 2900 2901 // We have the canonical induction variable. 2902 if (OrigPhi == OldInduction) { 2903 // Create a truncated version of the resume value for the scalar loop, 2904 // we might have promoted the type to a larger width. 2905 EndValue = 2906 BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); 2907 // The new PHI merges the original incoming value, in case of a bypass, 2908 // or the value at the end of the vectorized loop. 
2909 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 2910 TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); 2911 TruncResumeVal->addIncoming(EndValue, VecBody); 2912 2913 BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); 2914 2915 // We know what the end value is. 2916 EndValue = IdxEndRoundDown; 2917 // We also know which PHI node holds it. 2918 ResumeIndex = ResumeVal; 2919 break; 2920 } 2921 2922 // Not the canonical induction variable - add the vector loop count to the 2923 // start value. 2924 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, 2925 II.StartValue->getType(), 2926 "cast.crd"); 2927 EndValue = II.transform(BypassBuilder, CRD); 2928 EndValue->setName("ind.end"); 2929 break; 2930 } 2931 case LoopVectorizationLegality::IK_PtrInduction: { 2932 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, 2933 II.StepValue->getType(), 2934 "cast.crd"); 2935 EndValue = II.transform(BypassBuilder, CRD); 2936 EndValue->setName("ptr.ind.end"); 2937 break; 2938 } 2939 }// end of case 2940 2941 // The new PHI merges the original incoming value, in case of a bypass, 2942 // or the value at the end of the vectorized loop. 2943 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { 2944 if (OrigPhi == OldInduction) 2945 ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); 2946 else 2947 ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); 2948 } 2949 ResumeVal->addIncoming(EndValue, VecBody); 2950 2951 // Fix the scalar body counter (PHI node). 2952 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 2953 2954 // The old induction's phi node in the scalar body needs the truncated 2955 // value. 2956 if (OrigPhi == OldInduction) { 2957 BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); 2958 OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); 2959 } else { 2960 BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); 2961 OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); 2962 } 2963 } 2964 2965 // If we are generating a new induction variable then we also need to 2966 // generate the code that calculates the exit value. This value is not 2967 // simply the end of the counter because we may skip the vectorized body 2968 // in case of a runtime check. 2969 if (!OldInduction){ 2970 assert(!ResumeIndex && "Unexpected resume value found"); 2971 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 2972 MiddleBlock->getTerminator()); 2973 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 2974 ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); 2975 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 2976 } 2977 2978 // Make sure that we found the index where scalar loop needs to continue. 2979 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 2980 "Invalid resume Index"); 2981 2982 // Add a check in the middle block to see if we have completed 2983 // all of the iterations in the first vector loop. 2984 // If (N - N%VF) == N, then we *don't* need to run the remainder. 2985 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 2986 ResumeIndex, "cmp.n", 2987 MiddleBlock->getTerminator()); 2988 ReplaceInstWithInst(MiddleBlock->getTerminator(), 2989 BranchInst::Create(ExitBlock, ScalarPH, CmpN)); 2990 2991 // Create i+1 and fill the PHINode. 2992 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 2993 Induction->addIncoming(StartIdx, VectorPH); 2994 Induction->addIncoming(NextIdx, VecBody); 2995 // Create the compare. 
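// The latch of the vector loop then ends with, roughly (iN is the
// induction type IdxTy):
//   %cmp = icmp eq iN %index.next, %end.idx.rnd.down
//   br i1 %cmp, label %middle.block, label %vector.body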
2996 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 2997 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 2998 2999 // Now we have two terminators. Remove the old one from the block. 3000 VecBody->getTerminator()->eraseFromParent(); 3001 3002 // Get ready to start creating new instructions into the vectorized body. 3003 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 3004 3005 // Save the state. 3006 LoopVectorPreHeader = VectorPH; 3007 LoopScalarPreHeader = ScalarPH; 3008 LoopMiddleBlock = MiddleBlock; 3009 LoopExitBlock = ExitBlock; 3010 LoopVectorBody.push_back(VecBody); 3011 LoopScalarBody = OldBasicBlock; 3012 3013 LoopVectorizeHints Hints(Lp, true); 3014 Hints.setAlreadyVectorized(); 3015 } 3016 3017 namespace { 3018 struct CSEDenseMapInfo { 3019 static bool canHandle(Instruction *I) { 3020 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3021 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3022 } 3023 static inline Instruction *getEmptyKey() { 3024 return DenseMapInfo<Instruction *>::getEmptyKey(); 3025 } 3026 static inline Instruction *getTombstoneKey() { 3027 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3028 } 3029 static unsigned getHashValue(Instruction *I) { 3030 assert(canHandle(I) && "Unknown instruction!"); 3031 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3032 I->value_op_end())); 3033 } 3034 static bool isEqual(Instruction *LHS, Instruction *RHS) { 3035 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3036 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3037 return LHS == RHS; 3038 return LHS->isIdenticalTo(RHS); 3039 } 3040 }; 3041 } 3042 3043 /// \brief Check whether this block is a predicated block. 3044 /// Due to if predication of stores we might create a sequence of "if(pred) a[i] 3045 /// = ...; " blocks. We start with one vectorized basic block. For every 3046 /// conditional block we split this vectorized block. Therefore, every second 3047 /// block will be a predicated one. 3048 static bool isPredicatedBlock(unsigned BlockNum) { 3049 return BlockNum % 2; 3050 } 3051 3052 ///\brief Perform cse of induction variable instructions. 3053 static void cse(SmallVector<BasicBlock *, 4> &BBs) { 3054 // Perform simple cse. 3055 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3056 for (unsigned i = 0, e = BBs.size(); i != e; ++i) { 3057 BasicBlock *BB = BBs[i]; 3058 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3059 Instruction *In = I++; 3060 3061 if (!CSEDenseMapInfo::canHandle(In)) 3062 continue; 3063 3064 // Check if we can replace this instruction with any of the 3065 // visited instructions. 3066 if (Instruction *V = CSEMap.lookup(In)) { 3067 In->replaceAllUsesWith(V); 3068 In->eraseFromParent(); 3069 continue; 3070 } 3071 // Ignore instructions in conditional blocks. We create "if (pred) a[i] = 3072 // ...;" blocks for predicated stores. Every second block is a predicated 3073 // block. 3074 if (isPredicatedBlock(i)) 3075 continue; 3076 3077 CSEMap[In] = In; 3078 } 3079 } 3080 } 3081 3082 /// \brief Adds a 'fast' flag to floating point operations. 3083 static Value *addFastMathFlag(Value *V) { 3084 if (isa<FPMathOperator>(V)){ 3085 FastMathFlags Flags; 3086 Flags.setUnsafeAlgebra(); 3087 cast<Instruction>(V)->setFastMathFlags(Flags); 3088 } 3089 return V; 3090 } 3091 3092 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if 3093 /// the result needs to be inserted and/or extracted from vectors. 
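// For a vector type with N elements this amounts to, approximately:
//   Cost = sum over i in [0, N) of (Insert  ? cost(insertelement,  Ty, i) : 0)
//                                + (Extract ? cost(extractelement, Ty, i) : 0)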
3094 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, 3095 const TargetTransformInfo &TTI) { 3096 if (Ty->isVoidTy()) 3097 return 0; 3098 3099 assert(Ty->isVectorTy() && "Can only scalarize vectors"); 3100 unsigned Cost = 0; 3101 3102 for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { 3103 if (Insert) 3104 Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i); 3105 if (Extract) 3106 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i); 3107 } 3108 3109 return Cost; 3110 } 3111 3112 // Estimate cost of a call instruction CI if it were vectorized with factor VF. 3113 // Return the cost of the instruction, including scalarization overhead if it's 3114 // needed. The flag NeedToScalarize shows if the call needs to be scalarized - 3115 // i.e. either vector version isn't available, or is too expensive. 3116 static unsigned getVectorCallCost(CallInst *CI, unsigned VF, 3117 const TargetTransformInfo &TTI, 3118 const TargetLibraryInfo *TLI, 3119 bool &NeedToScalarize) { 3120 Function *F = CI->getCalledFunction(); 3121 StringRef FnName = CI->getCalledFunction()->getName(); 3122 Type *ScalarRetTy = CI->getType(); 3123 SmallVector<Type *, 4> Tys, ScalarTys; 3124 for (auto &ArgOp : CI->arg_operands()) 3125 ScalarTys.push_back(ArgOp->getType()); 3126 3127 // Estimate cost of scalarized vector call. The source operands are assumed 3128 // to be vectors, so we need to extract individual elements from there, 3129 // execute VF scalar calls, and then gather the result into the vector return 3130 // value. 3131 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3132 if (VF == 1) 3133 return ScalarCallCost; 3134 3135 // Compute corresponding vector type for return value and arguments. 3136 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3137 for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i) 3138 Tys.push_back(ToVectorTy(ScalarTys[i], VF)); 3139 3140 // Compute costs of unpacking argument values for the scalar calls and 3141 // packing the return values to a vector. 3142 unsigned ScalarizationCost = 3143 getScalarizationOverhead(RetTy, true, false, TTI); 3144 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) 3145 ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI); 3146 3147 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3148 3149 // If we can't emit a vector call for this function, then the currently found 3150 // cost is the cost we need to return. 3151 NeedToScalarize = true; 3152 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3153 return Cost; 3154 3155 // If the corresponding vector cost is cheaper, return its cost. 3156 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3157 if (VectorCallCost < Cost) { 3158 NeedToScalarize = false; 3159 return VectorCallCost; 3160 } 3161 return Cost; 3162 } 3163 3164 // Estimate cost of an intrinsic call instruction CI if it were vectorized with 3165 // factor VF. Return the cost of the instruction, including scalarization 3166 // overhead if it's needed. 
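// E.g. (illustrative only) a call recognized as llvm.sqrt.f32 costed at
// VF = 4 is queried as the target's cost of llvm.sqrt with a <4 x float>
// return type and <4 x float> argument types.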
3167 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3168 const TargetTransformInfo &TTI,
3169 const TargetLibraryInfo *TLI) {
3170 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
3171 assert(ID && "Expected intrinsic call!");
3172
3173 Type *RetTy = ToVectorTy(CI->getType(), VF);
3174 SmallVector<Type *, 4> Tys;
3175 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
3176 Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
3177
3178 return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
3179 }
3180
3181 void InnerLoopVectorizer::vectorizeLoop() {
3182 //===------------------------------------------------===//
3183 //
3184 // Notice: any optimization or new instruction that goes
3185 // into the code below should also be implemented in
3186 // the cost-model.
3187 //
3188 //===------------------------------------------------===//
3189 Constant *Zero = Builder.getInt32(0);
3190
3191 // In order to support reduction variables we need to be able to vectorize
3192 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
3193 // stages. First, we create a new vector PHI node with no incoming edges.
3194 // We use this value when we vectorize all of the instructions that use the
3195 // PHI. Next, after all of the instructions in the block are complete we
3196 // add the new incoming edges to the PHI. At this point all of the
3197 // instructions in the basic block are vectorized, so we can use them to
3198 // construct the PHI.
3199 PhiVector RdxPHIsToFix;
3200
3201 // Scan the loop in a topological order to ensure that defs are vectorized
3202 // before users.
3203 LoopBlocksDFS DFS(OrigLoop);
3204 DFS.perform(LI);
3205
3206 // Vectorize all of the blocks in the original loop.
3207 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
3208 be = DFS.endRPO(); bb != be; ++bb)
3209 vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
3210
3211 // At this point every instruction in the original loop is widened to
3212 // a vector form. We are almost done. Now, we need to fix the PHI nodes
3213 // that we vectorized. The PHI nodes are currently empty because we did
3214 // not want to introduce cycles. Notice that the remaining PHI nodes
3215 // that we need to fix are reduction variables.
3216
3217 // Create the 'reduced' values for each of the reduction vars.
3218 // The reduced values are the vector values that we scalarize and combine
3219 // after the loop is finished.
3220 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
3221 it != e; ++it) {
3222 PHINode *RdxPhi = *it;
3223 assert(RdxPhi && "Unable to recover vectorized PHI");
3224
3225 // Find the reduction variable descriptor.
3226 assert(Legal->getReductionVars()->count(RdxPhi) &&
3227 "Unable to find the reduction variable");
3228 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
3229
3230 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3231 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3232 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3233 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3234 RdxDesc.getMinMaxRecurrenceKind();
3235 setDebugLocFromInst(Builder, ReductionStartValue);
3236
3237 // We need to generate a reduction vector from the incoming scalar.
3238 // To do so, we need to generate the 'identity' vector and override
3239 // one of the elements with the incoming scalar reduction. We need
3240 // to do it in the vector-loop preheader.
3241 Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
3242
3243 // This is the vector-clone of the value that leaves the loop.
3244 VectorParts &VectorExit = getVectorValue(LoopExitInst);
3245 Type *VecTy = VectorExit[0]->getType();
3246
3247 // Find the reduction identity variable. Zero for addition, 'or' and 'xor';
3248 // one for multiplication; -1 for 'and'.
3249 Value *Identity;
3250 Value *VectorStart;
3251 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3252 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3253 // MinMax reductions have the start value as their identity.
3254 if (VF == 1) {
3255 VectorStart = Identity = ReductionStartValue;
3256 } else {
3257 VectorStart = Identity =
3258 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3259 }
3260 } else {
3261 // Handle other reduction kinds:
3262 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3263 RK, VecTy->getScalarType());
3264 if (VF == 1) {
3265 Identity = Iden;
3266 // This vector is the Identity vector where the first element is the
3267 // incoming scalar reduction.
3268 VectorStart = ReductionStartValue;
3269 } else {
3270 Identity = ConstantVector::getSplat(VF, Iden);
3271
3272 // This vector is the Identity vector where the first element is the
3273 // incoming scalar reduction.
3274 VectorStart =
3275 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3276 }
3277 }
3278
3279 // Fix the vector-loop phi.
3280
3281 // Reductions do not have to start at zero. They can start with
3282 // any loop invariant values.
3283 VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
3284 BasicBlock *Latch = OrigLoop->getLoopLatch();
3285 Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
3286 VectorParts &Val = getVectorValue(LoopVal);
3287 for (unsigned part = 0; part < UF; ++part) {
3288 // Make sure to add the reduction start value only to the
3289 // first unroll part.
3290 Value *StartVal = (part == 0) ? VectorStart : Identity;
3291 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
3292 LoopVectorPreHeader);
3293 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
3294 LoopVectorBody.back());
3295 }
3296
3297 // Before each round, move the insertion point right between
3298 // the PHIs and the values we are going to write.
3299 // This allows us to write both PHINodes and the extractelement
3300 // instructions.
3301 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
3302
3303 VectorParts RdxParts;
3304 setDebugLocFromInst(Builder, LoopExitInst);
3305 for (unsigned part = 0; part < UF; ++part) {
3306 // This PHINode contains the vectorized reduction variable, or
3307 // the initial value vector, if we bypass the vector loop.
3308 VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
3309 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
3310 Value *StartVal = (part == 0) ? VectorStart : Identity;
3311 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
3312 NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
3313 NewPhi->addIncoming(RdxExitVal[part],
3314 LoopVectorBody.back());
3315 RdxParts.push_back(NewPhi);
3316 }
3317
3318 // Reduce all of the unrolled parts into a single vector.
3319 Value *ReducedPartRdx = RdxParts[0];
3320 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3321 setDebugLocFromInst(Builder, ReducedPartRdx);
3322 for (unsigned part = 1; part < UF; ++part) {
3323 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3324 // Floating point operations had to be 'fast' to enable the reduction.
3325 ReducedPartRdx = addFastMathFlag( 3326 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], 3327 ReducedPartRdx, "bin.rdx")); 3328 else 3329 ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( 3330 Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); 3331 } 3332 3333 if (VF > 1) { 3334 // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 3335 // and vector ops, reducing the set of values being computed by half each 3336 // round. 3337 assert(isPowerOf2_32(VF) && 3338 "Reduction emission only supported for pow2 vectors!"); 3339 Value *TmpVec = ReducedPartRdx; 3340 SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); 3341 for (unsigned i = VF; i != 1; i >>= 1) { 3342 // Move the upper half of the vector to the lower half. 3343 for (unsigned j = 0; j != i/2; ++j) 3344 ShuffleMask[j] = Builder.getInt32(i/2 + j); 3345 3346 // Fill the rest of the mask with undef. 3347 std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 3348 UndefValue::get(Builder.getInt32Ty())); 3349 3350 Value *Shuf = 3351 Builder.CreateShuffleVector(TmpVec, 3352 UndefValue::get(TmpVec->getType()), 3353 ConstantVector::get(ShuffleMask), 3354 "rdx.shuf"); 3355 3356 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3357 // Floating point operations had to be 'fast' to enable the reduction. 3358 TmpVec = addFastMathFlag(Builder.CreateBinOp( 3359 (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); 3360 else 3361 TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, 3362 TmpVec, Shuf); 3363 } 3364 3365 // The result is in the first element of the vector. 3366 ReducedPartRdx = Builder.CreateExtractElement(TmpVec, 3367 Builder.getInt32(0)); 3368 } 3369 3370 // Create a phi node that merges control-flow from the backedge-taken check 3371 // block and the middle block. 3372 PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", 3373 LoopScalarPreHeader->getTerminator()); 3374 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); 3375 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3376 3377 // Now, we need to fix the users of the reduction variable 3378 // inside and outside of the scalar remainder loop. 3379 // We know that the loop is in LCSSA form. We need to update the 3380 // PHI nodes in the exit blocks. 3381 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3382 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3383 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3384 if (!LCSSAPhi) break; 3385 3386 // All PHINodes need to have a single entry edge, or two if 3387 // we already fixed them. 3388 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3389 3390 // We found our reduction value exit-PHI. Update it with the 3391 // incoming bypass edge. 3392 if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { 3393 // Add an edge coming from the bypass. 3394 LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3395 break; 3396 } 3397 }// end of the LCSSA phi scan. 3398 3399 // Fix the scalar loop reduction variable with the incoming reduction sum 3400 // from the vector body and from the backedge value. 3401 int IncomingEdgeBlockIdx = 3402 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3403 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3404 // Pick the other block. 3405 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 3406 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3407 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3408 }// end of for each redux variable. 3409 3410 fixLCSSAPHIs(); 3411 3412 // Remove redundant induction instructions. 3413 cse(LoopVectorBody); 3414 } 3415 3416 void InnerLoopVectorizer::fixLCSSAPHIs() { 3417 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3418 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3419 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3420 if (!LCSSAPhi) break; 3421 if (LCSSAPhi->getNumIncomingValues() == 1) 3422 LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 3423 LoopMiddleBlock); 3424 } 3425 } 3426 3427 InnerLoopVectorizer::VectorParts 3428 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 3429 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 3430 "Invalid edge"); 3431 3432 // Look for cached value. 3433 std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); 3434 EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); 3435 if (ECEntryIt != MaskCache.end()) 3436 return ECEntryIt->second; 3437 3438 VectorParts SrcMask = createBlockInMask(Src); 3439 3440 // The terminator has to be a branch inst! 3441 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 3442 assert(BI && "Unexpected terminator found"); 3443 3444 if (BI->isConditional()) { 3445 VectorParts EdgeMask = getVectorValue(BI->getCondition()); 3446 3447 if (BI->getSuccessor(0) != Dst) 3448 for (unsigned part = 0; part < UF; ++part) 3449 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 3450 3451 for (unsigned part = 0; part < UF; ++part) 3452 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 3453 3454 MaskCache[Edge] = EdgeMask; 3455 return EdgeMask; 3456 } 3457 3458 MaskCache[Edge] = SrcMask; 3459 return SrcMask; 3460 } 3461 3462 InnerLoopVectorizer::VectorParts 3463 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 3464 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 3465 3466 // Loop incoming mask is all-one. 3467 if (OrigLoop->getHeader() == BB) { 3468 Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 3469 return getVectorValue(C); 3470 } 3471 3472 // This is the block mask. We OR all incoming edges, and with zero. 3473 Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 3474 VectorParts BlockMask = getVectorValue(Zero); 3475 3476 // For each pred: 3477 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 3478 VectorParts EM = createEdgeMask(*it, BB); 3479 for (unsigned part = 0; part < UF; ++part) 3480 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 3481 } 3482 3483 return BlockMask; 3484 } 3485 3486 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 3487 InnerLoopVectorizer::VectorParts &Entry, 3488 unsigned UF, unsigned VF, PhiVector *PV) { 3489 PHINode* P = cast<PHINode>(PN); 3490 // Handle reduction variables: 3491 if (Legal->getReductionVars()->count(P)) { 3492 for (unsigned part = 0; part < UF; ++part) { 3493 // This is phase one of vectorizing PHIs. 3494 Type *VecTy = (VF == 1) ? PN->getType() : 3495 VectorType::get(PN->getType(), VF); 3496 Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", 3497 LoopVectorBody.back()-> getFirstInsertionPt()); 3498 } 3499 PV->push_back(P); 3500 return; 3501 } 3502 3503 setDebugLocFromInst(Builder, P); 3504 // Check for PHI nodes that are lowered to vector selects. 
3505 if (P->getParent() != OrigLoop->getHeader()) { 3506 // We know that all PHIs in non-header blocks are converted into 3507 // selects, so we don't have to worry about the insertion order and we 3508 // can just use the builder. 3509 // At this point we generate the predication tree. There may be 3510 // duplications since this is a simple recursive scan, but future 3511 // optimizations will clean it up. 3512 3513 unsigned NumIncoming = P->getNumIncomingValues(); 3514 3515 // Generate a sequence of selects of the form: 3516 // SELECT(Mask3, In3, 3517 // SELECT(Mask2, In2, 3518 // ( ...))) 3519 for (unsigned In = 0; In < NumIncoming; In++) { 3520 VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), 3521 P->getParent()); 3522 VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); 3523 3524 for (unsigned part = 0; part < UF; ++part) { 3525 // We might have single edge PHIs (blocks) - use an identity 3526 // 'select' for the first PHI operand. 3527 if (In == 0) 3528 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3529 In0[part]); 3530 else 3531 // Select between the current value and the previous incoming edge 3532 // based on the incoming mask. 3533 Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3534 Entry[part], "predphi"); 3535 } 3536 } 3537 return; 3538 } 3539 3540 // This PHINode must be an induction variable. 3541 // Make sure that we know about it. 3542 assert(Legal->getInductionVars()->count(P) && 3543 "Not an induction variable"); 3544 3545 LoopVectorizationLegality::InductionInfo II = 3546 Legal->getInductionVars()->lookup(P); 3547 3548 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3549 // which can be found from the original scalar operations. 3550 switch (II.IK) { 3551 case LoopVectorizationLegality::IK_NoInduction: 3552 llvm_unreachable("Unknown induction"); 3553 case LoopVectorizationLegality::IK_IntInduction: { 3554 assert(P->getType() == II.StartValue->getType() && "Types must match"); 3555 Type *PhiTy = P->getType(); 3556 Value *Broadcasted; 3557 if (P == OldInduction) { 3558 // Handle the canonical induction variable. We might have had to 3559 // extend the type. 3560 Broadcasted = Builder.CreateTrunc(Induction, PhiTy); 3561 } else { 3562 // Handle other induction variables that are now based on the 3563 // canonical one. 3564 Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, 3565 "normalized.idx"); 3566 NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); 3567 Broadcasted = II.transform(Builder, NormalizedIdx); 3568 Broadcasted->setName("offset.idx"); 3569 } 3570 Broadcasted = getBroadcastInstrs(Broadcasted); 3571 // After broadcasting the induction variable we need to make the vector 3572 // consecutive by adding 0, 1, 2, etc. 3573 for (unsigned part = 0; part < UF; ++part) 3574 Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); 3575 return; 3576 } 3577 case LoopVectorizationLegality::IK_PtrInduction: 3578 // Handle the pointer induction variable case. 3579 assert(P->getType()->isPointerTy() && "Unexpected type."); 3580 // This is the normalized GEP that starts counting at zero. 3581 Value *NormalizedIdx = 3582 Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); 3583 NormalizedIdx = 3584 Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); 3585 // This is the vector of results. Notice that we don't generate 3586 // vector geps because scalar geps result in better code. 
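// Sketch for VF = 4, part 0 (names are illustrative; each GEP is produced
// by II.transform from the induction's start pointer):
//   %g.i = getelementptr <start>, (%normalized.idx + i)   ; "next.gep", i = 0..3
//   %vec = the four %g.i values gathered with insertelement  ; "insert.gep"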
3587 for (unsigned part = 0; part < UF; ++part) { 3588 if (VF == 1) { 3589 int EltIndex = part; 3590 Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); 3591 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); 3592 Value *SclrGep = II.transform(Builder, GlobalIdx); 3593 SclrGep->setName("next.gep"); 3594 Entry[part] = SclrGep; 3595 continue; 3596 } 3597 3598 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 3599 for (unsigned int i = 0; i < VF; ++i) { 3600 int EltIndex = i + part * VF; 3601 Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); 3602 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); 3603 Value *SclrGep = II.transform(Builder, GlobalIdx); 3604 SclrGep->setName("next.gep"); 3605 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 3606 Builder.getInt32(i), 3607 "insert.gep"); 3608 } 3609 Entry[part] = VecVal; 3610 } 3611 return; 3612 } 3613 } 3614 3615 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { 3616 // For each instruction in the old loop. 3617 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3618 VectorParts &Entry = WidenMap.get(it); 3619 switch (it->getOpcode()) { 3620 case Instruction::Br: 3621 // Nothing to do for PHIs and BR, since we already took care of the 3622 // loop control flow instructions. 3623 continue; 3624 case Instruction::PHI: { 3625 // Vectorize PHINodes. 3626 widenPHIInstruction(it, Entry, UF, VF, PV); 3627 continue; 3628 }// End of PHI. 3629 3630 case Instruction::Add: 3631 case Instruction::FAdd: 3632 case Instruction::Sub: 3633 case Instruction::FSub: 3634 case Instruction::Mul: 3635 case Instruction::FMul: 3636 case Instruction::UDiv: 3637 case Instruction::SDiv: 3638 case Instruction::FDiv: 3639 case Instruction::URem: 3640 case Instruction::SRem: 3641 case Instruction::FRem: 3642 case Instruction::Shl: 3643 case Instruction::LShr: 3644 case Instruction::AShr: 3645 case Instruction::And: 3646 case Instruction::Or: 3647 case Instruction::Xor: { 3648 // Just widen binops. 3649 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 3650 setDebugLocFromInst(Builder, BinOp); 3651 VectorParts &A = getVectorValue(it->getOperand(0)); 3652 VectorParts &B = getVectorValue(it->getOperand(1)); 3653 3654 // Use this vector value for all users of the original instruction. 3655 for (unsigned Part = 0; Part < UF; ++Part) { 3656 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 3657 3658 if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) 3659 VecOp->copyIRFlags(BinOp); 3660 3661 Entry[Part] = V; 3662 } 3663 3664 propagateMetadata(Entry, it); 3665 break; 3666 } 3667 case Instruction::Select: { 3668 // Widen selects. 3669 // If the selector is loop invariant we can create a select 3670 // instruction with a scalar condition. Otherwise, use vector-select. 3671 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), 3672 OrigLoop); 3673 setDebugLocFromInst(Builder, it); 3674 3675 // The condition can be loop invariant but still defined inside the 3676 // loop. This means that we can't just use the original 'cond' value. 3677 // We have to take the 'vectorized' value and pick the first lane. 3678 // Instcombine will make this a no-op. 3679 VectorParts &Cond = getVectorValue(it->getOperand(0)); 3680 VectorParts &Op0 = getVectorValue(it->getOperand(1)); 3681 VectorParts &Op1 = getVectorValue(it->getOperand(2)); 3682 3683 Value *ScalarCond = (VF == 1) ? 
Cond[0] : 3684 Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); 3685 3686 for (unsigned Part = 0; Part < UF; ++Part) { 3687 Entry[Part] = Builder.CreateSelect( 3688 InvariantCond ? ScalarCond : Cond[Part], 3689 Op0[Part], 3690 Op1[Part]); 3691 } 3692 3693 propagateMetadata(Entry, it); 3694 break; 3695 } 3696 3697 case Instruction::ICmp: 3698 case Instruction::FCmp: { 3699 // Widen compares. Generate vector compares. 3700 bool FCmp = (it->getOpcode() == Instruction::FCmp); 3701 CmpInst *Cmp = dyn_cast<CmpInst>(it); 3702 setDebugLocFromInst(Builder, it); 3703 VectorParts &A = getVectorValue(it->getOperand(0)); 3704 VectorParts &B = getVectorValue(it->getOperand(1)); 3705 for (unsigned Part = 0; Part < UF; ++Part) { 3706 Value *C = nullptr; 3707 if (FCmp) 3708 C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 3709 else 3710 C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 3711 Entry[Part] = C; 3712 } 3713 3714 propagateMetadata(Entry, it); 3715 break; 3716 } 3717 3718 case Instruction::Store: 3719 case Instruction::Load: 3720 vectorizeMemoryInstruction(it); 3721 break; 3722 case Instruction::ZExt: 3723 case Instruction::SExt: 3724 case Instruction::FPToUI: 3725 case Instruction::FPToSI: 3726 case Instruction::FPExt: 3727 case Instruction::PtrToInt: 3728 case Instruction::IntToPtr: 3729 case Instruction::SIToFP: 3730 case Instruction::UIToFP: 3731 case Instruction::Trunc: 3732 case Instruction::FPTrunc: 3733 case Instruction::BitCast: { 3734 CastInst *CI = dyn_cast<CastInst>(it); 3735 setDebugLocFromInst(Builder, it); 3736 /// Optimize the special case where the source is the induction 3737 /// variable. Notice that we can only optimize the 'trunc' case 3738 /// because: a. FP conversions lose precision, b. sext/zext may wrap, 3739 /// c. other casts depend on pointer size. 3740 if (CI->getOperand(0) == OldInduction && 3741 it->getOpcode() == Instruction::Trunc) { 3742 Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 3743 CI->getType()); 3744 Value *Broadcasted = getBroadcastInstrs(ScalarCast); 3745 LoopVectorizationLegality::InductionInfo II = 3746 Legal->getInductionVars()->lookup(OldInduction); 3747 Constant *Step = 3748 ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); 3749 for (unsigned Part = 0; Part < UF; ++Part) 3750 Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); 3751 propagateMetadata(Entry, it); 3752 break; 3753 } 3754 /// Vectorize casts. 3755 Type *DestTy = (VF == 1) ? CI->getType() : 3756 VectorType::get(CI->getType(), VF); 3757 3758 VectorParts &A = getVectorValue(it->getOperand(0)); 3759 for (unsigned Part = 0; Part < UF; ++Part) 3760 Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 3761 propagateMetadata(Entry, it); 3762 break; 3763 } 3764 3765 case Instruction::Call: { 3766 // Ignore dbg intrinsics. 
3767 if (isa<DbgInfoIntrinsic>(it)) 3768 break; 3769 setDebugLocFromInst(Builder, it); 3770 3771 Module *M = BB->getParent()->getParent(); 3772 CallInst *CI = cast<CallInst>(it); 3773 3774 StringRef FnName = CI->getCalledFunction()->getName(); 3775 Function *F = CI->getCalledFunction(); 3776 Type *RetTy = ToVectorTy(CI->getType(), VF); 3777 SmallVector<Type *, 4> Tys; 3778 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 3779 Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 3780 3781 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 3782 if (ID && 3783 (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 3784 ID == Intrinsic::lifetime_start)) { 3785 scalarizeInstruction(it); 3786 break; 3787 } 3788 // The flag shows whether we use Intrinsic or a usual Call for vectorized 3789 // version of the instruction. 3790 // Is it beneficial to perform intrinsic call compared to lib call? 3791 bool NeedToScalarize; 3792 unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); 3793 bool UseVectorIntrinsic = 3794 ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; 3795 if (!UseVectorIntrinsic && NeedToScalarize) { 3796 scalarizeInstruction(it); 3797 break; 3798 } 3799 3800 for (unsigned Part = 0; Part < UF; ++Part) { 3801 SmallVector<Value *, 4> Args; 3802 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 3803 Value *Arg = CI->getArgOperand(i); 3804 // Some intrinsics have a scalar argument - don't replace it with a 3805 // vector. 3806 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { 3807 VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); 3808 Arg = VectorArg[Part]; 3809 } 3810 Args.push_back(Arg); 3811 } 3812 3813 Function *VectorF; 3814 if (UseVectorIntrinsic) { 3815 // Use vector version of the intrinsic. 3816 Type *TysForDecl[] = {CI->getType()}; 3817 if (VF > 1) 3818 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 3819 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 3820 } else { 3821 // Use vector version of the library call. 3822 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 3823 assert(!VFnName.empty() && "Vector function name is empty."); 3824 VectorF = M->getFunction(VFnName); 3825 if (!VectorF) { 3826 // Generate a declaration 3827 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 3828 VectorF = 3829 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 3830 VectorF->copyAttributesFrom(F); 3831 } 3832 } 3833 assert(VectorF && "Can't create vector function."); 3834 Entry[Part] = Builder.CreateCall(VectorF, Args); 3835 } 3836 3837 propagateMetadata(Entry, it); 3838 break; 3839 } 3840 3841 default: 3842 // All other instructions are unsupported. Scalarize them. 3843 scalarizeInstruction(it); 3844 break; 3845 }// end of switch. 3846 }// end of for_each instr. 3847 } 3848 3849 void InnerLoopVectorizer::updateAnalysis() { 3850 // Forget the original basic block. 3851 SE->forgetLoop(OrigLoop); 3852 3853 // Update the dominator tree information. 3854 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 3855 "Entry does not dominate exit."); 3856 3857 for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) 3858 DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); 3859 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); 3860 3861 // Due to if predication of stores we might create a sequence of "if(pred) 3862 // a[i] = ...; " blocks. 
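// In that layout the blocks of the vector body alternate: even-indexed
// blocks are executed unconditionally and odd-indexed blocks hold the
// predicated stores, roughly (names illustrative):
//   body.0 -> pred.1 -> body.2 -> pred.3 -> ...
// body.2 is reached whether or not pred.1 executes, so its immediate
// dominator is body.0 (two blocks back), while each predicated block is
// immediately dominated by the block just before it. That is what the loop
// below encodes.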
3863 for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { 3864 if (i == 0) 3865 DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); 3866 else if (isPredicatedBlock(i)) { 3867 DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); 3868 } else { 3869 DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); 3870 } 3871 } 3872 3873 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); 3874 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 3875 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 3876 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 3877 3878 DEBUG(DT->verifyDomTree()); 3879 } 3880 3881 /// \brief Check whether it is safe to if-convert this phi node. 3882 /// 3883 /// Phi nodes with constant expressions that can trap are not safe to if 3884 /// convert. 3885 static bool canIfConvertPHINodes(BasicBlock *BB) { 3886 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 3887 PHINode *Phi = dyn_cast<PHINode>(I); 3888 if (!Phi) 3889 return true; 3890 for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) 3891 if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) 3892 if (C->canTrap()) 3893 return false; 3894 } 3895 return true; 3896 } 3897 3898 bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 3899 if (!EnableIfConversion) { 3900 emitAnalysis(VectorizationReport() << "if-conversion is disabled"); 3901 return false; 3902 } 3903 3904 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 3905 3906 // A list of pointers that we can safely read and write to. 3907 SmallPtrSet<Value *, 8> SafePointes; 3908 3909 // Collect safe addresses. 3910 for (Loop::block_iterator BI = TheLoop->block_begin(), 3911 BE = TheLoop->block_end(); BI != BE; ++BI) { 3912 BasicBlock *BB = *BI; 3913 3914 if (blockNeedsPredication(BB)) 3915 continue; 3916 3917 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 3918 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 3919 SafePointes.insert(LI->getPointerOperand()); 3920 else if (StoreInst *SI = dyn_cast<StoreInst>(I)) 3921 SafePointes.insert(SI->getPointerOperand()); 3922 } 3923 } 3924 3925 // Collect the blocks that need predication. 3926 BasicBlock *Header = TheLoop->getHeader(); 3927 for (Loop::block_iterator BI = TheLoop->block_begin(), 3928 BE = TheLoop->block_end(); BI != BE; ++BI) { 3929 BasicBlock *BB = *BI; 3930 3931 // We don't support switch statements inside loops. 3932 if (!isa<BranchInst>(BB->getTerminator())) { 3933 emitAnalysis(VectorizationReport(BB->getTerminator()) 3934 << "loop contains a switch statement"); 3935 return false; 3936 } 3937 3938 // We must be able to predicate all blocks that need to be predicated. 3939 if (blockNeedsPredication(BB)) { 3940 if (!blockCanBePredicated(BB, SafePointes)) { 3941 emitAnalysis(VectorizationReport(BB->getTerminator()) 3942 << "control flow cannot be substituted for a select"); 3943 return false; 3944 } 3945 } else if (BB != Header && !canIfConvertPHINodes(BB)) { 3946 emitAnalysis(VectorizationReport(BB->getTerminator()) 3947 << "control flow cannot be substituted for a select"); 3948 return false; 3949 } 3950 } 3951 3952 // We can if-convert this loop. 3953 return true; 3954 } 3955 3956 bool LoopVectorizationLegality::canVectorize() { 3957 // We must have a loop in canonical form. Loops with indirectbr in them cannot 3958 // be canonicalized. 
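// Concretely, the checks below accept only loops of (roughly) this shape,
// and the loop must also be innermost:
//
//      preheader
//          |
//        header  <--------+
//          |              |  single backedge
//         ...             |
//          |              |
//        latch  ----------+
//          |   (the latch is also the only exiting block)
//         exit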
3959 if (!TheLoop->getLoopPreheader()) { 3960 emitAnalysis( 3961 VectorizationReport() << 3962 "loop control flow is not understood by vectorizer"); 3963 return false; 3964 } 3965 3966 // We can only vectorize innermost loops. 3967 if (!TheLoop->empty()) { 3968 emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); 3969 return false; 3970 } 3971 3972 // We must have a single backedge. 3973 if (TheLoop->getNumBackEdges() != 1) { 3974 emitAnalysis( 3975 VectorizationReport() << 3976 "loop control flow is not understood by vectorizer"); 3977 return false; 3978 } 3979 3980 // We must have a single exiting block. 3981 if (!TheLoop->getExitingBlock()) { 3982 emitAnalysis( 3983 VectorizationReport() << 3984 "loop control flow is not understood by vectorizer"); 3985 return false; 3986 } 3987 3988 // We only handle bottom-tested loops, i.e. loop in which the condition is 3989 // checked at the end of each iteration. With that we can assume that all 3990 // instructions in the loop are executed the same number of times. 3991 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 3992 emitAnalysis( 3993 VectorizationReport() << 3994 "loop control flow is not understood by vectorizer"); 3995 return false; 3996 } 3997 3998 // We need to have a loop header. 3999 DEBUG(dbgs() << "LV: Found a loop: " << 4000 TheLoop->getHeader()->getName() << '\n'); 4001 4002 // Check if we can if-convert non-single-bb loops. 4003 unsigned NumBlocks = TheLoop->getNumBlocks(); 4004 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 4005 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 4006 return false; 4007 } 4008 4009 // ScalarEvolution needs to be able to find the exit count. 4010 const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); 4011 if (ExitCount == SE->getCouldNotCompute()) { 4012 emitAnalysis(VectorizationReport() << 4013 "could not determine number of loop iterations"); 4014 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 4015 return false; 4016 } 4017 4018 // Check if we can vectorize the instructions and CFG in this loop. 4019 if (!canVectorizeInstrs()) { 4020 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 4021 return false; 4022 } 4023 4024 // Go over each instruction and look at memory deps. 4025 if (!canVectorizeMemory()) { 4026 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 4027 return false; 4028 } 4029 4030 // Collect all of the variables that remain uniform after vectorization. 4031 collectLoopUniforms(); 4032 4033 DEBUG(dbgs() << "LV: We can vectorize this loop" 4034 << (LAI->getRuntimePointerChecking()->Need 4035 ? " (with a runtime bound check)" 4036 : "") 4037 << "!\n"); 4038 4039 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 4040 4041 // If an override option has been passed in for interleaved accesses, use it. 4042 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 4043 UseInterleaved = EnableInterleavedMemAccesses; 4044 4045 // Analyze interleaved memory accesses. 4046 if (UseInterleaved) 4047 InterleaveInfo.analyzeInterleaving(Strides); 4048 4049 // Okay! We can vectorize. At this point we don't have any other mem analysis 4050 // which may limit our maximum vectorization factor, so just return true with 4051 // no restrictions. 
4052 return true; 4053 } 4054 4055 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { 4056 if (Ty->isPointerTy()) 4057 return DL.getIntPtrType(Ty); 4058 4059 // It is possible that char's or short's overflow when we ask for the loop's 4060 // trip count, work around this by changing the type size. 4061 if (Ty->getScalarSizeInBits() < 32) 4062 return Type::getInt32Ty(Ty->getContext()); 4063 4064 return Ty; 4065 } 4066 4067 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { 4068 Ty0 = convertPointerToIntegerType(DL, Ty0); 4069 Ty1 = convertPointerToIntegerType(DL, Ty1); 4070 if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) 4071 return Ty0; 4072 return Ty1; 4073 } 4074 4075 /// \brief Check that the instruction has outside loop users and is not an 4076 /// identified reduction variable. 4077 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, 4078 SmallPtrSetImpl<Value *> &Reductions) { 4079 // Reduction instructions are allowed to have exit users. All other 4080 // instructions must not have external users. 4081 if (!Reductions.count(Inst)) 4082 //Check that all of the users of the loop are inside the BB. 4083 for (User *U : Inst->users()) { 4084 Instruction *UI = cast<Instruction>(U); 4085 // This user may be a reduction exit value. 4086 if (!TheLoop->contains(UI)) { 4087 DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); 4088 return true; 4089 } 4090 } 4091 return false; 4092 } 4093 4094 bool LoopVectorizationLegality::canVectorizeInstrs() { 4095 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 4096 BasicBlock *Header = TheLoop->getHeader(); 4097 4098 // Look for the attribute signaling the absence of NaNs. 4099 Function &F = *Header->getParent(); 4100 const DataLayout &DL = F.getParent()->getDataLayout(); 4101 if (F.hasFnAttribute("no-nans-fp-math")) 4102 HasFunNoNaNAttr = 4103 F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; 4104 4105 // For each block in the loop. 4106 for (Loop::block_iterator bb = TheLoop->block_begin(), 4107 be = TheLoop->block_end(); bb != be; ++bb) { 4108 4109 // Scan the instructions in the block and look for hazards. 4110 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 4111 ++it) { 4112 4113 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 4114 Type *PhiTy = Phi->getType(); 4115 // Check that this PHI type is allowed. 4116 if (!PhiTy->isIntegerTy() && 4117 !PhiTy->isFloatingPointTy() && 4118 !PhiTy->isPointerTy()) { 4119 emitAnalysis(VectorizationReport(it) 4120 << "loop control flow is not understood by vectorizer"); 4121 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 4122 return false; 4123 } 4124 4125 // If this PHINode is not in the header block, then we know that we 4126 // can convert it to select during if-conversion. No need to check if 4127 // the PHIs in this block are induction or reduction variables. 4128 if (*bb != Header) { 4129 // Check that this instruction has no outside users or is an 4130 // identified reduction value with an outside user. 4131 if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) 4132 continue; 4133 emitAnalysis(VectorizationReport(it) << 4134 "value could not be identified as " 4135 "an induction or reduction variable"); 4136 return false; 4137 } 4138 4139 // We only allow if-converted PHIs with exactly two incoming values. 
4140 if (Phi->getNumIncomingValues() != 2) { 4141 emitAnalysis(VectorizationReport(it) 4142 << "control flow not understood by vectorizer"); 4143 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 4144 return false; 4145 } 4146 4147 // This is the value coming from the preheader. 4148 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 4149 ConstantInt *StepValue = nullptr; 4150 // Check if this is an induction variable. 4151 InductionKind IK = isInductionVariable(Phi, StepValue); 4152 4153 if (IK_NoInduction != IK) { 4154 // Get the widest type. 4155 if (!WidestIndTy) 4156 WidestIndTy = convertPointerToIntegerType(DL, PhiTy); 4157 else 4158 WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); 4159 4160 // Int inductions are special because we only allow one IV. 4161 if (IK == IK_IntInduction && StepValue->isOne()) { 4162 // Use the phi node with the widest type as induction. Use the last 4163 // one if there are multiple (no good reason for doing this other 4164 // than it is expedient). 4165 if (!Induction || PhiTy == WidestIndTy) 4166 Induction = Phi; 4167 } 4168 4169 DEBUG(dbgs() << "LV: Found an induction variable.\n"); 4170 Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); 4171 4172 // Until we explicitly handle the case of an induction variable with 4173 // an outside loop user we have to give up vectorizing this loop. 4174 if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { 4175 emitAnalysis(VectorizationReport(it) << 4176 "use of induction value outside of the " 4177 "loop is not handled by vectorizer"); 4178 return false; 4179 } 4180 4181 continue; 4182 } 4183 4184 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, 4185 Reductions[Phi])) { 4186 if (Reductions[Phi].hasUnsafeAlgebra()) 4187 Requirements->addUnsafeAlgebraInst( 4188 Reductions[Phi].getUnsafeAlgebraInst()); 4189 AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); 4190 continue; 4191 } 4192 4193 emitAnalysis(VectorizationReport(it) << 4194 "value that could not be identified as " 4195 "reduction is used outside the loop"); 4196 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 4197 return false; 4198 }// end of PHI handling 4199 4200 // We handle calls that: 4201 // * Are debug info intrinsics. 4202 // * Have a mapping to an IR intrinsic. 4203 // * Have a vector version available. 4204 CallInst *CI = dyn_cast<CallInst>(it); 4205 if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && 4206 !(CI->getCalledFunction() && TLI && 4207 TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { 4208 emitAnalysis(VectorizationReport(it) << 4209 "call instruction cannot be vectorized"); 4210 DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); 4211 return false; 4212 } 4213 4214 // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the 4215 // second argument is the same (i.e. loop invariant) 4216 if (CI && 4217 hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { 4218 if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { 4219 emitAnalysis(VectorizationReport(it) 4220 << "intrinsic instruction cannot be vectorized"); 4221 DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); 4222 return false; 4223 } 4224 } 4225 4226 // Check that the instruction return type is vectorizable. 4227 // Also, we can't vectorize extractelement instructions. 
4228 if ((!VectorType::isValidElementType(it->getType()) && 4229 !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { 4230 emitAnalysis(VectorizationReport(it) 4231 << "instruction return type cannot be vectorized"); 4232 DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); 4233 return false; 4234 } 4235 4236 // Check that the stored type is vectorizable. 4237 if (StoreInst *ST = dyn_cast<StoreInst>(it)) { 4238 Type *T = ST->getValueOperand()->getType(); 4239 if (!VectorType::isValidElementType(T)) { 4240 emitAnalysis(VectorizationReport(ST) << 4241 "store instruction cannot be vectorized"); 4242 return false; 4243 } 4244 if (EnableMemAccessVersioning) 4245 collectStridedAccess(ST); 4246 } 4247 4248 if (EnableMemAccessVersioning) 4249 if (LoadInst *LI = dyn_cast<LoadInst>(it)) 4250 collectStridedAccess(LI); 4251 4252 // Reduction instructions are allowed to have exit users. 4253 // All other instructions must not have external users. 4254 if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { 4255 emitAnalysis(VectorizationReport(it) << 4256 "value cannot be used outside the loop"); 4257 return false; 4258 } 4259 4260 } // next instr. 4261 4262 } 4263 4264 if (!Induction) { 4265 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 4266 if (Inductions.empty()) { 4267 emitAnalysis(VectorizationReport() 4268 << "loop induction variable could not be identified"); 4269 return false; 4270 } 4271 } 4272 4273 return true; 4274 } 4275 4276 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { 4277 Value *Ptr = nullptr; 4278 if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) 4279 Ptr = LI->getPointerOperand(); 4280 else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) 4281 Ptr = SI->getPointerOperand(); 4282 else 4283 return; 4284 4285 Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); 4286 if (!Stride) 4287 return; 4288 4289 DEBUG(dbgs() << "LV: Found a strided access that we can version"); 4290 DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); 4291 Strides[Ptr] = Stride; 4292 StrideSet.insert(Stride); 4293 } 4294 4295 void LoopVectorizationLegality::collectLoopUniforms() { 4296 // We now know that the loop is vectorizable! 4297 // Collect variables that will remain uniform after vectorization. 4298 std::vector<Value*> Worklist; 4299 BasicBlock *Latch = TheLoop->getLoopLatch(); 4300 4301 // Start with the conditional branch and walk up the block. 4302 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 4303 4304 // Also add all consecutive pointer values; these values will be uniform 4305 // after vectorization (and subsequent cleanup) and, until revectorization is 4306 // supported, all dependencies must also be uniform. 4307 for (Loop::block_iterator B = TheLoop->block_begin(), 4308 BE = TheLoop->block_end(); B != BE; ++B) 4309 for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); 4310 I != IE; ++I) 4311 if (I->getType()->isPointerTy() && isConsecutivePtr(I)) 4312 Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4313 4314 while (!Worklist.empty()) { 4315 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 4316 Worklist.pop_back(); 4317 4318 // Look at instructions inside this loop. 4319 // Stop when reaching PHI nodes. 4320 // TODO: we need to follow values all over the loop, not only in this block. 4321 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 4322 continue; 4323 4324 // This is a known uniform. 4325 Uniforms.insert(I); 4326 4327 // Insert all operands. 
4328 Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4329 } 4330 } 4331 4332 bool LoopVectorizationLegality::canVectorizeMemory() { 4333 LAI = &LAA->getInfo(TheLoop, Strides); 4334 auto &OptionalReport = LAI->getReport(); 4335 if (OptionalReport) 4336 emitAnalysis(VectorizationReport(*OptionalReport)); 4337 if (!LAI->canVectorizeMemory()) 4338 return false; 4339 4340 if (LAI->hasStoreToLoopInvariantAddress()) { 4341 emitAnalysis( 4342 VectorizationReport() 4343 << "write to a loop invariant address could not be vectorized"); 4344 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 4345 return false; 4346 } 4347 4348 Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); 4349 4350 return true; 4351 } 4352 4353 LoopVectorizationLegality::InductionKind 4354 LoopVectorizationLegality::isInductionVariable(PHINode *Phi, 4355 ConstantInt *&StepValue) { 4356 if (!isInductionPHI(Phi, SE, StepValue)) 4357 return IK_NoInduction; 4358 4359 Type *PhiTy = Phi->getType(); 4360 // Found an Integer induction variable. 4361 if (PhiTy->isIntegerTy()) 4362 return IK_IntInduction; 4363 // Found an Pointer induction variable. 4364 return IK_PtrInduction; 4365 } 4366 4367 bool LoopVectorizationLegality::isInductionVariable(const Value *V) { 4368 Value *In0 = const_cast<Value*>(V); 4369 PHINode *PN = dyn_cast_or_null<PHINode>(In0); 4370 if (!PN) 4371 return false; 4372 4373 return Inductions.count(PN); 4374 } 4375 4376 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 4377 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); 4378 } 4379 4380 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, 4381 SmallPtrSetImpl<Value *> &SafePtrs) { 4382 4383 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 4384 // Check that we don't have a constant expression that can trap as operand. 4385 for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); 4386 OI != OE; ++OI) { 4387 if (Constant *C = dyn_cast<Constant>(*OI)) 4388 if (C->canTrap()) 4389 return false; 4390 } 4391 // We might be able to hoist the load. 4392 if (it->mayReadFromMemory()) { 4393 LoadInst *LI = dyn_cast<LoadInst>(it); 4394 if (!LI) 4395 return false; 4396 if (!SafePtrs.count(LI->getPointerOperand())) { 4397 if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { 4398 MaskedOp.insert(LI); 4399 continue; 4400 } 4401 return false; 4402 } 4403 } 4404 4405 // We don't predicate stores at the moment. 4406 if (it->mayWriteToMemory()) { 4407 StoreInst *SI = dyn_cast<StoreInst>(it); 4408 // We only support predication of stores in basic blocks with one 4409 // predecessor. 4410 if (!SI) 4411 return false; 4412 4413 bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); 4414 bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); 4415 4416 if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || 4417 !isSinglePredecessor) { 4418 // Build a masked store if it is legal for the target, otherwise scalarize 4419 // the block. 4420 bool isLegalMaskedOp = 4421 isLegalMaskedStore(SI->getValueOperand()->getType(), 4422 SI->getPointerOperand()); 4423 if (isLegalMaskedOp) { 4424 --NumPredStores; 4425 MaskedOp.insert(SI); 4426 continue; 4427 } 4428 return false; 4429 } 4430 } 4431 if (it->mayThrow()) 4432 return false; 4433 4434 // The instructions below can trap. 
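// (Integer division and remainder trap on a zero divisor, and sdiv/srem can
// also trap on overflow; after if-conversion they would execute even on
// iterations where the original condition is false, so they cannot be
// predicated.)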
4435 switch (it->getOpcode()) { 4436 default: continue; 4437 case Instruction::UDiv: 4438 case Instruction::SDiv: 4439 case Instruction::URem: 4440 case Instruction::SRem: 4441 return false; 4442 } 4443 } 4444 4445 return true; 4446 } 4447 4448 void InterleavedAccessInfo::collectConstStridedAccesses( 4449 MapVector<Instruction *, StrideDescriptor> &StrideAccesses, 4450 const ValueToValueMap &Strides) { 4451 // Holds load/store instructions in program order. 4452 SmallVector<Instruction *, 16> AccessList; 4453 4454 for (auto *BB : TheLoop->getBlocks()) { 4455 bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); 4456 4457 for (auto &I : *BB) { 4458 if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I)) 4459 continue; 4460 // FIXME: Currently we can't handle mixed accesses and predicated accesses 4461 if (IsPred) 4462 return; 4463 4464 AccessList.push_back(&I); 4465 } 4466 } 4467 4468 if (AccessList.empty()) 4469 return; 4470 4471 auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); 4472 for (auto I : AccessList) { 4473 LoadInst *LI = dyn_cast<LoadInst>(I); 4474 StoreInst *SI = dyn_cast<StoreInst>(I); 4475 4476 Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 4477 int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); 4478 4479 // The factor of the corresponding interleave group. 4480 unsigned Factor = std::abs(Stride); 4481 4482 // Ignore the access if the factor is too small or too large. 4483 if (Factor < 2 || Factor > MaxInterleaveGroupFactor) 4484 continue; 4485 4486 const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); 4487 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 4488 unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); 4489 4490 // An alignment of 0 means target ABI alignment. 4491 unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); 4492 if (!Align) 4493 Align = DL.getABITypeAlignment(PtrTy->getElementType()); 4494 4495 StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); 4496 } 4497 } 4498 4499 // Analyze interleaved accesses and collect them into interleave groups. 4500 // 4501 // Notice that the vectorization on interleaved groups will change instruction 4502 // orders and may break dependences. But the memory dependence check guarantees 4503 // that there is no overlap between two pointers of different strides, element 4504 // sizes or underlying bases. 4505 // 4506 // For pointers sharing the same stride, element size and underlying base, no 4507 // need to worry about Read-After-Write dependences and Write-After-Read 4508 // dependences. 4509 // 4510 // E.g. The RAW dependence: A[i] = a; 4511 // b = A[i]; 4512 // This won't exist as it is a store-load forwarding conflict, which has 4513 // already been checked and forbidden in the dependence check. 4514 // 4515 // E.g. The WAR dependence: a = A[i]; // (1) 4516 // A[i] = b; // (2) 4517 // The store group of (2) is always inserted at or below (2), and the load group 4518 // of (1) is always inserted at or above (1). The dependence is safe. 4519 void InterleavedAccessInfo::analyzeInterleaving( 4520 const ValueToValueMap &Strides) { 4521 DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); 4522 4523 // Holds all the stride accesses. 4524 MapVector<Instruction *, StrideDescriptor> StrideAccesses; 4525 collectConstStridedAccesses(StrideAccesses, Strides); 4526 4527 if (StrideAccesses.empty()) 4528 return; 4529 4530 // Holds all interleaved store groups temporarily. 
4531 SmallSetVector<InterleaveGroup *, 4> StoreGroups; 4532 4533 // Search the load-load/write-write pair B-A in bottom-up order and try to 4534 // insert B into the interleave group of A according to 3 rules: 4535 // 1. A and B have the same stride. 4536 // 2. A and B have the same memory object size. 4537 // 3. B belongs to the group according to the distance. 4538 // 4539 // The bottom-up order can avoid breaking the Write-After-Write dependences 4540 // between two pointers of the same base. 4541 // E.g. A[i] = a; (1) 4542 // A[i] = b; (2) 4543 // A[i+1] = c (3) 4544 // We form the group (2)+(3) in front, so (1) has to form groups with accesses 4545 // above (1), which guarantees that (1) is always above (2). 4546 for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E; 4547 ++I) { 4548 Instruction *A = I->first; 4549 StrideDescriptor DesA = I->second; 4550 4551 InterleaveGroup *Group = getInterleaveGroup(A); 4552 if (!Group) { 4553 DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); 4554 Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); 4555 } 4556 4557 if (A->mayWriteToMemory()) 4558 StoreGroups.insert(Group); 4559 4560 for (auto II = std::next(I); II != E; ++II) { 4561 Instruction *B = II->first; 4562 StrideDescriptor DesB = II->second; 4563 4564 // Ignore if B is already in a group or B is a different memory operation. 4565 if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory()) 4566 continue; 4567 4568 // Check the rule 1 and 2. 4569 if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) 4570 continue; 4571 4572 // Calculate the distance and prepare for the rule 3. 4573 const SCEVConstant *DistToA = 4574 dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); 4575 if (!DistToA) 4576 continue; 4577 4578 int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); 4579 4580 // Skip if the distance is not multiple of size as they are not in the 4581 // same group. 4582 if (DistanceToA % static_cast<int>(DesA.Size)) 4583 continue; 4584 4585 // The index of B is the index of A plus the related index to A. 4586 int IndexB = 4587 Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size); 4588 4589 // Try to insert B into the group. 4590 if (Group->insertMember(B, IndexB, DesB.Align)) { 4591 DEBUG(dbgs() << "LV: Inserted:" << *B << '\n' 4592 << " into the interleave group with" << *A << '\n'); 4593 InterleaveGroupMap[B] = Group; 4594 4595 // Set the first load in program order as the insert position. 4596 if (B->mayReadFromMemory()) 4597 Group->setInsertPos(B); 4598 } 4599 } // Iteration on instruction B 4600 } // Iteration on instruction A 4601 4602 // Remove interleaved store groups with gaps. 4603 for (InterleaveGroup *Group : StoreGroups) 4604 if (Group->getNumMembers() != Group->getFactor()) 4605 releaseGroup(Group); 4606 } 4607 4608 LoopVectorizationCostModel::VectorizationFactor 4609 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { 4610 // Width 1 means no vectorize 4611 VectorizationFactor Factor = { 1U, 0U }; 4612 if (OptForSize && Legal->getRuntimePointerChecking()->Need) { 4613 emitAnalysis(VectorizationReport() << 4614 "runtime pointer checks needed. Enable vectorization of this " 4615 "loop with '#pragma clang loop vectorize(enable)' when " 4616 "compiling with -Os/-Oz"); 4617 DEBUG(dbgs() << 4618 "LV: Aborting. 
Runtime ptr check is required with -Os/-Oz.\n"); 4619 return Factor; 4620 } 4621 4622 if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { 4623 emitAnalysis(VectorizationReport() << 4624 "store that is conditionally executed prevents vectorization"); 4625 DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); 4626 return Factor; 4627 } 4628 4629 // Find the trip count. 4630 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 4631 DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4632 4633 unsigned WidestType = getWidestType(); 4634 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4635 unsigned MaxSafeDepDist = -1U; 4636 if (Legal->getMaxSafeDepDistBytes() != -1U) 4637 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 4638 WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 4639 WidestRegister : MaxSafeDepDist); 4640 unsigned MaxVectorSize = WidestRegister / WidestType; 4641 DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); 4642 DEBUG(dbgs() << "LV: The Widest register is: " 4643 << WidestRegister << " bits.\n"); 4644 4645 if (MaxVectorSize == 0) { 4646 DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4647 MaxVectorSize = 1; 4648 } 4649 4650 assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" 4651 " into one vector!"); 4652 4653 unsigned VF = MaxVectorSize; 4654 4655 // If we optimize the program for size, avoid creating the tail loop. 4656 if (OptForSize) { 4657 // If we are unable to calculate the trip count then don't try to vectorize. 4658 if (TC < 2) { 4659 emitAnalysis 4660 (VectorizationReport() << 4661 "unable to calculate the loop count due to complex control flow"); 4662 DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); 4663 return Factor; 4664 } 4665 4666 // Find the maximum SIMD width that can fit within the trip count. 4667 VF = TC % MaxVectorSize; 4668 4669 if (VF == 0) 4670 VF = MaxVectorSize; 4671 else { 4672 // If the trip count that we found modulo the vectorization factor is not 4673 // zero then we require a tail. 4674 emitAnalysis(VectorizationReport() << 4675 "cannot optimize for size and vectorize at the " 4676 "same time. Enable vectorization of this loop " 4677 "with '#pragma clang loop vectorize(enable)' " 4678 "when compiling with -Os/-Oz"); 4679 DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); 4680 return Factor; 4681 } 4682 } 4683 4684 int UserVF = Hints->getWidth(); 4685 if (UserVF != 0) { 4686 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 4687 DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 4688 4689 Factor.Width = UserVF; 4690 return Factor; 4691 } 4692 4693 float Cost = expectedCost(1); 4694 #ifndef NDEBUG 4695 const float ScalarCost = Cost; 4696 #endif /* NDEBUG */ 4697 unsigned Width = 1; 4698 DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 4699 4700 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 4701 // Ignore scalar width, because the user explicitly wants vectorization. 4702 if (ForceVectorization && VF > 1) { 4703 Width = 2; 4704 Cost = expectedCost(Width) / (float)Width; 4705 } 4706 4707 for (unsigned i=2; i <= VF; i*=2) { 4708 // Notice that the vector loop needs to be executed less times, so 4709 // we need to divide the cost of the vector loops by the width of 4710 // the vector elements. 
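    // Illustrative numbers only: if the scalar body costs 8 and the 4-wide body
    // costs 20, the normalized cost is 20 / 4 = 5, which beats 8, so width 4
    // would be preferred over the scalar loop (unless another width is cheaper
    // still).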
4711 float VectorCost = expectedCost(i) / (float)i; 4712 DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << 4713 (int)VectorCost << ".\n"); 4714 if (VectorCost < Cost) { 4715 Cost = VectorCost; 4716 Width = i; 4717 } 4718 } 4719 4720 DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 4721 << "LV: Vectorization seems to be not beneficial, " 4722 << "but was forced by a user.\n"); 4723 DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); 4724 Factor.Width = Width; 4725 Factor.Cost = Width * Cost; 4726 return Factor; 4727 } 4728 4729 unsigned LoopVectorizationCostModel::getWidestType() { 4730 unsigned MaxWidth = 8; 4731 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 4732 4733 // For each block. 4734 for (Loop::block_iterator bb = TheLoop->block_begin(), 4735 be = TheLoop->block_end(); bb != be; ++bb) { 4736 BasicBlock *BB = *bb; 4737 4738 // For each instruction in the loop. 4739 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 4740 Type *T = it->getType(); 4741 4742 // Ignore ephemeral values. 4743 if (EphValues.count(it)) 4744 continue; 4745 4746 // Only examine Loads, Stores and PHINodes. 4747 if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) 4748 continue; 4749 4750 // Examine PHI nodes that are reduction variables. 4751 if (PHINode *PN = dyn_cast<PHINode>(it)) 4752 if (!Legal->getReductionVars()->count(PN)) 4753 continue; 4754 4755 // Examine the stored values. 4756 if (StoreInst *ST = dyn_cast<StoreInst>(it)) 4757 T = ST->getValueOperand()->getType(); 4758 4759 // Ignore loaded pointer types and stored pointer types that are not 4760 // consecutive. However, we do want to take consecutive stores/loads of 4761 // pointer vectors into account. 4762 if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) 4763 continue; 4764 4765 MaxWidth = std::max(MaxWidth, 4766 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 4767 } 4768 } 4769 4770 return MaxWidth; 4771 } 4772 4773 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, 4774 unsigned VF, 4775 unsigned LoopCost) { 4776 4777 // -- The interleave heuristics -- 4778 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4779 // There are many micro-architectural considerations that we can't predict 4780 // at this level. For example, frontend pressure (on decode or fetch) due to 4781 // code size, or the number and capabilities of the execution ports. 4782 // 4783 // We use the following heuristics to select the interleave count: 4784 // 1. If the code has reductions, then we interleave to break the cross 4785 // iteration dependency. 4786 // 2. If the loop is really small, then we interleave to reduce the loop 4787 // overhead. 4788 // 3. We don't interleave if we think that we will spill registers to memory 4789 // due to the increased register pressure. 4790 4791 // When we optimize for size, we don't interleave. 4792 if (OptForSize) 4793 return 1; 4794 4795 // We used the distance for the interleave count. 4796 if (Legal->getMaxSafeDepDistBytes() != -1U) 4797 return 1; 4798 4799 // Do not interleave loops with a relatively small trip count. 
4800 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 4801 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 4802 return 1; 4803 4804 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 4805 DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << 4806 " registers\n"); 4807 4808 if (VF == 1) { 4809 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 4810 TargetNumRegisters = ForceTargetNumScalarRegs; 4811 } else { 4812 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 4813 TargetNumRegisters = ForceTargetNumVectorRegs; 4814 } 4815 4816 LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); 4817 // We divide by these constants so assume that we have at least one 4818 // instruction that uses at least one register. 4819 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 4820 R.NumInstructions = std::max(R.NumInstructions, 1U); 4821 4822 // We calculate the interleave count using the following formula. 4823 // Subtract the number of loop invariants from the number of available 4824 // registers. These registers are used by all of the interleaved instances. 4825 // Next, divide the remaining registers by the number of registers that is 4826 // required by the loop, in order to estimate how many parallel instances 4827 // fit without causing spills. All of this is rounded down if necessary to be 4828 // a power of two. We want power of two interleave count to simplify any 4829 // addressing operations or alignment considerations. 4830 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 4831 R.MaxLocalUsers); 4832 4833 // Don't count the induction variable as interleaved. 4834 if (EnableIndVarRegisterHeur) 4835 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 4836 std::max(1U, (R.MaxLocalUsers - 1))); 4837 4838 // Clamp the interleave ranges to reasonable counts. 4839 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 4840 4841 // Check if the user has overridden the max. 4842 if (VF == 1) { 4843 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 4844 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 4845 } else { 4846 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 4847 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 4848 } 4849 4850 // If we did not calculate the cost for VF (because the user selected the VF) 4851 // then we calculate the cost of VF here. 4852 if (LoopCost == 0) 4853 LoopCost = expectedCost(VF); 4854 4855 // Clamp the calculated IC to be between the 1 and the max interleave count 4856 // that the target allows. 4857 if (IC > MaxInterleaveCount) 4858 IC = MaxInterleaveCount; 4859 else if (IC < 1) 4860 IC = 1; 4861 4862 // Interleave if we vectorized this loop and there is a reduction that could 4863 // benefit from interleaving. 4864 if (VF > 1 && Legal->getReductionVars()->size()) { 4865 DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 4866 return IC; 4867 } 4868 4869 // Note that if we've already vectorized the loop we will have done the 4870 // runtime check and so interleaving won't require further checks. 4871 bool InterleavingRequiresRuntimePointerCheck = 4872 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 4873 4874 // We want to interleave small loops in order to reduce the loop overhead and 4875 // potentially expose ILP opportunities. 
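  // A worked example (numbers are illustrative; SmallLoopCost is a cl::opt,
  // assumed here to be 20): a loop whose estimated body cost is 6, and which
  // takes the small-loop path below, would get
  // min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2), i.e. at most two interleaved
  // copies -- enough to hide the loop overhead without bloating the code.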
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2, so
    // the critical path only gets increased by one reduction operation.
    if (Legal->getReductionVars()->size() &&
        TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = (Legal->getReductionVars()->size() > 0);
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it in
  // a set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
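  // A small sketch of the idea (hypothetical IR, not taken from a real loop):
  //   %a = load ...        ; interval ends at %d
  //   %b = load ...        ; interval ends at %d
  //   %c = add %a, %b
  //   %d = sub %a, %b
  //   %e = mul %c, %d
  // When the scan reaches %c, the intervals for %a and %b are both still open
  // (each has a later use in %d), so two values are live there and
  // MaxLocalUsers would be at least 2. Pointers computed before the loop would
  // be counted in LoopInvariantRegs instead.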
4945 LoopBlocksDFS DFS(TheLoop); 4946 DFS.perform(LI); 4947 4948 RegisterUsage R; 4949 R.NumInstructions = 0; 4950 4951 // Each 'key' in the map opens a new interval. The values 4952 // of the map are the index of the 'last seen' usage of the 4953 // instruction that is the key. 4954 typedef DenseMap<Instruction*, unsigned> IntervalMap; 4955 // Maps instruction to its index. 4956 DenseMap<unsigned, Instruction*> IdxToInstr; 4957 // Marks the end of each interval. 4958 IntervalMap EndPoint; 4959 // Saves the list of instruction indices that are used in the loop. 4960 SmallSet<Instruction*, 8> Ends; 4961 // Saves the list of values that are used in the loop but are 4962 // defined outside the loop, such as arguments and constants. 4963 SmallPtrSet<Value*, 8> LoopInvariants; 4964 4965 unsigned Index = 0; 4966 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 4967 be = DFS.endRPO(); bb != be; ++bb) { 4968 R.NumInstructions += (*bb)->size(); 4969 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 4970 ++it) { 4971 Instruction *I = it; 4972 IdxToInstr[Index++] = I; 4973 4974 // Save the end location of each USE. 4975 for (unsigned i = 0; i < I->getNumOperands(); ++i) { 4976 Value *U = I->getOperand(i); 4977 Instruction *Instr = dyn_cast<Instruction>(U); 4978 4979 // Ignore non-instruction values such as arguments, constants, etc. 4980 if (!Instr) continue; 4981 4982 // If this instruction is outside the loop then record it and continue. 4983 if (!TheLoop->contains(Instr)) { 4984 LoopInvariants.insert(Instr); 4985 continue; 4986 } 4987 4988 // Overwrite previous end points. 4989 EndPoint[Instr] = Index; 4990 Ends.insert(Instr); 4991 } 4992 } 4993 } 4994 4995 // Saves the list of intervals that end with the index in 'key'. 4996 typedef SmallVector<Instruction*, 2> InstrList; 4997 DenseMap<unsigned, InstrList> TransposeEnds; 4998 4999 // Transpose the EndPoints to a list of values that end at each index. 5000 for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); 5001 it != e; ++it) 5002 TransposeEnds[it->second].push_back(it->first); 5003 5004 SmallSet<Instruction*, 8> OpenIntervals; 5005 unsigned MaxUsage = 0; 5006 5007 5008 DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5009 for (unsigned int i = 0; i < Index; ++i) { 5010 Instruction *I = IdxToInstr[i]; 5011 // Ignore instructions that are never used within the loop. 5012 if (!Ends.count(I)) continue; 5013 5014 // Ignore ephemeral values. 5015 if (EphValues.count(I)) 5016 continue; 5017 5018 // Remove all of the instructions that end at this location. 5019 InstrList &List = TransposeEnds[i]; 5020 for (unsigned int j=0, e = List.size(); j < e; ++j) 5021 OpenIntervals.erase(List[j]); 5022 5023 // Count the number of live interals. 5024 MaxUsage = std::max(MaxUsage, OpenIntervals.size()); 5025 5026 DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << 5027 OpenIntervals.size() << '\n'); 5028 5029 // Add the current instruction to the list of open intervals. 5030 OpenIntervals.insert(I); 5031 } 5032 5033 unsigned Invariant = LoopInvariants.size(); 5034 DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); 5035 DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); 5036 DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); 5037 5038 R.LoopInvariantRegs = Invariant; 5039 R.MaxLocalUsers = MaxUsage; 5040 return R; 5041 } 5042 5043 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 5044 unsigned Cost = 0; 5045 5046 // For each block. 
5047 for (Loop::block_iterator bb = TheLoop->block_begin(), 5048 be = TheLoop->block_end(); bb != be; ++bb) { 5049 unsigned BlockCost = 0; 5050 BasicBlock *BB = *bb; 5051 5052 // For each instruction in the old loop. 5053 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 5054 // Skip dbg intrinsics. 5055 if (isa<DbgInfoIntrinsic>(it)) 5056 continue; 5057 5058 // Ignore ephemeral values. 5059 if (EphValues.count(it)) 5060 continue; 5061 5062 unsigned C = getInstructionCost(it, VF); 5063 5064 // Check if we should override the cost. 5065 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5066 C = ForceTargetInstructionCost; 5067 5068 BlockCost += C; 5069 DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << 5070 VF << " For instruction: " << *it << '\n'); 5071 } 5072 5073 // We assume that if-converted blocks have a 50% chance of being executed. 5074 // When the code is scalar then some of the blocks are avoided due to CF. 5075 // When the code is vectorized we execute all code paths. 5076 if (VF == 1 && Legal->blockNeedsPredication(*bb)) 5077 BlockCost /= 2; 5078 5079 Cost += BlockCost; 5080 } 5081 5082 return Cost; 5083 } 5084 5085 /// \brief Check whether the address computation for a non-consecutive memory 5086 /// access looks like an unlikely candidate for being merged into the indexing 5087 /// mode. 5088 /// 5089 /// We look for a GEP which has one index that is an induction variable and all 5090 /// other indices are loop invariant. If the stride of this access is also 5091 /// within a small bound we decide that this address computation can likely be 5092 /// merged into the addressing mode. 5093 /// In all other cases, we identify the address computation as complex. 5094 static bool isLikelyComplexAddressComputation(Value *Ptr, 5095 LoopVectorizationLegality *Legal, 5096 ScalarEvolution *SE, 5097 const Loop *TheLoop) { 5098 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5099 if (!Gep) 5100 return true; 5101 5102 // We are looking for a gep with all loop invariant indices except for one 5103 // which should be an induction variable. 5104 unsigned NumOperands = Gep->getNumOperands(); 5105 for (unsigned i = 1; i < NumOperands; ++i) { 5106 Value *Opd = Gep->getOperand(i); 5107 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5108 !Legal->isInductionVariable(Opd)) 5109 return true; 5110 } 5111 5112 // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step 5113 // can likely be merged into the address computation. 5114 unsigned MaxMergeDistance = 64; 5115 5116 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); 5117 if (!AddRec) 5118 return true; 5119 5120 // Check the step is constant. 5121 const SCEV *Step = AddRec->getStepRecurrence(*SE); 5122 // Calculate the pointer stride and check if it is consecutive. 5123 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 5124 if (!C) 5125 return true; 5126 5127 const APInt &APStepVal = C->getValue()->getValue(); 5128 5129 // Huge step value - give up. 
5130 if (APStepVal.getBitWidth() > 64) 5131 return true; 5132 5133 int64_t StepVal = APStepVal.getSExtValue(); 5134 5135 return StepVal > MaxMergeDistance; 5136 } 5137 5138 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5139 if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) 5140 return true; 5141 return false; 5142 } 5143 5144 unsigned 5145 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5146 // If we know that this instruction will remain uniform, check the cost of 5147 // the scalar version. 5148 if (Legal->isUniformAfterVectorization(I)) 5149 VF = 1; 5150 5151 Type *RetTy = I->getType(); 5152 Type *VectorTy = ToVectorTy(RetTy, VF); 5153 5154 // TODO: We need to estimate the cost of intrinsic calls. 5155 switch (I->getOpcode()) { 5156 case Instruction::GetElementPtr: 5157 // We mark this instruction as zero-cost because the cost of GEPs in 5158 // vectorized code depends on whether the corresponding memory instruction 5159 // is scalarized or not. Therefore, we handle GEPs with the memory 5160 // instruction cost. 5161 return 0; 5162 case Instruction::Br: { 5163 return TTI.getCFInstrCost(I->getOpcode()); 5164 } 5165 case Instruction::PHI: 5166 //TODO: IF-converted IFs become selects. 5167 return 0; 5168 case Instruction::Add: 5169 case Instruction::FAdd: 5170 case Instruction::Sub: 5171 case Instruction::FSub: 5172 case Instruction::Mul: 5173 case Instruction::FMul: 5174 case Instruction::UDiv: 5175 case Instruction::SDiv: 5176 case Instruction::FDiv: 5177 case Instruction::URem: 5178 case Instruction::SRem: 5179 case Instruction::FRem: 5180 case Instruction::Shl: 5181 case Instruction::LShr: 5182 case Instruction::AShr: 5183 case Instruction::And: 5184 case Instruction::Or: 5185 case Instruction::Xor: { 5186 // Since we will replace the stride by 1 the multiplication should go away. 5187 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 5188 return 0; 5189 // Certain instructions can be cheaper to vectorize if they have a constant 5190 // second vector operand. One example of this are shifts on x86. 5191 TargetTransformInfo::OperandValueKind Op1VK = 5192 TargetTransformInfo::OK_AnyValue; 5193 TargetTransformInfo::OperandValueKind Op2VK = 5194 TargetTransformInfo::OK_AnyValue; 5195 TargetTransformInfo::OperandValueProperties Op1VP = 5196 TargetTransformInfo::OP_None; 5197 TargetTransformInfo::OperandValueProperties Op2VP = 5198 TargetTransformInfo::OP_None; 5199 Value *Op2 = I->getOperand(1); 5200 5201 // Check for a splat of a constant or for a non uniform vector of constants. 
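    // For example (illustrative IR): a shift whose second operand is the splat
    // constant <4 x i32> <i32 8, i32 8, i32 8, i32 8> is reported as a uniform
    // power-of-two constant, which lets targets such as x86 pick a cheaper
    // vector shift.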
    if (isa<ConstantInt>(Op2)) {
      ConstantInt *CInt = cast<ConstantInt>(Op2);
      if (CInt && CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
      if (SplatValue) {
        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
        if (CInt && CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      }
    }

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
                                      Op1VP, Op2VP);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    StoreInst *SI = dyn_cast<StoreInst>(I);
    LoadInst *LI = dyn_cast<LoadInst>(I);
    Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    unsigned AS = SI ? SI->getPointerAddressSpace() :
                       LI->getPointerAddressSpace();
    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
    // We add the cost of address computation here instead of with the gep
    // instruction because only here do we know whether the operation is
    // scalarized.
    if (VF == 1)
      return TTI.getAddressComputationCost(VectorTy) +
             TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    // For an interleaved access, calculate the total cost of the whole
    // interleave group.
    if (Legal->isAccessInterleaved(I)) {
      auto Group = Legal->getInterleavedAccessGroup(I);
      assert(Group && "Failed to get an interleaved access group.");

      // Only calculate the cost once at the insert position.
      if (Group->getInsertPos() != I)
        return 0;

      unsigned InterleaveFactor = Group->getFactor();
      Type *WideVecTy =
          VectorType::get(VectorTy->getVectorElementType(),
                          VectorTy->getVectorNumElements() * InterleaveFactor);

      // Holds the indices of existing members in an interleaved load group.
      // An interleaved store group doesn't need this as it doesn't allow gaps.
      SmallVector<unsigned, 4> Indices;
      if (LI) {
        for (unsigned i = 0; i < InterleaveFactor; i++)
          if (Group->getMember(i))
            Indices.push_back(i);
      }

      // Calculate the cost of the whole interleaved group.
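      // Sketch of what this covers (illustrative types): a factor-2 group of
      // <4 x i32> loads is costed as one wide <8 x i32> load plus the shuffles
      // needed to de-interleave it; the target hook below returns that
      // combined cost.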
5281 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5282 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5283 Group->getAlignment(), AS); 5284 5285 if (Group->isReverse()) 5286 Cost += 5287 Group->getNumMembers() * 5288 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5289 5290 // FIXME: The interleaved load group with a huge gap could be even more 5291 // expensive than scalar operations. Then we could ignore such group and 5292 // use scalar operations instead. 5293 return Cost; 5294 } 5295 5296 // Scalarized loads/stores. 5297 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5298 bool Reverse = ConsecutiveStride < 0; 5299 const DataLayout &DL = I->getModule()->getDataLayout(); 5300 unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); 5301 unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; 5302 if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { 5303 bool IsComplexComputation = 5304 isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); 5305 unsigned Cost = 0; 5306 // The cost of extracting from the value vector and pointer vector. 5307 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5308 for (unsigned i = 0; i < VF; ++i) { 5309 // The cost of extracting the pointer operand. 5310 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); 5311 // In case of STORE, the cost of ExtractElement from the vector. 5312 // In case of LOAD, the cost of InsertElement into the returned 5313 // vector. 5314 Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : 5315 Instruction::InsertElement, 5316 VectorTy, i); 5317 } 5318 5319 // The cost of the scalar loads/stores. 5320 Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); 5321 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5322 Alignment, AS); 5323 return Cost; 5324 } 5325 5326 // Wide load/stores. 5327 unsigned Cost = TTI.getAddressComputationCost(VectorTy); 5328 if (Legal->isMaskRequired(I)) 5329 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, 5330 AS); 5331 else 5332 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5333 5334 if (Reverse) 5335 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 5336 VectorTy, 0); 5337 return Cost; 5338 } 5339 case Instruction::ZExt: 5340 case Instruction::SExt: 5341 case Instruction::FPToUI: 5342 case Instruction::FPToSI: 5343 case Instruction::FPExt: 5344 case Instruction::PtrToInt: 5345 case Instruction::IntToPtr: 5346 case Instruction::SIToFP: 5347 case Instruction::UIToFP: 5348 case Instruction::Trunc: 5349 case Instruction::FPTrunc: 5350 case Instruction::BitCast: { 5351 // We optimize the truncation of induction variable. 5352 // The cost of these is the same as the scalar operation. 5353 if (I->getOpcode() == Instruction::Trunc && 5354 Legal->isInductionVariable(I->getOperand(0))) 5355 return TTI.getCastInstrCost(I->getOpcode(), I->getType(), 5356 I->getOperand(0)->getType()); 5357 5358 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 5359 return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 5360 } 5361 case Instruction::Call: { 5362 bool NeedToScalarize; 5363 CallInst *CI = cast<CallInst>(I); 5364 unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); 5365 if (getIntrinsicIDForCall(CI, TLI)) 5366 return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); 5367 return CallCost; 5368 } 5369 default: { 5370 // We are scalarizing the instruction. 
Return the cost of the scalar 5371 // instruction, plus the cost of insert and extract into vector 5372 // elements, times the vector width. 5373 unsigned Cost = 0; 5374 5375 if (!RetTy->isVoidTy() && VF != 1) { 5376 unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, 5377 VectorTy); 5378 unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, 5379 VectorTy); 5380 5381 // The cost of inserting the results plus extracting each one of the 5382 // operands. 5383 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 5384 } 5385 5386 // The cost of executing VF copies of the scalar instruction. This opcode 5387 // is unknown. Assume that it is the same as 'mul'. 5388 Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 5389 return Cost; 5390 } 5391 }// end of switch. 5392 } 5393 5394 char LoopVectorize::ID = 0; 5395 static const char lv_name[] = "Loop Vectorization"; 5396 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 5397 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 5398 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 5399 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 5400 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 5401 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5402 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 5403 INITIALIZE_PASS_DEPENDENCY(LCSSA) 5404 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 5405 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 5406 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 5407 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 5408 5409 namespace llvm { 5410 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { 5411 return new LoopVectorize(NoUnrolling, AlwaysVectorize); 5412 } 5413 } 5414 5415 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 5416 // Check for a store. 5417 if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) 5418 return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; 5419 5420 // Check for a load. 5421 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 5422 return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; 5423 5424 return false; 5425 } 5426 5427 5428 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, 5429 bool IfPredicateStore) { 5430 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 5431 // Holds vector parameters or scalars, in case of uniform vals. 5432 SmallVector<VectorParts, 4> Params; 5433 5434 setDebugLocFromInst(Builder, Instr); 5435 5436 // Find all of the vectorized parameters. 5437 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5438 Value *SrcOp = Instr->getOperand(op); 5439 5440 // If we are accessing the old induction variable, use the new one. 5441 if (SrcOp == OldInduction) { 5442 Params.push_back(getVectorValue(SrcOp)); 5443 continue; 5444 } 5445 5446 // Try using previously calculated values. 5447 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 5448 5449 // If the src is an instruction that appeared earlier in the basic block 5450 // then it should already be vectorized. 5451 if (SrcInst && OrigLoop->contains(SrcInst)) { 5452 assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 5453 // The parameter is a vector value from earlier. 5454 Params.push_back(WidenMap.get(SrcInst)); 5455 } else { 5456 // The parameter is a scalar from outside the loop. Maybe even a constant. 
5457 VectorParts Scalars; 5458 Scalars.append(UF, SrcOp); 5459 Params.push_back(Scalars); 5460 } 5461 } 5462 5463 assert(Params.size() == Instr->getNumOperands() && 5464 "Invalid number of operands"); 5465 5466 // Does this instruction return a value ? 5467 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 5468 5469 Value *UndefVec = IsVoidRetTy ? nullptr : 5470 UndefValue::get(Instr->getType()); 5471 // Create a new entry in the WidenMap and initialize it to Undef or Null. 5472 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 5473 5474 Instruction *InsertPt = Builder.GetInsertPoint(); 5475 BasicBlock *IfBlock = Builder.GetInsertBlock(); 5476 BasicBlock *CondBlock = nullptr; 5477 5478 VectorParts Cond; 5479 Loop *VectorLp = nullptr; 5480 if (IfPredicateStore) { 5481 assert(Instr->getParent()->getSinglePredecessor() && 5482 "Only support single predecessor blocks"); 5483 Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), 5484 Instr->getParent()); 5485 VectorLp = LI->getLoopFor(IfBlock); 5486 assert(VectorLp && "Must have a loop for this block"); 5487 } 5488 5489 // For each vector unroll 'part': 5490 for (unsigned Part = 0; Part < UF; ++Part) { 5491 // For each scalar that we create: 5492 5493 // Start an "if (pred) a[i] = ..." block. 5494 Value *Cmp = nullptr; 5495 if (IfPredicateStore) { 5496 if (Cond[Part]->getType()->isVectorTy()) 5497 Cond[Part] = 5498 Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); 5499 Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], 5500 ConstantInt::get(Cond[Part]->getType(), 1)); 5501 CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); 5502 LoopVectorBody.push_back(CondBlock); 5503 VectorLp->addBasicBlockToLoop(CondBlock, *LI); 5504 // Update Builder with newly created basic block. 5505 Builder.SetInsertPoint(InsertPt); 5506 } 5507 5508 Instruction *Cloned = Instr->clone(); 5509 if (!IsVoidRetTy) 5510 Cloned->setName(Instr->getName() + ".cloned"); 5511 // Replace the operands of the cloned instructions with extracted scalars. 5512 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5513 Value *Op = Params[op][Part]; 5514 Cloned->setOperand(op, Op); 5515 } 5516 5517 // Place the cloned scalar in the new loop. 5518 Builder.Insert(Cloned); 5519 5520 // If the original scalar returns a value we need to place it in a vector 5521 // so that future users will be able to use it. 5522 if (!IsVoidRetTy) 5523 VecResults[Part] = Cloned; 5524 5525 // End if-block. 5526 if (IfPredicateStore) { 5527 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); 5528 LoopVectorBody.push_back(NewIfBlock); 5529 VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); 5530 Builder.SetInsertPoint(InsertPt); 5531 ReplaceInstWithInst(IfBlock->getTerminator(), 5532 BranchInst::Create(CondBlock, NewIfBlock, Cmp)); 5533 IfBlock = NewIfBlock; 5534 } 5535 } 5536 } 5537 5538 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { 5539 StoreInst *SI = dyn_cast<StoreInst>(Instr); 5540 bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); 5541 5542 return scalarizeInstruction(Instr, IfPredicateStore); 5543 } 5544 5545 Value *InnerLoopUnroller::reverseVector(Value *Vec) { 5546 return Vec; 5547 } 5548 5549 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { 5550 return V; 5551 } 5552 5553 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { 5554 // When unrolling and the VF is 1, we only need to add a simple scalar. 
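  // For example (illustrative values, assuming the caller passes the unroll
  // part number as StartIdx when VF is 1): with StartIdx == 1 and Step == 2,
  // this returns Val + 1 * 2, i.e. the scalar induction value advanced by one
  // iteration for the second unrolled copy.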
  Type *ITy = Val->getType();
  assert(!ITy->isVectorTy() && "Val must be a scalar");
  Constant *C = ConstantInt::get(ITy, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}