1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11 // and generates target-independent LLVM-IR. 12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 13 // of instructions in order to estimate the profitability of vectorization. 14 // 15 // The loop vectorizer combines consecutive loop iterations into a single 16 // 'wide' iteration. After this transformation the index is incremented 17 // by the SIMD vector width, and not by one. 18 // 19 // This pass has three parts: 20 // 1. The main loop pass that drives the different parts. 21 // 2. LoopVectorizationLegality - A unit that checks for the legality 22 // of the vectorization. 23 // 3. InnerLoopVectorizer - A unit that performs the actual 24 // widening of instructions. 25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 26 // of vectorization. It decides on the optimal vector width, which 27 // can be one, if vectorization is not profitable. 28 // 29 //===----------------------------------------------------------------------===// 30 // 31 // The reduction-variable vectorization is based on the paper: 32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33 // 34 // Variable uniformity checks are inspired by: 35 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 36 // 37 // The interleaved access vectorization is based on the paper: 38 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 39 // Data for SIMD 40 // 41 // Other ideas/concepts are from: 42 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 43 // 44 // S. Maleki, Y. 
Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 45 // Vectorizing Compilers. 46 // 47 //===----------------------------------------------------------------------===// 48 49 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 50 #include "llvm/ADT/DenseMap.h" 51 #include "llvm/ADT/Hashing.h" 52 #include "llvm/ADT/MapVector.h" 53 #include "llvm/ADT/SetVector.h" 54 #include "llvm/ADT/SmallPtrSet.h" 55 #include "llvm/ADT/SmallSet.h" 56 #include "llvm/ADT/SmallVector.h" 57 #include "llvm/ADT/Statistic.h" 58 #include "llvm/ADT/StringExtras.h" 59 #include "llvm/Analysis/CodeMetrics.h" 60 #include "llvm/Analysis/GlobalsModRef.h" 61 #include "llvm/Analysis/LoopInfo.h" 62 #include "llvm/Analysis/LoopIterator.h" 63 #include "llvm/Analysis/LoopPass.h" 64 #include "llvm/Analysis/ScalarEvolutionExpander.h" 65 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 66 #include "llvm/Analysis/ValueTracking.h" 67 #include "llvm/Analysis/VectorUtils.h" 68 #include "llvm/IR/Constants.h" 69 #include "llvm/IR/DataLayout.h" 70 #include "llvm/IR/DebugInfo.h" 71 #include "llvm/IR/DerivedTypes.h" 72 #include "llvm/IR/DiagnosticInfo.h" 73 #include "llvm/IR/Dominators.h" 74 #include "llvm/IR/Function.h" 75 #include "llvm/IR/IRBuilder.h" 76 #include "llvm/IR/Instructions.h" 77 #include "llvm/IR/IntrinsicInst.h" 78 #include "llvm/IR/LLVMContext.h" 79 #include "llvm/IR/Module.h" 80 #include "llvm/IR/PatternMatch.h" 81 #include "llvm/IR/Type.h" 82 #include "llvm/IR/Value.h" 83 #include "llvm/IR/ValueHandle.h" 84 #include "llvm/IR/Verifier.h" 85 #include "llvm/Pass.h" 86 #include "llvm/Support/BranchProbability.h" 87 #include "llvm/Support/CommandLine.h" 88 #include "llvm/Support/Debug.h" 89 #include "llvm/Support/raw_ostream.h" 90 #include "llvm/Transforms/Scalar.h" 91 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 92 #include "llvm/Transforms/Utils/Local.h" 93 #include "llvm/Transforms/Utils/LoopUtils.h" 94 #include "llvm/Transforms/Utils/LoopVersioning.h" 95 #include 
"llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
#include <tuple>

using namespace llvm;
using namespace llvm::PatternMatch;

// Shared short name: used as the pass argument and as the debug type.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Don't vectorize loops with a constant "
             "trip count that is smaller than this "
             "value."));

/// When true, the vectorization factor is chosen to saturate the widest
/// register for the smallest type in the loop rather than the widest type.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

// The following force-target-* flags override properties normally queried
// from TargetTransformInfo; a value of 0 means "use the target's answer".
// They exist mostly to get target-independent test coverage.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Loops whose estimated cost is below this are interleaved more aggressively.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

// Note: with an explicit vectorize(enable) pragma the user has asked for
// vectorization, so a higher SCEV-check budget is tolerated than the default.
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

namespace {

// Forward declarations.
218 class LoopVectorizeHints; 219 class LoopVectorizationLegality; 220 class LoopVectorizationCostModel; 221 class LoopVectorizationRequirements; 222 223 /// \brief This modifies LoopAccessReport to initialize message with 224 /// loop-vectorizer-specific part. 225 class VectorizationReport : public LoopAccessReport { 226 public: 227 VectorizationReport(Instruction *I = nullptr) 228 : LoopAccessReport("loop not vectorized: ", I) {} 229 230 /// \brief This allows promotion of the loop-access analysis report into the 231 /// loop-vectorizer report. It modifies the message to add the 232 /// loop-vectorizer-specific part of the message. 233 explicit VectorizationReport(const LoopAccessReport &R) 234 : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), 235 R.getInstr()) {} 236 }; 237 238 /// A helper function for converting Scalar types to vector types. 239 /// If the incoming type is void, we return void. If the VF is 1, we return 240 /// the scalar type. 241 static Type *ToVectorTy(Type *Scalar, unsigned VF) { 242 if (Scalar->isVoidTy() || VF == 1) 243 return Scalar; 244 return VectorType::get(Scalar, VF); 245 } 246 247 /// A helper function that returns GEP instruction and knows to skip a 248 /// 'bitcast'. The 'bitcast' may be skipped if the source and the destination 249 /// pointee types of the 'bitcast' have the same size. 
250 /// For example: 251 /// bitcast double** %var to i64* - can be skipped 252 /// bitcast double** %var to i8* - can not 253 static GetElementPtrInst *getGEPInstruction(Value *Ptr) { 254 255 if (isa<GetElementPtrInst>(Ptr)) 256 return cast<GetElementPtrInst>(Ptr); 257 258 if (isa<BitCastInst>(Ptr) && 259 isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { 260 Type *BitcastTy = Ptr->getType(); 261 Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); 262 if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) 263 return nullptr; 264 Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); 265 Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); 266 const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); 267 if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) 268 return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); 269 } 270 return nullptr; 271 } 272 273 /// A helper function that returns the pointer operand of a load or store 274 /// instruction. 275 static Value *getPointerOperand(Value *I) { 276 if (auto *LI = dyn_cast<LoadInst>(I)) 277 return LI->getPointerOperand(); 278 if (auto *SI = dyn_cast<StoreInst>(I)) 279 return SI->getPointerOperand(); 280 return nullptr; 281 } 282 283 /// InnerLoopVectorizer vectorizes loops which contain only one basic 284 /// block to a specified vectorization factor (VF). 285 /// This class performs the widening of scalars into vectors, or multiple 286 /// scalars. This class also implements the following features: 287 /// * It inserts an epilogue loop for handling loops that don't have iteration 288 /// counts that are known to be a multiple of the vectorization factor. 289 /// * It handles the code generation for reduction variables. 290 /// * Scalarization (implementation using scalars) of un-vectorizable 291 /// instructions. 
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Induction(nullptr),
        OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr),
        VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {}
  // NOTE(review): the AA member below is not initialized by this constructor
  // — confirm it is assigned elsewhere before use.

  // Perform the actual loop widening (vectorization).
  // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
  // can be validly truncated to. The cost model has assumed this truncation
  // will happen when vectorizing.
  // (NOTE(review): an earlier version of this comment mentioned a
  // VecValuesToIgnore parameter that no longer exists here.)
  void vectorize(LoopVectorizationLegality *L,
                 const MapVector<Instruction *, uint64_t> &MinimumBitWidths) {
    MinBWs = &MinimumBitWidths;
    Legal = L;
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop();
  }

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode *, 4> PhiVector;

  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value *, 2> VectorParts;

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR.
  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
      EdgeMaskCache;

  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// \brief The Loop exit block may have single value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// Predicate conditional stores on their respective conditions.
  void predicateStores();

  /// Shrinks vector element sizes based on information in "MinBWs".
  void truncateToMinimalBitwidths();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);

  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF,
                           unsigned VF, PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
  /// scalarized instruction behind an if block predicated on the control
  /// dependence of the instruction.
  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateStore = false);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
  /// can be a truncate instruction).
  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);

  /// Create a vector induction phi node based on an existing scalar one. This
  /// currently only works for integer induction variables with a constant
  /// step. If \p TruncType is non-null, instead of widening the original IV,
  /// we widen a version of the IV truncated to \p TruncType.
  void createVectorIntInductionPHI(const InductionDescriptor &II,
                                   VectorParts &Entry, IntegerType *TruncType);

  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
  /// induction variable will first be truncated to the corresponding type. The
  /// widened values are placed in \p Entry.
  void widenIntInduction(PHINode *IV, VectorParts &Entry,
                         TruncInst *Trunc = nullptr);

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Try to vectorize the interleaved access group that \p Instr belongs to.
  void vectorizeInterleaveGroup(Instruction *Instr);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Emit a bypass check to see if the trip count would overflow, or we
  /// wouldn't have enough iterations to execute one vector loop.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if the vector trip count is nonzero.
  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// \brief Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorPart type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidates its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  /// (NOTE(review): see the constructor — this member has no initializer.)
  AliasAnalysis *AA;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
  /// Assumption Cache.
  AssumptionCache *AC;
  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;

  /// A map of induction variables from the original loop to their
  /// corresponding VF * UF scalarized values in the vectorized loop. The
  /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
  /// maps original loop values to their vector versions in the new loop,
  /// ScalarIVMap maps induction variables from the original loop that are not
  /// vectorized to their scalar equivalents in the vector loop. Maintaining a
  /// separate map for scalarized induction variables allows us to avoid
  /// unnecessary scalar-to-vector-to-scalar conversions.
  DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;

  /// Store instructions that should be predicated, as a pair
  /// <StoreInst, Predicate>.
  SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
  /// Cache of edge masks created during if-conversion (see createEdgeMask).
  EdgeMaskCache MaskCache;
  /// Trip count of the original loop.
  Value *TripCount;
  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount;

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  const MapVector<Instruction *, uint64_t> *MinBWs;

  /// The legality analysis passed to vectorize(); not owned.
  LoopVectorizationLegality *Legal;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks;
};

/// A specialization of InnerLoopVectorizer with VF fixed to 1: it only
/// unrolls (interleaves) the scalar loop by UnrollFactor.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor) {}

private:
  void scalarizeInstruction(Instruction *Instr,
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// \brief Look for a meaningful debug location on the instruction or its
/// operands; falls back to returning \p I itself.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// \brief Set the debug location in the builder using the debug location in the
/// instruction; clears the builder's location for non-instruction values.
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    B.SetCurrentDebugLocation(Inst->getDebugLoc());
  else
    B.SetCurrentDebugLocation(DebugLoc());
}

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
/// \brief Return a string describing the debug location of the loop \p L:
/// the loop's start location if one is attached, otherwise the identifier of
/// the enclosing module.
/// NOTE(review): the #endif below closes a preprocessor conditional opened
/// before this chunk (presumably #ifndef NDEBUG, since this helper is only
/// used for debug output) -- confirm against the full file.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

/// \brief Add the no-alias metadata produced by loop versioning (if any) to
/// \p To, mirroring the metadata of the original memory access \p Orig.
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

/// \brief Propagate the metadata of \p From to \p To, including any
/// versioning-related no-alias metadata (see addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

/// \brief Apply addMetadata to every instruction in \p To. Values that are
/// not instructions are skipped.
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than interleaved factor, which is equal to the absolute
/// value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          a = A[i];                           // Member of index 0
///          b = A[i+1];                         // Member of index 1
///          d = A[i+3];                         // Member of index 3
///          ...
///        }
///
///      An interleaved store group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          ...
///          A[i]   = a;                         // Member of index 0
///          A[i+1] = b;                         // Member of index 1
///          A[i+2] = c;                         // Member of index 2
///          A[i+3] = d;                         // Member of index 3
///        }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
class InterleaveGroup {
public:
  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
    assert(Align && "The alignment should be non-zero");

    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    // A negative stride means the members are accessed in reverse order.
    Reverse = Stride < 0;
    // The initial instruction becomes the leader, stored under key 0.
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  unsigned getFactor() const { return Factor; }
  unsigned getAlignment() const { return Align; }
  unsigned getNumMembers() const { return Members.size(); }

  /// \brief Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is related to the leader and it could be
  /// negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
    assert(NewAlign && "The new member's alignment should be non-zero");

    // Keys are kept relative to the current smallest key, so the requested
    // index is translated into the map's key space first.
    int Key = Index + SmallestKey;

    // Skip if there is already a member with the same index.
    if (Members.count(Key))
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {
      // Taking this new smallest key must keep the whole key range
      // (LargestKey - SmallestKey) within the interleave factor.
      if (LargestKey - Key >= static_cast<int>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Align = std::min(Align, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// \brief Get the member with the given index \p Index
  ///
  /// \returns nullptr if contains no such member.
  Instruction *getMember(unsigned Index) const {
    int Key = SmallestKey + Index;
    if (!Members.count(Key))
      return nullptr;

    return Members.find(Key)->second;
  }

  /// \brief Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  unsigned getIndex(Instruction *Instr) const {
    for (auto I : Members)
      if (I.second == Instr)
        return I.first - SmallestKey;

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  Instruction *getInsertPos() const { return InsertPos; }
  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }

private:
  unsigned Factor; // Interleave Factor.
  bool Reverse;    // True if the stride of the group is negative.
  unsigned Align;  // Minimum alignment across all members.
  // Maps the (possibly negative) internal key to the member instruction; see
  // insertMember for how keys relate to member indices.
  DenseMap<int, Instruction *> Members;
  int SmallestKey;
  int LargestKey;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32             // Insert Position
  //      %add = add i32 %even        // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32              // Def of %odd
  //      store i32 %odd              // Insert Position
  Instruction *InsertPos;
};

/// \brief Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                        DominatorTree *DT, LoopInfo *LI)
      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
        RequiresScalarEpilogue(false) {}

  ~InterleavedAccessInfo() {
    // Several members of a group map to the same InterleaveGroup pointer, so
    // collect the unique pointers before deleting.
    SmallSet<InterleaveGroup *, 4> DelSet;
    // Avoid releasing a pointer twice.
    for (auto &I : InterleaveGroupMap)
      DelSet.insert(I.second);
    for (auto *Ptr : DelSet)
      delete Ptr;
  }

  /// \brief Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  void analyzeInterleaving(const ValueToValueMap &Strides);

  /// \brief Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.count(Instr);
  }

  /// \brief Return the maximum interleave factor of all interleaved groups.
  unsigned getMaxInterleaveFactor() const {
    unsigned MaxFactor = 1;
    for (auto &Entry : InterleaveGroupMap)
      MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
    return MaxFactor;
  }

  /// \brief Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if doesn't have such group.
  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
    if (InterleaveGroupMap.count(Instr))
      return InterleaveGroupMap.find(Instr)->second;
    return nullptr;
  }

  /// \brief Returns true if an interleaved group that may access memory
  /// out-of-bounds requires a scalar epilogue iteration for correctness.
  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  /// \brief Initialize the LoopAccessInfo used for dependence checking.
  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  /// The interleaved access analysis can also add new predicates (for example
  /// by versioning strides of pointers).
  PredicatedScalarEvolution &PSE;
  /// The loop under analysis.
  Loop *TheLoop;
  /// Dominator tree, used for predication queries (see isPredicated).
  DominatorTree *DT;
  /// Loop info for the enclosing function.
  LoopInfo *LI;
  /// Dependence information; null until setLAI is called.
  const LoopAccessInfo *LAI;

  /// True if the loop may contain non-reversed interleaved groups with
  /// out-of-bounds accesses. We ensure we don't speculatively access memory
  /// out-of-bounds by executing at least one scalar epilogue iteration.
  bool RequiresScalarEpilogue;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;

  /// Holds dependences among the memory accesses in the loop. It maps a source
  /// access to a set of dependent sink accesses.
  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

  /// \brief The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                     unsigned Align)
        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

    StrideDescriptor() = default;

    // The access's stride. It is negative for a reverse access.
    int64_t Stride = 0;
    const SCEV *Scev = nullptr; // The scalar expression of this access
    uint64_t Size = 0;          // The size of the memory object.
    unsigned Align = 0;         // The alignment of this access.
  };

  /// \brief A type for holding instructions and their stride descriptors.
  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;

  /// \brief Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Align.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
                                         unsigned Align) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
    return InterleaveGroupMap[Instr];
  }

  /// \brief Release the group and remove all the relationships.
  void releaseGroup(InterleaveGroup *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    delete Group;
  }

  /// \brief Collect all the accesses with a constant stride in program order.
  void collectConstStrideAccesses(
      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
      const ValueToValueMap &Strides);

  /// \brief Returns true if \p Stride is allowed in an interleaved group.
  static bool isStrided(int Stride) {
    unsigned Factor = std::abs(Stride);
    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
  }

  /// \brief Returns true if \p BB is a predicated block.
  bool isPredicated(BasicBlock *BB) const {
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  }

  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();
  }

  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
  /// necessary, when constructing interleaved groups.
  ///
  /// \p A must precede \p B in program order. We return false if reordering is
  /// not necessary or is prevented because \p A and \p B may be dependent.
  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {

    // Code motion for interleaved accesses can potentially hoist strided loads
    // and sink strided stores. The code below checks the legality of the
    // following two conditions:
    //
    // 1. Potentially moving a strided load (B) before any store (A) that
    //    precedes B, or
    //
    // 2. Potentially moving a strided store (A) after any load or store (B)
    //    that A precedes.
    //
    // It's legal to reorder A and B if we know there isn't a dependence from A
    // to B. Note that this determination is conservative since some
    // dependences could potentially be reordered safely.

    // A is potentially the source of a dependence.
    auto *Src = A->first;
    auto SrcDes = A->second;

    // B is potentially the sink of a dependence.
    auto *Sink = B->first;
    auto SinkDes = B->second;

    // Code motion for interleaved accesses can't violate WAR dependences.
    // Thus, reordering is legal if the source isn't a write.
    if (!Src->mayWriteToMemory())
      return true;

    // At least one of the accesses must be strided.
    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
      return true;

    // If dependence information is not available from LoopAccessInfo,
    // conservatively assume the instructions can't be reordered.
    if (!areDependencesValid())
      return false;

    // If we know there is a dependence from source to sink, assume the
    // instructions can't be reordered. Otherwise, reordering is legal.
    return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
  }

  /// \brief Collect the dependences from LoopAccessInfo.
  ///
  /// We process the dependences once during the interleaved access analysis to
  /// enable constant-time dependence queries.
  void collectDependences() {
    if (!areDependencesValid())
      return;
    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  }
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
class LoopVectorizeHints {
  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char *Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char *Name, unsigned Value, HintKind Kind)
        : Name(Name), Value(Value), Kind(Kind) {}

    /// Returns true if \p Val is a legal value for this kind of hint:
    /// widths/interleave counts must be powers of two within the target
    /// limits; force is a tri-state stored as 0/1.
    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        return (Val <= 1);
      }
      return false;
    }
  };

  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

  /// True if there is any unsafe math in the loop.
  bool PotentiallyUnsafe;

public:
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };

  /// Construct the hints for loop \p L, seeding them from the loop's
  /// metadata and from command-line overrides.
  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
                     OptimizationRemarkEmitter &ORE)
      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
              HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizerParams::isInterleaveForced())
      Interleave.Value = VectorizerParams::VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }

  /// Returns true if the hints permit vectorizing \p L, emitting an analysis
  /// remark when they do not.
  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
    if (getForce() == LoopVectorizeHints::FK_Disabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
      ORE.emitOptimizationRemarkAnalysis(vectorizeAnalysisPassName(), L,
                                         emitRemark());
      return false;
    }

    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
      ORE.emitOptimizationRemarkAnalysis(vectorizeAnalysisPassName(), L,
                                         emitRemark());
      return false;
    }

    if (getWidth() == 1 && getInterleave() == 1) {
      // FIXME: Add a separate metadata to indicate when the loop has already
      // been vectorized instead of setting width and count to 1.
      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
      // FIXME: Add interleave.disable metadata. This will allow
      // vectorize.disable to be used without disabling the pass and errors
      // to differentiate between disabled vectorization and a width of 1.
      ORE.emitOptimizationRemarkAnalysis(
          vectorizeAnalysisPassName(), L,
          "loop not vectorized: vectorization and interleaving are explicitly "
          "disabled, or vectorize width and interleave count are both set to "
          "1");
      return false;
    }

    return true;
  }

  /// Dumps all the hint information.
  std::string emitRemark() const {
    VectorizationReport R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }

  /// \brief If hints are provided that force vectorization, use the AlwaysPrint
  /// pass name to force the frontend to print the diagnostic.
  const char *vectorizeAnalysisPassName() const {
    if (getWidth() == 1)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Disabled)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
      return LV_NAME;
    return DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint;
  }

  bool allowReordering() const {
    // When enabling loop hints are provided we allow the vectorizer to change
    // the order of operations that is given by the scalar loop. This is not
    // enabled by default because can be unsafe or inefficient. For example,
    // reordering floating-point operations will change the way round-off
    // error accumulates in the loop.
    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
  }

  bool isPotentiallyUnsafe() const {
    // Avoid FP vectorization if the target is unsure about proper support.
    // This may be related to the SIMD unit in the target not handling
    // IEEE 754 FP ops properly, or bad single-to-double promotions.
    // Otherwise, a sequence of vectorized loops, even without reduction,
    // could lead to different end results on the destination vectors.
    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
  }

  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }

private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either a MDString or a MDNode with the first
      // operand a MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        // NOTE(review): MD is non-null inside this if; the !MD test below is
        // redundant (kept as-is since this is a documentation-only change).
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Only single-operand hints are consumed; setHint performs the
      // "llvm.loop." prefix check and the value validation.
      StringRef Name = S->getString();
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks string hint with one operand and set value if valid.
  void setHint(StringRef Name, Metadata *Arg) {
    if (!Name.startswith(Prefix()))
      return;
    Name = Name.substr(Prefix().size(), StringRef::npos);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
    if (!C)
      return;
    unsigned Val = C->getZExtValue();

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))
          H->Value = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
        break;
      }
    }
  }

  /// Create a new hint from name / value pair.
  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Metadata *MDs[] = {MDString::get(Context, Name),
                       ConstantAsMetadata::get(
                           ConstantInt::get(Type::getInt32Ty(Context), V))};
    return MDNode::get(Context, MDs);
  }

  /// Matches metadata with hint name.
  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
    MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
    if (!Name)
      return false;

    for (auto H : HintTypes)
      if (Name->getString().endswith(H.Name))
        return true;
    return false;
  }

  /// Sets current hints into loop metadata, keeping other values intact.
  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
    if (HintTypes.size() == 0)
      return;

    // Reserve the first element to LoopID (see below).
    SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, then ignore the existing operands.
    MDNode *LoopID = TheLoop->getLoopID();
    if (LoopID) {
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
        // If node in update list, ignore old value.
        if (!matchesHintMetadataName(Node, HintTypes))
          MDs.push_back(Node);
      }
    }

    // Now, add the missing hints.
    for (auto H : HintTypes)
      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

    // Replace current metadata node with new one.
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    TheLoop->setLoopID(NewLoopID);
  }

  /// The loop these hints belong to.
  const Loop *TheLoop;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter &ORE;
};

/// \brief Emit \p Message as an analysis remark for \p TheLoop, using the
/// pass name chosen by the hints (AlwaysPrint when vectorization is forced).
static void emitAnalysisDiag(const Loop *TheLoop,
                             const LoopVectorizeHints &Hints,
                             OptimizationRemarkEmitter &ORE,
                             const LoopAccessReport &Message) {
  const char *Name = Hints.vectorizeAnalysisPassName();
  LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
}

/// \brief Report a missed-optimization remark for \p L; additionally emit a
/// warning when the user explicitly forced vectorization or interleaving.
static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH,
                              OptimizationRemarkEmitter *ORE) {
  ORE->emitOptimizationRemarkMissed(LV_NAME, L, LH.emitRemark());

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
      emitLoopVectorizeWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
      emitLoopInterleaveWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop interleaving");
  }
}

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality.
/// This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
/// checks for a number of different conditions, such as the availability of a
/// single induction variable, that all types are supported and vectorize-able,
/// etc. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(
      Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
      const TargetTransformInfo *TTI,
      std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
      OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
      LoopVectorizeHints *H)
      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
        GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
        Requirements(R), Hints(H) {}

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode *, InductionDescriptor> InductionList;

  /// RecurrenceSet contains the phi nodes that are recurrences other than
  /// inductions and reductions.
  typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Return the first-order recurrences found in the loop.
  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Returns True if PN is a reduction variable in this loop.
  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }

  /// Returns True if Phi is a first-order recurrence in this loop.
  bool isFirstOrderRecurrence(const PHINode *Phi);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }

  /// Returns the information that we collected about runtime memory check.
  const RuntimePointerChecking *getRuntimePointerChecking() const {
    return LAI->getRuntimePointerChecking();
  }

  /// Returns the loop-accesses info for this loop; null until
  /// canVectorizeMemory sets it up (see the LAI member comment below).
  const LoopAccessInfo *getLAI() const { return LAI; }

  /// \brief Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// \brief Return the maximum interleave factor of all interleaved groups.
  unsigned getMaxInterleaveFactor() const {
    return InterleaveInfo.getMaxInterleaveFactor();
  }

  /// \brief Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// \brief Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps.
  bool requiresScalarEpilogue() const {
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns the maximum safe dependence distance in bytes, as computed by
  /// the loop-accesses analysis.
  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  /// Returns true if \p V is a symbolic stride versioned by the
  /// loop-accesses analysis.
  bool hasStride(Value *V) { return LAI->hasStride(V); }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
  }
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
  }
  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI->isLegalMaskedScatter(DataType);
  }
  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI->isLegalMaskedGather(DataType);
  }
  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    auto *LI = dyn_cast<LoadInst>(V);
    auto *SI = dyn_cast<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ptr = getPointerOperand(V);
    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if vector representation of the instruction \p I
  /// requires mask.
  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
  unsigned getNumStores() const { return LAI->getNumStores(); }
  unsigned getNumLoads() const { return LAI->getNumLoads(); }
  unsigned getNumPredStores() const { return NumPredStores; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constrains.
  /// Returns true if the loop is vectorizable
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms();

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Updates the vectorization state by adding \p Phi to the inductions list.
  /// This can set \p Phi as the main induction of the loop if \p Phi is a
  /// better choice for the main induction than the existing one.
  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
                       SmallPtrSetImpl<Value *> &AllowedExit);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) const {
    emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
  }

  /// \brief If an access has a symbolic strides, this maps the pointer value to
  /// the stride symbol.
  const ValueToValueMap *getSymbolicStrides() {
    // FIXME: Currently, the set of symbolic strides is sometimes queried before
    // it's collected. This happens from canVectorizeWithIfConvert, when the
    // pointer is checked to reference consecutive elements suitable for a
    // masked access.
    return LAI ? &LAI->getSymbolicStrides() : nullptr;
  }

  /// Number of predicated stores, exposed via getNumPredStores().
  /// NOTE(review): initialized to 0 here and updated outside this chunk --
  /// confirm where it is incremented.
  unsigned NumPredStores;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions in the context
  /// of existing SCEV assumptions. The analysis will also add a minimal set
  /// of new predicates if this is required to enable vectorization and
  /// unrolling.
  PredicatedScalarEvolution &PSE;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Target Transform Info
  const TargetTransformInfo *TTI;
  /// Dominator Tree.
  DominatorTree *DT;
  // LoopAccess analysis.
  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
  // And the loop-accesses info corresponding to this loop. This pointer is
  // null until canVectorizeMemory sets it up.
  const LoopAccessInfo *LAI;
  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo InterleaveInfo;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the phi nodes that are first-order recurrences.
  RecurrenceSet FirstOrderRecurrences;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the induction and reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value *, 4> AllowedExit;

  /// Holds the instructions known to be uniform after vectorization.
  SmallPtrSet<Instruction *, 4> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  SmallPtrSet<Instruction *, 4> Scalars;

  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  /// Vectorization requirements that will go through late-evaluation.
  LoopVectorizationRequirements *Requirements;

  /// Used to emit an analysis of any legality issues.
  LoopVectorizeHints *Hints;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic
  SmallPtrSet<const Instruction *, 8> MaskedOp;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons.
/// In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints)
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                 unsigned LoopCost);

  /// \return The most profitable unroll factor.
  /// This method finds the best unroll-factor based on register pressure and
  /// other parameters. VF and LoopCost are the selected vectorization factor
  /// and the cost of the selected VF.
  unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

private:
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  typedef std::pair<unsigned, bool> VectorizationCostTy;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Returns whether the instruction is a load or store and will be a emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) const {
    emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
  }

public:
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Demanded bits analysis.
  DemandedBits *DB;
  /// Assumption cache.
  AssumptionCache *AC;
  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;
  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;
  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
/// requirements can be verified by looking for metadata or compiler options.
/// For example, some loops require FP commutativity which is only allowed if
/// vectorization is explicitly specified or if the fast-math compiler option
/// has been provided.
/// Late evaluation of these requirements allows helpful diagnostics to be
/// composed that tell the user what needs to be done to vectorize the loop.
/// For example, by specifying #pragma clang loop vectorize or -ffast-math.
/// Late evaluation should be used only when diagnostics can be generated that
/// can be followed by a non-expert user.
class LoopVectorizationRequirements {
public:
  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
      : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}

  void addUnsafeAlgebraInst(Instruction *I) {
    // First unsafe algebra instruction.
    if (!UnsafeAlgebraInst)
      UnsafeAlgebraInst = I;
  }

  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }

  /// \return true (and emit diagnostics) if any late-evaluated requirement
  /// is violated, i.e. vectorization must be abandoned.
  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
    const char *Name = Hints.vectorizeAnalysisPassName();
    bool Failed = false;
    if (UnsafeAlgebraInst && !Hints.allowReordering()) {
      ORE.emitOptimizationRemarkAnalysisFPCommute(
          Name, UnsafeAlgebraInst->getDebugLoc(),
          UnsafeAlgebraInst->getParent(),
          VectorizationReport() << "cannot prove it is safe to reorder "
                                   "floating-point operations");
      Failed = true;
    }

    // Test if runtime memcheck thresholds are exceeded.
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE.emitOptimizationRemarkAnalysisAliasing(
          Name, L,
          VectorizationReport()
              << "cannot prove it is safe to reorder memory operations");
      DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Failed = true;
    }

    return Failed;
  }

private:
  unsigned NumRuntimePointerChecks;
  Instruction *UnsafeAlgebraInst;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter &ORE;
};

// Collect all innermost loops under \p L (including \p L itself when it has
// no subloops) into \p V, depth-first.
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (L.empty())
    return V.push_back(&L);

  for (Loop *InnerL : L)
    addInnerLoop(*InnerL, V);
}

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
      : FunctionPass(ID) {
    Impl.DisableUnrolling = NoUnrolling;
    Impl.AlwaysVectorize = AlwaysVectorize;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  LoopVectorizePass Impl;

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TLI is optional: only present if the TargetLibraryInfo pass was run.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    // Lazily compute loop-access info per loop (see GetLAA in the legality
    // class) so the analysis only runs for candidate loops.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

// Create a vector PHI for induction \p II whose lanes hold consecutive IV
// values, plus per-unroll-part step additions; results go into \p Entry.
// If \p TruncType is non-null the IV is generated in that narrower type.
void InnerLoopVectorizer::createVectorIntInductionPHI(
    const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType) {
  Value *Start = II.getStartValue();
  ConstantInt *Step = II.getConstIntStepValue();
  assert(Step && "Can not widen an IV with a non-constant step");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (TruncType) {
    Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
  Builder.restoreIP(CurrIP);

  // Each unroll part advances by VF * Step lanes-worth of induction.
  Value *SplatVF =
      ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
                                                          VF * Step->getSExtValue()));
  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    Entry[Part] = LastInduction;
    LastInduction = cast<Instruction>(
        Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

// Returns true if a scalar copy of the induction \p IV is needed: either the
// IV itself stays scalar after vectorization, or some in-loop user does.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (Legal->isScalarAfterVectorization(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
  };
  return any_of(IV->users(), isScalarInst);
}

// Widen the integer induction \p IV, filling \p Entry with one vector value
// per unroll part. \p Trunc, if given, is a truncation of the IV whose
// narrower type should be used instead.
void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
                                            TruncInst *Trunc) {

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // If a truncate instruction was provided, get the smaller type.
  auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The step of the induction.
  Value *Step = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // If the induction variable has a constant integer step value, go ahead and
  // get it now.
  if (ID.getConstIntStepValue())
    Step = ID.getConstIntStepValue();

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && IV->getType() == Induction->getType() && Step &&
      !Legal->isScalarAfterVectorization(EntryVal)) {
    createVectorIntInductionPHI(ID, Entry, TruncType);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and constant step. Otherwise, derive these values from
  // the induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    if (TruncType) {
      assert(Step && "Truncation requires constant integer step");
      auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
      ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
      Step = ConstantInt::getSigned(TruncType, StepInt);
    } else {
      ScalarIV = Induction;
      auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
      if (IV != OldInduction) {
        // Derive this induction from the canonical one via the descriptor's
        // start/step transform.
        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
        ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
        ScalarIV->setName("offset.idx");
      }
      if (!Step) {
        // Non-constant step: materialize it by expanding the SCEV.
        SCEVExpander Exp(*PSE.getSE(), DL, "induction");
        Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                                 &*Builder.GetInsertPoint());
      }
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part)
      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal);
}

// Return a vector whose lane i equals Val[i] (+/-) (StartIdx + i) * Step.
// BinOp selects FAdd/FSub for floating-point inductions; integer inductions
// always use add of a multiplied index vector.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setUnsafeAlgebra();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant.
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}

// Compute ScalarIV + (VF * Part + Lane) * Step for every unroll part and
// lane, recording the values in ScalarIVMap keyed by \p EntryVal.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Value *EntryVal) {

  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
         "Val and Step should have the same integer type");

  // Compute the scalar steps and save the results in ScalarIVMap.
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned I = 0; I < VF; ++I) {
      auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
      auto *Mul = Builder.CreateMul(StartIdx, Step);
      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
      ScalarIVMap[EntryVal].push_back(Add);
    }
}

// Returns 1 if \p Ptr advances by one element per iteration, -1 if it
// advances by minus one element, and 0 if it is not provably consecutive.
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
  auto *SE = PSE.getSE();
  // Make sure that the pointer does not point to structs.
  if (Ptr->getType()->getPointerElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable, we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionDescriptor II = Inductions[Phi];
    return II.getConsecutiveDirection();
  }

  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are constant, then we know it is consecutive.
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionDescriptor II = Inductions[Phi];
    return II.getConsecutiveDirection();
  }

  unsigned InductionOperand = getGEPInductionOperand(Gep);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0; i != NumOperands; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last non-zero index is the
  // induction variable.
  const SCEV *Last = nullptr;
  if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep))
    Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
  else {
    // Because of the multiplication by a stride we can have a s/zext cast.
    // We are going to replace this stride by 1 so the cast is safe to ignore.
    //
    // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
    // %0 = trunc i64 %indvars.iv to i32
    // %mul = mul i32 %0, %Stride1
    // %idxprom = zext i32 %mul to i64  << Safe cast.
    // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
    //
    Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(),
                                     Gep->getOperand(InductionOperand), Gep);
    if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
      Last =
          (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
              ? C->getOperand()
              : Last;
  }
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return 1;
    if (Step->isAllOnesValue())
      return -1;
  }

  return 0;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return LAI->isUniform(V);
}

// Return the vector (per unroll part) form of scalar \p V, creating a
// broadcast in the preheader for loop-invariant/constant values on demand.
InnerLoopVectorizer::VectorParts &
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // If we have a stride that is replaced by one, do it here.
  if (Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have this scalar in the map, return it.
  if (WidenMap.has(V))
    return WidenMap.get(V);

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  return WidenMap.splat(V, B);
}

// Reverse the lanes of \p Vec via a shuffle with mask <VF-1, ..., 1, 0>.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}

// Get a mask to interleave \p NumVec vectors into a wide vector.
// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
//   <0, 4, 1, 5, 2, 6, 3, 7>
static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
                                    unsigned NumVec) {
  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Mask.push_back(Builder.getInt32(j * VF + i));

  return ConstantVector::get(Mask);
}

// Get the strided mask starting from index \p Start.
// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
                                unsigned Stride, unsigned VF) {
  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Builder.getInt32(Start + i * Stride));

  return ConstantVector::get(Mask);
}

// Get a mask of two parts: The first part consists of sequential integers
// starting from 0, The second part consists of UNDEFs.
// I.e.
<0, 1, 2, ..., NumInt - 1, undef, ..., undef> 2303 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, 2304 unsigned NumUndef) { 2305 SmallVector<Constant *, 16> Mask; 2306 for (unsigned i = 0; i < NumInt; i++) 2307 Mask.push_back(Builder.getInt32(i)); 2308 2309 Constant *Undef = UndefValue::get(Builder.getInt32Ty()); 2310 for (unsigned i = 0; i < NumUndef; i++) 2311 Mask.push_back(Undef); 2312 2313 return ConstantVector::get(Mask); 2314 } 2315 2316 // Concatenate two vectors with the same element type. The 2nd vector should 2317 // not have more elements than the 1st vector. If the 2nd vector has less 2318 // elements, extend it with UNDEFs. 2319 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, 2320 Value *V2) { 2321 VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); 2322 VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); 2323 assert(VecTy1 && VecTy2 && 2324 VecTy1->getScalarType() == VecTy2->getScalarType() && 2325 "Expect two vectors with the same element type"); 2326 2327 unsigned NumElts1 = VecTy1->getNumElements(); 2328 unsigned NumElts2 = VecTy2->getNumElements(); 2329 assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); 2330 2331 if (NumElts1 > NumElts2) { 2332 // Extend with UNDEFs. 2333 Constant *ExtMask = 2334 getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); 2335 V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); 2336 } 2337 2338 Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); 2339 return Builder.CreateShuffleVector(V1, V2, Mask); 2340 } 2341 2342 // Concatenate vectors in the given list. All vectors have the same type. 
2343 static Value *ConcatenateVectors(IRBuilder<> &Builder, 2344 ArrayRef<Value *> InputList) { 2345 unsigned NumVec = InputList.size(); 2346 assert(NumVec > 1 && "Should be at least two vectors"); 2347 2348 SmallVector<Value *, 8> ResList; 2349 ResList.append(InputList.begin(), InputList.end()); 2350 do { 2351 SmallVector<Value *, 8> TmpList; 2352 for (unsigned i = 0; i < NumVec - 1; i += 2) { 2353 Value *V0 = ResList[i], *V1 = ResList[i + 1]; 2354 assert((V0->getType() == V1->getType() || i == NumVec - 2) && 2355 "Only the last vector may have a different type"); 2356 2357 TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); 2358 } 2359 2360 // Push the last vector if the total number of vectors is odd. 2361 if (NumVec % 2 != 0) 2362 TmpList.push_back(ResList[NumVec - 1]); 2363 2364 ResList = TmpList; 2365 NumVec = ResList.size(); 2366 } while (NumVec > 1); 2367 2368 return ResList[0]; 2369 } 2370 2371 // Try to vectorize the interleave group that \p Instr belongs to. 2372 // 2373 // E.g. Translate following interleaved load group (factor = 3): 2374 // for (i = 0; i < N; i+=3) { 2375 // R = Pic[i]; // Member of index 0 2376 // G = Pic[i+1]; // Member of index 1 2377 // B = Pic[i+2]; // Member of index 2 2378 // ... // do something to R, G, B 2379 // } 2380 // To: 2381 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2382 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2383 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2384 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2385 // 2386 // Or translate following interleaved store group (factor = 3): 2387 // for (i = 0; i < N; i+=3) { 2388 // ... 
//       do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position. The whole group is
  // emitted once, when the designated insert-position member is visited.
  if (Instr != Group->getInsertPos())
    return;

  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);
  Value *Ptr = getPointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store: a single wide
  // vector of InterleaveFactor * VF scalar elements.
  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());

  // Prepare for the new pointers: one wide pointer per unroll part.
  setDebugLocFromInst(Builder, Ptr);
  VectorParts &PtrParts = getVectorValue(Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);
  for (unsigned Part = 0; Part < UF; Part++) {
    // Extract the pointer for current instruction from the pointer vector. A
    // reverse access uses the pointer in the last lane.
    Value *NewPtr = Builder.CreateExtractElement(
        PtrParts[Part],
        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // Vectorize the interleaved load group: one wide load per part, then a
  // strided shuffle per member to de-interleave its elements.
  if (LI) {
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
          NewPtrs[Part], Group->getAlignment(), "wide.vec");

      for (unsigned i = 0; i < InterleaveFactor; i++) {
        Instruction *Member = Group->getMember(i);

        // Skip the gaps in the group.
        if (!Member)
          continue;

        Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoadInstr, UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
        }

        VectorParts &Entry = WidenMap.get(Member);
        Entry[Part] =
            Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
      }

      addMetadata(NewLoadInstr, Instr);
    }
    return;
  }

  // The sub vector type for current instruction: VF elements per member.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec =
          getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = ConcatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr =
        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
    addMetadata(NewStoreInstr, Instr);
  }
}

// Widen a single load or store: emit a wide (or gather/scatter) memory
// access per unroll part, or fall back to scalarization when widening is
// not legal/profitable for this access.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  // Try to vectorize the interleave group if this access is interleaved.
  if (Legal->isAccessInterleaved(Instr))
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = LI ?
      LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getPointerOperand(Instr);
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
  uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;

  // A store in a predicated block that does not support masking must be
  // scalarized and predicated element-by-element.
  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
      !Legal->isMaskRequired(SI))
    return scalarizeInstruction(Instr, true);

  // Padding between elements (alloc size != store size) would make the wide
  // access read/write the padding; scalarize instead.
  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant scalarize the load.
  if (LI && Legal->isUniform(Ptr))
    return scalarizeInstruction(Instr);

  // If the pointer is non-consecutive and gather/scatter is not supported
  // scalarize the instruction.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool CreateGatherScatter =
      !ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) ||
                             (SI && Legal->isLegalMaskedScatter(ScalarDataTy)));

  if (!ConsecutiveStride && !CreateGatherScatter)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);
  VectorParts VectorGep;

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
  if (ConsecutiveStride) {
    if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
      setDebugLocFromInst(Builder, Gep);
      Value *PtrOperand = Gep->getPointerOperand();
      Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
      FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

      // Create the new GEP with the new induction variable.
      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
      Gep2->setOperand(0, FirstBasePtr);
      Gep2->setName("gep.indvar.base");
      Ptr = Builder.Insert(Gep2);
    } else if (Gep) {
      setDebugLocFromInst(Builder, Gep);
      assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
                                          OrigLoop) &&
             "Base ptr must be invariant");
      // The last index does not have to be the induction. It can be
      // consecutive and be a function of the index. For example A[I+1];
      unsigned NumOperands = Gep->getNumOperands();
      unsigned InductionOperand = getGEPInductionOperand(Gep);
      // Create the new GEP with the new induction variable.
      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());

      for (unsigned i = 0; i < NumOperands; ++i) {
        Value *GepOperand = Gep->getOperand(i);
        Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);

        // Update last index or loop invariant instruction anchored in loop.
        if (i == InductionOperand ||
            (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
          assert((i == InductionOperand ||
                  PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
                                               OrigLoop)) &&
                 "Must be last index or loop invariant");

          VectorParts &GEPParts = getVectorValue(GepOperand);

          // If GepOperand is an induction variable, and there's a scalarized
          // version of it available, use it. Otherwise, we will need to create
          // an extractelement instruction.
          Value *Index = ScalarIVMap.count(GepOperand)
                             ? ScalarIVMap[GepOperand][0]
                             : Builder.CreateExtractElement(GEPParts[0], Zero);

          Gep2->setOperand(i, Index);
          Gep2->setName("gep.indvar.idx");
        }
      }
      Ptr = Builder.Insert(Gep2);
    } else { // No GEP
      // Use the induction element ptr.
      assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
      setDebugLocFromInst(Builder, Ptr);
      VectorParts &PtrVal = getVectorValue(Ptr);
      Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
    }
  } else {
    // At this point we should have a vector version of the GEP for the gather
    // or scatter.
    assert(CreateGatherScatter && "The instruction should be scalarized");
    if (Gep) {
      // Vectorizing GEP, across UF parts. We want to get a vector value for base
      // and each index that's defined inside the loop, even if it is
      // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
      // scalar.
      SmallVector<VectorParts, 4> OpsV;
      for (Value *Op : Gep->operands()) {
        Instruction *SrcInst = dyn_cast<Instruction>(Op);
        if (SrcInst && OrigLoop->contains(SrcInst))
          OpsV.push_back(getVectorValue(Op));
        else
          OpsV.push_back(VectorParts(UF, Op));
      }
      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value *, 4> Ops;
        Value *GEPBasePtr = OpsV[0][Part];
        for (unsigned i = 1; i < Gep->getNumOperands(); i++)
          Ops.push_back(OpsV[i][Part]);
        Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
        cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
        assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");

        NewGep =
            Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
        VectorGep.push_back(NewGep);
      }
    } else
      VectorGep = getVectorValue(Ptr);
  }

  VectorParts Mask = createBlockInMask(Instr->getParent());
  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't use a reference type for "StoredVal".
    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      if (CreateGatherScatter) {
        Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
        NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
                                            Alignment, MaskPart);
      } else {
        // Calculate the pointer for the specific unroll-part.
        Value *PartPtr =
            Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal[Part] = reverseVector(StoredVal[Part]);
          // If the address is consecutive but reversed, then the
          // wide store needs to start at the last vector element.
          PartPtr =
              Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
          PartPtr =
              Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
          Mask[Part] = reverseVector(Mask[Part]);
        }

        Value *VecPtr =
            Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

        if (Legal->isMaskRequired(SI))
          NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI =
              Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Instruction *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
      NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
                                         0, "wide.masked.gather");
      Entry[Part] = NewLI;
    } else {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr =
          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If the address is consecutive but reversed, then the
        // wide load needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);
      }

      Value *VecPtr =
          Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
      if (Legal->isMaskRequired(LI))
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
      // A reverse access also needs the loaded lanes put back in source order.
      Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
    }
    addMetadata(NewLI, LI);
  }
}

// Replace one instruction with VF * UF scalar clones, extracting scalar
// operands from the widened values. When IfPredicateStore is set, each cloned
// store is recorded together with its edge-mask condition for later
// predication.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (Value *SrcOp : Instr->operands()) {
    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    auto *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block,
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      // Replicate the same scalar for every unroll part.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec =
      IsVoidRetTy ? nullptr
                  : UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  VectorParts Cond;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {

      // Start if-block: extract this lane's predicate and compare it to 1.
      Value *Cmp = nullptr;
      if (IfPredicateStore) {
        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                 ConstantInt::get(Cmp->getType(), 1));
      }

      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {

        // If the operand is an induction variable, and there's a scalarized
        // version of it available, use it. Otherwise, we will need to create
        // an extractelement instruction if vectorizing.
        auto *NewOp = Params[op][Part];
        auto *ScalarOp = Instr->getOperand(op);
        if (ScalarIVMap.count(ScalarOp))
          NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
        else if (NewOp->getType()->isVectorTy())
          NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
        Cloned->setOperand(op, NewOp);
      }
      addNewMetadata(Cloned, Instr);

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If we just cloned a new assumption, add it the assumption cache.
      if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
        if (II->getIntrinsicID() == Intrinsic::assume)
          AC->registerAssumption(II);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                       Builder.getInt32(Width));
      // End if-block.
      // Record the predicated store with its condition; it is sunk into a
      // conditional block later.
      if (IfPredicateStore)
        PredicatedStores.push_back(
            std::make_pair(cast<StoreInst>(Cloned), Cmp));
    }
  }
}

// Create the canonical induction variable of loop \p L: a phi starting at
// \p Start, incremented by \p Step each iteration, with the latch branching
// back until the value reaches \p End.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

// Compute (and cache) the scalar trip count of loop \p L, expanded into the
// preheader as backedge-taken-count + 1 in the widest induction type.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare.
  // The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // A pointer-typed count (e.g. from a pointer IV) is converted to the
  // integer index type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

// Compute (and cache) the number of iterations the vector body executes:
// the scalar trip count rounded down to a multiple of VF * UF.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF > 1 && Legal->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

// Emit the guard that bypasses the vector loop (branching to \p Bypass) when
// the trip count is smaller than one full vector iteration (VF * UF).
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check that the loop's trip count that we computed by
  // adding one to the backedge-taken count will not overflow.
  Value *CheckMinIters = Builder.CreateICmpULT(
      Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");

  BasicBlock *NewBB =
      BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}

// Emit the guard that skips the vector loop (branching to \p Bypass) when the
// vector trip count is zero.
void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
                                                     BasicBlock *Bypass) {
  Value *TC = getOrCreateVectorTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
                                    "cmp.zero");

  // Generate code to check that the loop's trip count that we computed by
  // adding one to the backedge-taken count will not overflow.
  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, Cmp));
  LoopBypassBlocks.push_back(BB);
}

// Emit the runtime checks for the SCEV predicates accumulated in PSE,
// branching to \p Bypass (the scalar loop) when any predicate fails.
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  // A constant-false check can never fail; no bypass block is needed.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

// Emit the runtime memory-overlap (aliasing) checks produced by LoopAccess
// analysis, branching to \p Bypass (the scalar loop) when arrays may overlap.
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                           PSE.getSE());
  LVer->prepareNoAliasMetadata();
}

void InnerLoopVectorizer::createEmptyLoop() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible.
  // However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // We need to test whether the backedge-taken count is uint##_max. Adding one
  // to it will cause overflow and an incorrect loop trip count in the vector
  // body. In case of overflow we want to directly jump to the scalar remainder
  // loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);
  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  emitVectorLoopEnteredCheck(Lp, ScalarPH);
  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    Value *EndValue;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      // Compute the IV's value after CountRoundDown iterations via the
      // induction descriptor's transform, in the step's type.
      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = II.transform(B, CRD, PSE.getSE(), DL);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix up external users of the induction variable.
    fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
  }

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN =
      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
  ReplaceInstWithInst(MiddleBlock->getTerminator(),
                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  // Mark the new loop as already vectorized so later vectorizer runs skip it.
  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  // Maps each out-of-loop user (an LCSSA phi) to the value it must receive
  // along the edge from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      // Materialize CRD - 1 in the middle block and let the induction
      // descriptor rebuild Start + Step * (CRD - 1) from it.
      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(),
                                       "cast.cmo");
      Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {
// DenseMap traits used by cse() below: keys are instructions, hashed and
// compared structurally (opcode + operands) so identical computations collide.
struct CSEDenseMapInfo {
  // Only these instruction kinds are safe/profitable to CSE here.
  static bool canHandle(Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }
  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }
  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }
  static unsigned getHashValue(Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }
  static bool isEqual(Instruction *LHS, Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};
}

///\brief Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    // Advance the iterator before a possible erase below invalidates it.
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

/// \brief Adds a 'fast' flag to floating point operations.
3371 static Value *addFastMathFlag(Value *V) { 3372 if (isa<FPMathOperator>(V)) { 3373 FastMathFlags Flags; 3374 Flags.setUnsafeAlgebra(); 3375 cast<Instruction>(V)->setFastMathFlags(Flags); 3376 } 3377 return V; 3378 } 3379 3380 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if 3381 /// the result needs to be inserted and/or extracted from vectors. 3382 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, 3383 const TargetTransformInfo &TTI) { 3384 if (Ty->isVoidTy()) 3385 return 0; 3386 3387 assert(Ty->isVectorTy() && "Can only scalarize vectors"); 3388 unsigned Cost = 0; 3389 3390 for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { 3391 if (Insert) 3392 Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); 3393 if (Extract) 3394 Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); 3395 } 3396 3397 return Cost; 3398 } 3399 3400 // Estimate cost of a call instruction CI if it were vectorized with factor VF. 3401 // Return the cost of the instruction, including scalarization overhead if it's 3402 // needed. The flag NeedToScalarize shows if the call needs to be scalarized - 3403 // i.e. either vector version isn't available, or is too expensive. 3404 static unsigned getVectorCallCost(CallInst *CI, unsigned VF, 3405 const TargetTransformInfo &TTI, 3406 const TargetLibraryInfo *TLI, 3407 bool &NeedToScalarize) { 3408 Function *F = CI->getCalledFunction(); 3409 StringRef FnName = CI->getCalledFunction()->getName(); 3410 Type *ScalarRetTy = CI->getType(); 3411 SmallVector<Type *, 4> Tys, ScalarTys; 3412 for (auto &ArgOp : CI->arg_operands()) 3413 ScalarTys.push_back(ArgOp->getType()); 3414 3415 // Estimate cost of scalarized vector call. The source operands are assumed 3416 // to be vectors, so we need to extract individual elements from there, 3417 // execute VF scalar calls, and then gather the result into the vector return 3418 // value. 
3419 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3420 if (VF == 1) 3421 return ScalarCallCost; 3422 3423 // Compute corresponding vector type for return value and arguments. 3424 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3425 for (Type *ScalarTy : ScalarTys) 3426 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3427 3428 // Compute costs of unpacking argument values for the scalar calls and 3429 // packing the return values to a vector. 3430 unsigned ScalarizationCost = 3431 getScalarizationOverhead(RetTy, true, false, TTI); 3432 for (Type *Ty : Tys) 3433 ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); 3434 3435 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3436 3437 // If we can't emit a vector call for this function, then the currently found 3438 // cost is the cost we need to return. 3439 NeedToScalarize = true; 3440 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3441 return Cost; 3442 3443 // If the corresponding vector cost is cheaper, return its cost. 3444 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3445 if (VectorCallCost < Cost) { 3446 NeedToScalarize = false; 3447 return VectorCallCost; 3448 } 3449 return Cost; 3450 } 3451 3452 // Estimate cost of an intrinsic call instruction CI if it were vectorized with 3453 // factor VF. Return the cost of the instruction, including scalarization 3454 // overhead if it's needed. 
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
                                       const TargetTransformInfo &TTI,
                                       const TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  // Widen the return and argument types by VF and ask the target for the cost.
  Type *RetTy = ToVectorTy(CI->getType(), VF);
  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

  // Forward any fast-math flags present on the call to the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
}

/// Return whichever of the two integer vector types has the narrower elements.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
/// Return whichever of the two integer vector types has the wider elements.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  //
  // Tracks values already replaced so we never process a stale pointer twice.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : *MinBWs) {
    VectorParts &Parts = WidenMap.get(KV.first);
    for (Value *&I : Parts) {
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      // Already at the minimal width; nothing to do.
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand, peeking through a zext of the truncated type to
      // avoid emitting a redundant trunc(zext(x)) pair.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));
        cast<BinaryOperator>(NewI)->copyIRFlags(I);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Shuffle operands may have different element counts; narrow each
        // to the truncated scalar type with its own lane count.
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        llvm_unreachable("Unhandled instruction type!");
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      I = Res;
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : *MinBWs) {
    VectorParts &Parts = WidenMap.get(KV.first);
    for (Value *&I : Parts) {
      // A zext with no users is dead scaffolding from the pass above; drop it
      // and record its operand as the live value for this part.
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        I = NewI;
      }
    }
  }
}

void InnerLoopVectorizer::vectorizeLoop() {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  Constant *Zero = Builder.getInt32(0);

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. First,
  // we create a new vector PHI node with no incoming edges. We use this value
  // when we vectorize all of the instructions that use the PHI. Next, after
  // all of the instructions in the block are complete we add the new incoming
  // edges to the PHI. At this point all of the instructions in the basic block
  // are vectorized, so we can use them to construct the PHI.
  PhiVector PHIsToFix;

  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  // Vectorize all of the blocks in the original loop.
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    vectorizeBlockInLoop(BB, &PHIsToFix);

  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  for (PHINode *Phi : PHIsToFix) {
    assert(Phi && "Unable to recover vectorized PHI");

    // Handle first-order recurrences that need to be fixed.
    if (Legal->isFirstOrderRecurrence(Phi)) {
      fixFirstOrderRecurrence(Phi);
      continue;
    }

    // If the phi node is not a first-order recurrence, it must be a reduction.
    // Get its reduction variable descriptor.
    assert(Legal->isReductionVariable(Phi) &&
           "Unable to find the reduction variable");
    RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

    RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
    TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
    Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
        RdxDesc.getMinMaxRecurrenceKind();
    setDebugLocFromInst(Builder, ReductionStartValue);

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(LoopExitInst);
    Type *VecTy = VectorExit[0]->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Value *Identity;
    Value *VectorStart;
    if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
        RK == RecurrenceDescriptor::RK_FloatMinMax) {
      // MinMax reductions have the start value as their identity.
      if (VF == 1) {
        VectorStart = Identity = ReductionStartValue;
      } else {
        VectorStart = Identity =
            Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
      }
    } else {
      // Handle other reduction kinds:
      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
          RK, VecTy->getScalarType());
      if (VF == 1) {
        Identity = Iden;
        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart = ReductionStartValue;
      } else {
        Identity = ConstantVector::getSplat(VF, Iden);

        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart =
            Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
      }
    }

    // Fix the vector-loop phi.

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VectorParts &VecRdxPhi = WidenMap.get(Phi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
    VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {
      // Make sure to add the reduction start value only to the
      // first unroll part.
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])
          ->addIncoming(StartVal, LoopVectorPreHeader);
      cast<PHINode>(VecRdxPhi[part])
          ->addIncoming(Val[part], LoopVectorBody);
    }

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

    VectorParts RdxParts = getVectorValue(LoopExitInst);
    setDebugLocFromInst(Builder, LoopExitInst);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
      Builder.SetInsertPoint(LoopVectorBody->getTerminator());
      for (unsigned part = 0; part < UF; ++part) {
        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
        Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                          : Builder.CreateZExt(Trunc, VecTy);
        // Redirect every user except the new trunc to the extended value.
        for (Value::user_iterator UI = RdxParts[part]->user_begin();
             UI != RdxParts[part]->user_end();)
          if (*UI != Trunc) {
            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
            RdxParts[part] = Extnd;
          } else {
            ++UI;
          }
      }
      Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
      for (unsigned part = 0; part < UF; ++part)
        RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
    }

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
    setDebugLocFromInst(Builder, ReducedPartRdx);
    for (unsigned part = 1; part < UF; ++part) {
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        // Floating point operations had to be 'fast' to enable the reduction.
        ReducedPartRdx = addFastMathFlag(
            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
                                ReducedPartRdx, "bin.rdx"));
      else
        ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
            Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
    }

    if (VF > 1) {
      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
      // and vector ops, reducing the set of values being computed by half each
      // round.
      assert(isPowerOf2_32(VF) &&
             "Reduction emission only supported for pow2 vectors!");
      Value *TmpVec = ReducedPartRdx;
      SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
      for (unsigned i = VF; i != 1; i >>= 1) {
        // Move the upper half of the vector to the lower half.
        for (unsigned j = 0; j != i / 2; ++j)
          ShuffleMask[j] = Builder.getInt32(i / 2 + j);

        // Fill the rest of the mask with undef.
        std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
                  UndefValue::get(Builder.getInt32Ty()));

        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()),
            ConstantVector::get(ShuffleMask), "rdx.shuf");

        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          // Floating point operations had to be 'fast' to enable the reduction.
          TmpVec = addFastMathFlag(Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
        else
          TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
                                                        TmpVec, Shuf);
      }

      // The result is in the first element of the vector.
      ReducedPartRdx =
          Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));

      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (Phi->getType() != RdxDesc.getRecurrenceType())
        ReducedPartRdx =
            RdxDesc.isSigned()
                ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
                : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
    }

    // Create a phi node that merges control-flow from the backedge-taken check
    // block and the middle block.
    PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                          LoopScalarPreHeader->getTerminator());
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
                              LEE = LoopExitBlock->end();
         LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi)
        break;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
        break;
      }
    } // end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx =
        Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
    Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
  } // end of for each Phi in PHIsToFix.

  fixLCSSAPHIs();

  // Make sure DomTree is updated.
  updateAnalysis();

  predicateStores();

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {

  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
3886 // 3887 // vector.ph: 3888 // v_init = vector(..., ..., ..., a[-1]) 3889 // br vector.body 3890 // 3891 // vector.body 3892 // i = phi [0, vector.ph], [i+4, vector.body] 3893 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3894 // v2 = a[i, i+1, i+2, i+3]; 3895 // v3 = vector(v1(3), v2(0, 1, 2)) 3896 // b[i, i+1, i+2, i+3] = v2 - v3 3897 // br cond, vector.body, middle.block 3898 // 3899 // middle.block: 3900 // x = v2(3) 3901 // br scalar.ph 3902 // 3903 // scalar.ph: 3904 // s_init = phi [x, middle.block], [a[-1], otherwise] 3905 // br scalar.body 3906 // 3907 // After execution completes the vector loop, we extract the next value of 3908 // the recurrence (x) to use as the initial value in the scalar loop. 3909 3910 // Get the original loop preheader and single loop latch. 3911 auto *Preheader = OrigLoop->getLoopPreheader(); 3912 auto *Latch = OrigLoop->getLoopLatch(); 3913 3914 // Get the initial and previous values of the scalar recurrence. 3915 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3916 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3917 3918 // Create a vector from the initial value. 3919 auto *VectorInit = ScalarInit; 3920 if (VF > 1) { 3921 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3922 VectorInit = Builder.CreateInsertElement( 3923 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3924 Builder.getInt32(VF - 1), "vector.recur.init"); 3925 } 3926 3927 // We constructed a temporary phi node in the first phase of vectorization. 3928 // This phi node will eventually be deleted. 3929 auto &PhiParts = getVectorValue(Phi); 3930 Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); 3931 3932 // Create a phi node for the new recurrence. The current value will either be 3933 // the initial value inserted into a vector or loop-varying vector value. 
3934 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3935 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3936 3937 // Get the vectorized previous value. We ensured the previous values was an 3938 // instruction when detecting the recurrence. 3939 auto &PreviousParts = getVectorValue(Previous); 3940 3941 // Set the insertion point to be after this instruction. We ensured the 3942 // previous value dominated all uses of the phi when detecting the 3943 // recurrence. 3944 Builder.SetInsertPoint( 3945 &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); 3946 3947 // We will construct a vector for the recurrence by combining the values for 3948 // the current and previous iterations. This is the required shuffle mask. 3949 SmallVector<Constant *, 8> ShuffleMask(VF); 3950 ShuffleMask[0] = Builder.getInt32(VF - 1); 3951 for (unsigned I = 1; I < VF; ++I) 3952 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3953 3954 // The vector from which to take the initial value for the current iteration 3955 // (actual or unrolled). Initially, this is the vector phi node. 3956 Value *Incoming = VecPhi; 3957 3958 // Shuffle the current and previous vector and update the vector parts. 3959 for (unsigned Part = 0; Part < UF; ++Part) { 3960 auto *Shuffle = 3961 VF > 1 3962 ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part], 3963 ConstantVector::get(ShuffleMask)) 3964 : Incoming; 3965 PhiParts[Part]->replaceAllUsesWith(Shuffle); 3966 cast<Instruction>(PhiParts[Part])->eraseFromParent(); 3967 PhiParts[Part] = Shuffle; 3968 Incoming = PreviousParts[Part]; 3969 } 3970 3971 // Fix the latch value of the new recurrence in the vector loop. 3972 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3973 3974 // Extract the last vector element in the middle block. This will be the 3975 // initial value for the recurrence when jumping to the scalar loop. 
3976 auto *Extract = Incoming; 3977 if (VF > 1) { 3978 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3979 Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), 3980 "vector.recur.extract"); 3981 } 3982 3983 // Fix the initial value of the original recurrence in the scalar loop. 3984 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3985 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3986 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3987 auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; 3988 Start->addIncoming(Incoming, BB); 3989 } 3990 3991 Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); 3992 Phi->setName("scalar.recur"); 3993 3994 // Finally, fix users of the recurrence outside the loop. The users will need 3995 // either the last value of the scalar recurrence or the last value of the 3996 // vector recurrence we extracted in the middle block. Since the loop is in 3997 // LCSSA form, we just need to find the phi node for the original scalar 3998 // recurrence in the exit block, and then add an edge for the middle block. 
  // LCSSA phis are grouped at the top of the exit block, so we can stop at
  // the first non-phi.
  for (auto &I : *LoopExitBlock) {
    auto *LCSSAPhi = dyn_cast<PHINode>(&I);
    if (!LCSSAPhi)
      break;
    if (LCSSAPhi->getIncomingValue(0) == Phi) {
      LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
      break;
    }
  }
}

// Give every single-operand LCSSA phi in the exit block an incoming value
// for the new middle block. The value is undef: if the vector loop executed
// the full trip count, the scalar value feeding the phi was never live, so
// any placeholder keeps the phi well-formed.
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (Instruction &LEI : *LoopExitBlock) {
    auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
    if (!LCSSAPhi)
      break;
    if (LCSSAPhi->getNumIncomingValues() == 1)
      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                            LoopMiddleBlock);
  }
}

// Sink each store recorded in PredicatedStores into its own conditional
// block, guarded by the store's mask, so the store only executes for active
// lanes.
void InnerLoopVectorizer::predicateStores() {
  for (auto KV : PredicatedStores) {
    // KV.first is the store instruction, KV.second its predicate.
    BasicBlock::iterator I(KV.first);
    auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
    auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
                                        /*BranchWeights=*/nullptr, DT, LI);
    I->moveBefore(T);
    I->getParent()->setName("pred.store.if");
    BB->setName("pred.store.continue");
  }
  DEBUG(DT->verifyDomTree());
}

// Compute (and cache) the predication mask for the CFG edge Src -> Dst:
// the mask of Src AND'ed with the branch condition (negated if Dst is the
// false successor).
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
  if (ECEntryIt != MaskCache.end())
    return ECEntryIt->second;

  VectorParts SrcMask = createBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (BI->isConditional()) {
    VectorParts EdgeMask = getVectorValue(BI->getCondition());

    // If Dst is reached on the false edge, the condition must be negated.
    if (BI->getSuccessor(0) != Dst)
      for (unsigned part = 0; part < UF; ++part)
        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);

    // The edge is taken only if the source block itself is active.
    for (unsigned part = 0; part < UF; ++part)
      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);

    MaskCache[Edge] = EdgeMask;
    return EdgeMask;
  }

  // Unconditional branch: the edge mask is just the source block's mask.
  MaskCache[Edge] = SrcMask;
  return SrcMask;
}

// Compute the predication mask for block BB: the OR of the masks of all
// incoming edges. The loop header is unconditionally executed, so its mask
// is all-ones.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Loop incoming mask is all-one.
  if (OrigLoop->getHeader() == BB) {
    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
    return getVectorValue(C);
  }

  // This is the block mask. We OR all incoming edges, and with zero.
  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
  VectorParts BlockMask = getVectorValue(Zero);

  // For each pred:
  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
    VectorParts EM = createEdgeMask(*it, BB);
    for (unsigned part = 0; part < UF; ++part)
      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
  }

  return BlockMask;
}

// Widen a phi node: reductions and first-order recurrences get placeholder
// vector phis (fixed up later), non-header phis become select trees driven
// by edge masks, and header phis must be inductions, expanded per kind.
void InnerLoopVectorizer::widenPHIInstruction(
    Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
    unsigned VF, PhiVector *PV) {
  PHINode *P = cast<PHINode>(PN);
  // Handle recurrences.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned part = 0; part < UF; ++part) {
      // This is phase one of vectorizing PHIs: create an empty vector phi
      // now; its incoming values are filled in after the loop body is
      // vectorized. The phi is recorded in PV for that later fixup.
      Type *VecTy =
          (VF == 1) ?
          PN->getType() : VectorType::get(PN->getType(), VF);
      Entry[part] = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
    }
    PV->push_back(P);
    return;
  }

  setDebugLocFromInst(Builder, P);
  // Check for PHI nodes that are lowered to vector selects.
  if (P->getParent() != OrigLoop->getHeader()) {
    // We know that all PHIs in non-header blocks are converted into
    // selects, so we don't have to worry about the insertion order and we
    // can just use the builder.
    // At this point we generate the predication tree. There may be
    // duplications since this is a simple recursive scan, but future
    // optimizations will clean it up.

    unsigned NumIncoming = P->getNumIncomingValues();

    // Generate a sequence of selects of the form:
    // SELECT(Mask3, In3,
    //        SELECT(Mask2, In2,
    //               ( ...)))
    for (unsigned In = 0; In < NumIncoming; In++) {
      VectorParts Cond =
          createEdgeMask(P->getIncomingBlock(In), P->getParent());
      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));

      for (unsigned part = 0; part < UF; ++part) {
        // We might have single edge PHIs (blocks) - use an identity
        // 'select' for the first PHI operand.
        if (In == 0)
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
        else
          // Select between the current value and the previous incoming edge
          // based on the incoming mask.
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
                                             "predphi");
      }
    }
    return;
  }

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
    return widenIntInduction(P, Entry);
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // This is the vector of results. Notice that we don't generate
    // vector geps because scalar geps result in better code.
    for (unsigned part = 0; part < UF; ++part) {
      // Scalar case: one GEP per unroll part, offset by the part index.
      if (VF == 1) {
        int EltIndex = part;
        Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
        SclrGep->setName("next.gep");
        Entry[part] = SclrGep;
        continue;
      }

      // Vector case: build the result lane by lane from scalar GEPs, each
      // at lane offset i + part * VF from the normalized induction.
      Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
      for (unsigned int i = 0; i < VF; ++i) {
        int EltIndex = i + part * VF;
        Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
        SclrGep->setName("next.gep");
        VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                             Builder.getInt32(i), "insert.gep");
      }
      Entry[part] = VecVal;
    }
    return;
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(P->getType() == II.getStartValue()->getType() &&
           "Types must match");
    // Handle other induction variables that are now based on the
    // canonical one.
    assert(P != OldInduction && "Primary induction can be integer only");

    Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
    V = II.transform(Builder, V, PSE.getSE(), DL);
    V->setName("fp.offset.idx");

    // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal

    Value *Broadcasted = getBroadcastInstrs(V);
    // After broadcasting the induction variable we need to make the vector
    // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
    // The FP step was validated to be a SCEVUnknown when the induction was
    // recognized, so this cast is safe.
    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
    for (unsigned part = 0; part < UF; ++part)
      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
                                  II.getInductionOpcode());
    return;
  }
  }
}

// Widen every instruction in BB, dispatching on opcode. Phis found here are
// collected in PV for the later fixup phase.
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  for (Instruction &I : *BB) {
    VectorParts &Entry = WidenMap.get(&I);

    switch (I.getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI: {
      // Vectorize PHINodes.
      widenPHIInstruction(&I, Entry, UF, VF, PV);
      continue;
    } // End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      auto *BinOp = cast<BinaryOperator>(&I);
      setDebugLocFromInst(Builder, BinOp);
      VectorParts &A = getVectorValue(BinOp->getOperand(0));
      VectorParts &B = getVectorValue(BinOp->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        // Preserve nsw/nuw/exact/fast-math flags from the scalar op when the
        // builder produced an actual binary operator (it may constant-fold).
        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
          VecOp->copyIRFlags(BinOp);

        Entry[Part] = V;
      }

      addMetadata(Entry, BinOp);
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      auto *SE = PSE.getSE();
      bool InvariantCond =
          SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
      setDebugLocFromInst(Builder, &I);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(I.getOperand(0));
      VectorParts &Op0 = getVectorValue(I.getOperand(1));
      VectorParts &Op1 = getVectorValue(I.getOperand(2));

      Value *ScalarCond =
          (VF == 1)
              ? Cond[0]
              : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));

      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
            InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
      }

      addMetadata(Entry, &I);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (I.getOpcode() == Instruction::FCmp);
      auto *Cmp = dyn_cast<CmpInst>(&I);
      setDebugLocFromInst(Builder, Cmp);
      VectorParts &A = getVectorValue(Cmp->getOperand(0));
      VectorParts &B = getVectorValue(Cmp->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = nullptr;
        if (FCmp) {
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
          // Fast-math flags are only meaningful on FP compares.
          cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
        } else {
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        }
        Entry[Part] = C;
      }

      addMetadata(Entry, &I);
      break;
    }

    case Instruction::Store:
    case Instruction::Load:
      vectorizeMemoryInstruction(&I);
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto *CI = dyn_cast<CastInst>(&I);
      setDebugLocFromInst(Builder, CI);

      // Optimize the special case where the source is a constant integer
      // induction variable. Notice that we can only optimize the 'trunc' case
      // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
      // (c) other casts depend on pointer size.
      auto ID = Legal->getInductionVars()->lookup(OldInduction);
      if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
          ID.getConstIntStepValue()) {
        widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI));
        addMetadata(Entry, &I);
        break;
      }

      /// Vectorize casts.
      Type *DestTy =
          (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

      VectorParts &A = getVectorValue(CI->getOperand(0));
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      addMetadata(Entry, &I);
      break;
    }

    case Instruction::Call: {
      // Ignore dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(I))
        break;
      setDebugLocFromInst(Builder, &I);

      Module *M = BB->getParent()->getParent();
      auto *CI = cast<CallInst>(&I);

      StringRef FnName = CI->getCalledFunction()->getName();
      Function *F = CI->getCalledFunction();
      Type *RetTy = ToVectorTy(CI->getType(), VF);
      SmallVector<Type *, 4> Tys;
      for (Value *ArgOperand : CI->arg_operands())
        Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // assume/lifetime markers have no vector form; replicate them per lane.
      if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
                 ID == Intrinsic::lifetime_start)) {
        scalarizeInstruction(&I);
        break;
      }
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
      if (!UseVectorIntrinsic && NeedToScalarize) {
        scalarizeInstruction(&I);
        break;
      }

      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value *, 4> Args;
        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
          Value *Arg = CI->getArgOperand(i);
          // Some intrinsics have a scalar argument - don't replace it with a
          // vector.
          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
            VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
            Arg = VectorArg[Part];
          }
          Args.push_back(Arg);
        }

        Function *VectorF;
        if (UseVectorIntrinsic) {
          // Use vector version of the intrinsic.
          Type *TysForDecl[] = {CI->getType()};
          if (VF > 1)
            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
        } else {
          // Use vector version of the library call.
          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
          assert(!VFnName.empty() && "Vector function name is empty.");
          VectorF = M->getFunction(VFnName);
          if (!VectorF) {
            // Generate a declaration
            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
            VectorF =
                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
            VectorF->copyAttributesFrom(F);
          }
        }
        assert(VectorF && "Can't create vector function.");

        // Carry over operand bundles (e.g. from the original call site).
        SmallVector<OperandBundleDef, 1> OpBundles;
        CI->getOperandBundlesAsDefs(OpBundles);
        CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

        if (isa<FPMathOperator>(V))
          V->copyFastMathFlags(CI);

        Entry[Part] = V;
      }

      addMetadata(Entry, &I);
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(&I);
      break;
    } // end of switch.
  } // end of for_each instr.
}

// Register the newly created blocks with ScalarEvolution and the dominator
// tree after the vector skeleton has been built.
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  // We don't predicate stores by this point, so the vector body should be a
  // single loop.
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);

  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);

  DEBUG(DT->verifyDomTree());
}

/// \brief Check whether it is safe to if-convert this phi node.
///
/// Phi nodes with constant expressions that can trap are not safe to if
/// convert.
static bool canIfConvertPHINodes(BasicBlock *BB) {
  // Phis are grouped at the top of the block; the first non-phi ends the scan.
  for (Instruction &I : *BB) {
    auto *Phi = dyn_cast<PHINode>(&I);
    if (!Phi)
      return true;
    for (Value *V : Phi->incoming_values())
      if (auto *C = dyn_cast<Constant>(V))
        if (C->canTrap())
          return false;
  }
  return true;
}

// Check that every block of a multi-block loop can be flattened into a
// single predicated block (if-conversion).
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    emitAnalysis(VectorizationReport() << "if-conversion is disabled");
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers that we can safely read and write to.
  SmallPtrSet<Value *, 8> SafePointes;

  // Collect safe addresses: pointers accessed in blocks that execute
  // unconditionally are known dereferenceable on every iteration.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (blockNeedsPredication(BB))
      continue;

    for (Instruction &I : *BB)
      if (auto *Ptr = getPointerOperand(&I))
        SafePointes.insert(Ptr);
  }

  // Collect the blocks that need predication.
  BasicBlock *Header = TheLoop->getHeader();
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      emitAnalysis(VectorizationReport(BB->getTerminator())
                   << "loop contains a switch statement");
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointes)) {
        emitAnalysis(VectorizationReport(BB->getTerminator())
                     << "control flow cannot be substituted for a select");
        return false;
      }
    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
      emitAnalysis(VectorizationReport(BB->getTerminator())
                   << "control flow cannot be substituted for a select");
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Top-level legality check: validates loop shape, instruction legality and
// memory dependences, and records the analysis results used by the cost
// model and by the vectorizer itself.
bool LoopVectorizationLegality::canVectorize() {
  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!TheLoop->getLoopPreheader()) {
    emitAnalysis(VectorizationReport()
                 << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We can only vectorize innermost loops.
  if (!TheLoop->empty()) {
    emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    emitAnalysis(VectorizationReport()
                 << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock()) {
    emitAnalysis(VectorizationReport()
                 << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We only handle bottom-tested loops, i.e. loop in which the condition is
  // checked at the end of each iteration. With that we can assume that all
  // instructions in the loop are executed the same number of times.
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    emitAnalysis(VectorizationReport()
                 << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We need to have a loop header.
  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
               << '\n');

  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = PSE.getBackedgeTakenCount();
  if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
    emitAnalysis(VectorizationReport()
                 << "could not determine number of loop iterations");
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop"
               << (LAI->getRuntimePointerChecking()->Need
                       ? " (with a runtime bound check)"
                       : "")
               << "!\n");

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());

  // Collect all instructions that are known to be uniform after vectorization.
  collectLoopUniforms();

  // Collect all instructions that are known to be scalar after vectorization.
  collectLoopScalars();

  // Loops forced to vectorize via pragma get a higher budget for runtime
  // SCEV predicate checks.
  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
    emitAnalysis(VectorizationReport()
                 << "Too many SCEV assumptions need to be made and checked "
                 << "at runtime");
    DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
    return false;
  }

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

// Map a pointer type to the target's pointer-sized integer, and widen
// sub-32-bit integer types to i32 (see comment below).
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that char's or short's overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

// Return whichever of the two (pointer-normalized) types has more bits.
// Ties go to Ty1.
static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reduction and Induction instructions are allowed to have exit users. All
  // other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

// Record Phi as an induction variable: remember its descriptor, fold its
// type into WidestIndTy, possibly adopt it as the primary induction, and
// allow it (and its latch update) to have users outside the loop.
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;
  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getModule()->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() &&
      ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!Induction || PhiTy == WidestIndTy)
      Induction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  AllowedExit.insert(Phi);
  AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));

  DEBUG(dbgs() << "LV: Found an induction variable.\n");
  return;
}

// Scan every instruction in the loop and classify phis as reductions,
// inductions, or first-order recurrences; reject anything the vectorizer
// cannot widen (unknown phis, unvectorizable calls/types, illegal exit
// users).
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // Look for the attribute signaling the absence of NaNs.
  Function &F = *Header->getParent();
  HasFunNoNaNAttr =
      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          emitAnalysis(VectorizationReport(Phi)
                       << "loop control flow is not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Check that this instruction has no outside users or is an
          // identified reduction value with an outside user.
          if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
            continue;
          emitAnalysis(VectorizationReport(Phi)
                       << "value could not be identified as "
                          "an induction or reduction variable");
          return false;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          emitAnalysis(VectorizationReport(Phi)
                       << "control flow not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
          if (RedDes.hasUnsafeAlgebra())
            Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
            Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
          continue;
        }

        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
          FirstOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to a AddRec expression
        // and re-try classifying it a an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        emitAnalysis(VectorizationReport(Phi)
                     << "value that could not be identified as "
                        "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);
      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
        emitAnalysis(VectorizationReport(CI)
                     << "call instruction cannot be vectorized");
        DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
        return false;
      }

      // Intrinsics such as powi, cttz and ctlz are legal to vectorize only if
      // their second argument is loop invariant (it must stay scalar in the
      // widened call).
      if (CI && hasVectorInstrinsicScalarOpd(
                    getVectorIntrinsicIDForCall(CI, TLI), 1)) {
        auto *SE = PSE.getSE();
        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
          emitAnalysis(VectorizationReport(CI)
                       << "intrinsic instruction cannot be vectorized");
          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
          return false;
        }
      }

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        emitAnalysis(VectorizationReport(&I)
                     << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          emitAnalysis(VectorizationReport(ST)
                       << "store instruction cannot be vectorized");
          return false;
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.hasUnsafeAlgebra()) {
        DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        emitAnalysis(VectorizationReport(&I)
                     << "value cannot be used outside the loop");
        return false;
      }

    } // next instr.
  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    // Failing to pick a primary induction is fine as long as at least one
    // induction exists; a canonical one can be created later.
    if (Inductions.empty()) {
      emitAnalysis(VectorizationReport()
                   << "loop induction variable could not be identified");
      return false;
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (Induction && WidestIndTy != Induction->getType())
    Induction = nullptr;

  return true;
}

// Populate Scalars with the instructions that will remain scalar after
// vectorization: all uniform instructions, GEPs not feeding a legal
// gather/scatter, and inductions whose every in-loop user is scalar.
void LoopVectorizationLegality::collectLoopScalars() {

  // If an instruction is uniform after vectorization, it will remain scalar.
  Scalars.insert(Uniforms.begin(), Uniforms.end());

  // Collect the getelementptr instructions that will not be vectorized. A
  // getelementptr instruction is only vectorized if it is used for a legal
  // gather or scatter operation.
  // First assume every GEP is scalar, then remove from the set any GEP whose
  // user is a legal gather/scatter (those GEPs must be vectorized to feed the
  // gather/scatter addresses).
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        Scalars.insert(GEP);
        continue;
      }
      auto *Ptr = getPointerOperand(&I);
      if (!Ptr)
        continue;
      auto *GEP = getGEPInstruction(Ptr);
      if (GEP && isLegalGatherOrScatter(&I))
        Scalars.erase(GEP);
    }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Scalars.insert(Ind);
    Scalars.insert(IndUpdate);
  }
}

void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();
  // Start with the conditional branch: the latch compare is evaluated
  // identically by every lane, so it seeds the uniform set.
  if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {
    Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));
    Worklist.insert(Cmp);
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Add all consecutive pointer values; these values will be uniform after
  // vectorization (and subsequent cleanup). Although non-consecutive, we also
  // add the pointer operands of interleaved accesses since they are treated
  // like consecutive pointers during vectorization.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      Instruction *Ptr = nullptr;
      if (I.getType()->isPointerTy() && isConsecutivePtr(&I))
        Ptr = &I;
      else if (isAccessInterleaved(&I))
        Ptr = cast<Instruction>(getPointerOperand(&I));
      else
        continue;
      Worklist.insert(Ptr);
      DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ptr << "\n");
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be either already inside Worklist, or
  // out of scope. It ensures a uniform instruction will only be used
  // by uniform instructions or out of scope instructions. The loop below is
  // a fixpoint computation over the worklist (Worklist grows while iterating).
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      if (isOutOfScope(OV))
        continue;
      auto *OI = cast<Instruction>(OV);
      // An operand is uniform only if *all* of its in-loop users are already
      // known uniform.
      if (all_of(OI->users(), [&](User *U) -> bool {
            return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
          })) {
        Worklist.insert(OI);
        DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the current loop should be already added into Worklist. This condition
  // cannot be true for phi instructions which is always in a dependence loop.
  // Because any instruction in the dependence cycle always depends on others
  // in the cycle to be added into Worklist first, the result is no ones in
  // the cycle will be added into Worklist in the end.
  // That is why we process PHI separately.
  for (auto &Induction : *getInductionVars()) {
    auto *PN = Induction.first;
    auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
    // Break the phi/update two-node cycle explicitly: each side may count the
    // other as an allowed user.
    if (all_of(PN->users(),
               [&](User *U) -> bool {
                 return U == UpdateV || isOutOfScope(U) ||
                        Worklist.count(cast<Instruction>(U));
               }) &&
        all_of(UpdateV->users(), [&](User *U) -> bool {
          return U == PN || isOutOfScope(U) ||
                 Worklist.count(cast<Instruction>(U));
        })) {
      Worklist.insert(cast<Instruction>(PN));
      Worklist.insert(cast<Instruction>(UpdateV));
      DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n");
      DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n");
    }
  }

  Uniforms.insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &(*GetLAA)(*TheLoop);
  InterleaveInfo.setLAI(LAI);
  // Forward any report produced by LoopAccessAnalysis to our diagnostics.
  auto &OptionalReport = LAI->getReport();
  if (OptionalReport)
    emitAnalysis(VectorizationReport(*OptionalReport));
  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasStoreToLoopInvariantAddress()) {
    emitAnalysis(
        VectorizationReport()
        << "write to a loop invariant address could not be vectorized");
    DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
    return false;
  }

  // Record the runtime checks and SCEV predicates LAA requires so the
  // versioning/cost machinery can account for them.
  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
  PSE.addPredicate(LAI->getPSE().getUnionPredicate());

  return true;
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  // Inductions is keyed by non-const PHINode*, hence the const_cast before
  // the dyn_cast; V itself is never modified.
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
  return FirstOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  // Delegates to LoopAccessInfo: a block needs predication if it does not
  // dominate the latch (i.e., it is conditionally executed).
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  for (Instruction &I : *BB) {
    // Check that we don't have a constant expression that can trap as operand.
    for (Value *Operand : I.operands()) {
      if (auto *C = dyn_cast<Constant>(Operand))
        if (C->canTrap())
          return false;
    }
    // We might be able to hoist the load.
    if (I.mayReadFromMemory()) {
      auto *LI = dyn_cast<LoadInst>(&I);
      if (!LI)
        return false;
      if (!SafePtrs.count(LI->getPointerOperand())) {
        // Prefer turning the load into a masked load/gather if the target
        // supports it.
        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
            isLegalMaskedGather(LI->getType())) {
          MaskedOp.insert(LI);
          continue;
        }
        // !llvm.mem.parallel_loop_access implies if-conversion safety.
        if (IsAnnotatedParallel)
          continue;
        return false;
      }
    }

    if (I.mayWriteToMemory()) {
      auto *SI = dyn_cast<StoreInst>(&I);
      // We only support predication of stores in basic blocks with one
      // predecessor.
      if (!SI)
        return false;

      // Build a masked store if it is legal for the target.
      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
                             SI->getPointerOperand()) ||
          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
        MaskedOp.insert(SI);
        continue;
      }

      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

      // NumPredStores is a running count across calls; predicated (scalarized)
      // stores are capped by NumberOfStoresToPredicate.
      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
          !isSinglePredecessor)
        return false;
    }
    if (I.mayThrow())
      return false;

    // The instructions below can trap.
    switch (I.getOpcode()) {
    default:
      continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

void InterleavedAccessInfo::collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const ValueToValueMap &Strides) {

  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();

  // Since it's desired that the load/store instructions be maintained in
  // "program order" for the interleaved access analysis, we have to visit the
  // blocks in the loop in reverse postorder (i.e., in a topological order).
  // Such an ordering will ensure that any load/store that may be executed
  // before a second load/store will precede the second load/store in
  // AccessStrideInfo.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    for (auto &I : *BB) {
      auto *LI = dyn_cast<LoadInst>(&I);
      auto *SI = dyn_cast<StoreInst>(&I);
      if (!LI && !SI)
        continue;

      Value *Ptr = getPointerOperand(&I);
      int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);

      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
      // NOTE(review): PtrTy is dereferenced below without a null check; this
      // relies on the pointer operand of a load/store always having pointer
      // type, so the dyn_cast cannot fail here.
      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
      uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());

      // An alignment of 0 means target ABI alignment.
      unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
      if (!Align)
        Align = DL.getABITypeAlignment(PtrTy->getElementType());

      AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
    }
}

// Analyze interleaved accesses and collect them into interleaved load and
// store groups.
//
// When generating code for an interleaved load group, we effectively hoist all
// loads in the group to the location of the first load in program order. When
// generating code for an interleaved store group, we sink all stores to the
// location of the last store. This code motion can change the order of load
// and store instructions and may break dependences.
//
// The code generation strategy mentioned above ensures that we won't violate
// any write-after-read (WAR) dependences.
//
// E.g., for the WAR dependence:  a = A[i];  // (1)
//                                A[i] = b;  // (2)
//
// The store group of (2) is always inserted at or below (2), and the load
// group of (1) is always inserted at or above (1). Thus, the instructions will
// never be reordered. All other dependences are checked to ensure the
// correctness of the instruction reordering.
//
// The algorithm visits all memory accesses in the loop in bottom-up program
// order. Program order is established by traversing the blocks in the loop in
// reverse postorder when collecting the accesses.
//
// We visit the memory accesses in bottom-up order because it can simplify the
// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
// E.g., for the WAW dependence:  A[i] = a;      // (1)
//                                A[i] = b;      // (2)
//                                A[i + 1] = c;  // (3)
//
// We will first create a store group with (3) and (2). (1) can't be added to
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
    const ValueToValueMap &Strides) {
  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");

  // Holds all accesses with a constant stride.
  MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
  collectConstStrideAccesses(AccessStrideInfo, Strides);

  if (AccessStrideInfo.empty())
    return;

  // Collect the dependences in the loop.
  collectDependences();

  // Holds all interleaved store groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
  // Holds all interleaved load groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> LoadGroups;

  // Search in bottom-up program order for pairs of accesses (A and B) that can
  // form interleaved load or store groups. In the algorithm below, access A
  // precedes access B in program order. We initialize a group for B in the
  // outer loop of the algorithm, and then in the inner loop, we attempt to
  // insert each A into B's group if:
  //
  // 1. A and B have the same stride,
  // 2. A and B have the same memory object size, and
  // 3. A belongs in B's group according to its distance from B.
  //
  // Special care is taken to ensure group formation will not break any
  // dependences.
  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
       BI != E; ++BI) {
    Instruction *B = BI->first;
    StrideDescriptor DesB = BI->second;

    // Initialize a group for B if it has an allowable stride. Even if we don't
    // create a group for B, we continue with the bottom-up algorithm to ensure
    // we don't break any of B's dependences.
    InterleaveGroup *Group = nullptr;
    if (isStrided(DesB.Stride)) {
      Group = getInterleaveGroup(B);
      if (!Group) {
        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
        Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
      }
      if (B->mayWriteToMemory())
        StoreGroups.insert(Group);
      else
        LoadGroups.insert(Group);
    }

    for (auto AI = std::next(BI); AI != E; ++AI) {
      Instruction *A = AI->first;
      StrideDescriptor DesA = AI->second;

      // Our code motion strategy implies that we can't have dependences
      // between accesses in an interleaved group and other accesses located
      // between the first and last member of the group. Note that this also
      // means that a group can't have more than one member at a given offset.
      // The accesses in a group can have dependences with other accesses, but
      // we must ensure we don't extend the boundaries of the group such that
      // we encompass those dependent accesses.
      //
      // For example, assume we have the sequence of accesses shown below in a
      // stride-2 loop:
      //
      //  (1, 2) is a group | A[i]   = a;  // (1)
      //                    | A[i-1] = b;  // (2) |
      //                      A[i-3] = c;  // (3)
      //                      A[i]   = d;  // (4) | (2, 4) is not a group
      //
      // Because accesses (2) and (3) are dependent, we can group (2) with (1)
      // but not with (4). If we did, the dependent access (3) would be within
      // the boundaries of the (2, 4) group.
      if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {

        // If a dependence exists and A is already in a group, we know that A
        // must be a store since A precedes B and WAR dependences are allowed.
        // Thus, A would be sunk below B. We release A's group to prevent this
        // illegal code motion. A will then be free to form another group with
        // instructions that precede it.
        if (isInterleaved(A)) {
          InterleaveGroup *StoreGroup = getInterleaveGroup(A);
          StoreGroups.remove(StoreGroup);
          releaseGroup(StoreGroup);
        }

        // If a dependence exists and A is not already in a group (or it was
        // and we just released it), B might be hoisted above A (if B is a
        // load) or another store might be sunk below A (if B is a store). In
        // either case, we can't add additional instructions to B's group. B
        // will only form a group with instructions that it precedes.
        break;
      }

      // At this point, we've checked for illegal code motion. If either A or B
      // isn't strided, there's nothing left to do. (This also guarantees Group
      // is non-null below, since it is created whenever B is strided.)
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
        continue;

      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
        continue;

      // Check rules 1 and 2. Ignore A if its stride or size is different from
      // that of B.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
        continue;

      // Calculate the distance from A to B.
      const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
          PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
      if (!DistToB)
        continue;
      int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

      // Check rule 3. Ignore A if its distance to B is not a multiple of the
      // size.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))
        continue;

      // Ignore A if either A or B is in a predicated block. Although we
      // currently prevent group formation for predicated accesses, we may be
      // able to relax this limitation in the future once we handle more
      // complicated blocks.
      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
        continue;

      // The index of A is the index of B plus A's distance to B in multiples
      // of the size.
      int IndexA =
          Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

      // Try to insert A into B's group.
      if (Group->insertMember(A, IndexA, DesA.Align)) {
        DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
                     << "    into the interleave group with" << *B << '\n');
        InterleaveGroupMap[A] = Group;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())
          Group->setInsertPos(A);
      }
    } // Iteration over A accesses.
  }   // Iteration over B accesses.

  // Remove interleaved store groups with gaps: a partial store group would
  // require masking or would clobber the gap lanes.
  for (InterleaveGroup *Group : StoreGroups)
    if (Group->getNumMembers() != Group->getFactor())
      releaseGroup(Group);

  // If there is a non-reversed interleaved load group with gaps, we will need
  // to execute at least one scalar epilogue iteration. This will ensure that
  // we don't speculatively access memory out-of-bounds. Note that we only need
  // to look for a member at index factor - 1, since every group must have a
  // member at index zero.
  for (InterleaveGroup *Group : LoadGroups)
    if (!Group->getMember(Group->getFactor() - 1)) {
      if (Group->isReverse()) {
        releaseGroup(Group);
      } else {
        DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
        RequiresScalarEpilogue = true;
      }
    }
}

LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorize.
  VectorizationFactor Factor = {1U, 0U};
  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
    emitAnalysis(
        VectorizationReport()
        << "runtime pointer checks needed. Enable vectorization of this "
           "loop with '#pragma clang loop vectorize(enable)' when "
           "compiling with -Os/-Oz");
    DEBUG(dbgs()
          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
    return Factor;
  }

  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
    emitAnalysis(
        VectorizationReport()
        << "store that is conditionally executed prevents vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return Factor;
  }

  // Find the trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  unsigned MaxSafeDepDist = -1U;

  // Get the maximum safe dependence distance in bits computed by LAA.
  // If the loop contains any interleaved accesses, we divide the dependence
  // distance by the maximum interleave factor of all interleaved groups. Note
  // that although the division ensures correctness, this is a fairly
  // conservative computation because the maximum distance computed by LAA may
  // not involve any of the interleaved accesses.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist =
        Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();

  WidestRegister =
      ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
  unsigned MaxVectorSize = WidestRegister / WidestType;

  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
               << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
               << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
                                " into one vector!");

  unsigned VF = MaxVectorSize;
  if (MaximizeBandwidth && !OptForSize) {
    // Collect all viable vectorization factors.
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        VF = VFs[i];
        break;
      }
    }
  }

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to vectorize.
    if (TC < 2) {
      emitAnalysis(
          VectorizationReport()
          << "unable to calculate the loop count due to complex control flow");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;
    else {
      // If the trip count that we found modulo the vectorization factor is not
      // zero then we require a tail.
      emitAnalysis(VectorizationReport()
                   << "cannot optimize for size and vectorize at the "
                      "same time. Enable vectorization of this loop "
                      "with '#pragma clang loop vectorize(enable)' "
                      "when compiling with -Os/-Oz");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }
  }

  // A user-provided width (from hints) bypasses the cost model entirely; the
  // -Os/-Oz legality checks above still apply.
  int UserVF = Hints->getWidth();
  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  float Cost = expectedCost(1).first;
#ifndef NDEBUG
  const float ScalarCost = Cost;
#endif /* NDEBUG */
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore scalar width, because the user explicitly wants vectorization.
  if (ForceVectorization && VF > 1) {
    Width = 2;
    Cost = expectedCost(Width).first / (float)Width;
  }

  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i
                 << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}

std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  // MinWidth starts at UINT_MAX (sentinel) and shrinks; MaxWidth starts at 8
  // bits so the result never reports a type narrower than a byte.
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : *BB) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // consecutive. However, we do want to take consecutive stores/loads of
      // pointer vectors into account.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
                                                           unsigned VF,
                                                           unsigned LoopCost) {

  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  // When we optimize for size, we don't interleave.
  if (OptForSize)
    return 1;

  // We used the distance for the interleave count: a finite max safe
  // dependence distance means interleaving could violate it.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
5632 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5633 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5634 return 1; 5635 5636 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5637 DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5638 << " registers\n"); 5639 5640 if (VF == 1) { 5641 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5642 TargetNumRegisters = ForceTargetNumScalarRegs; 5643 } else { 5644 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5645 TargetNumRegisters = ForceTargetNumVectorRegs; 5646 } 5647 5648 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5649 // We divide by these constants so assume that we have at least one 5650 // instruction that uses at least one register. 5651 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5652 R.NumInstructions = std::max(R.NumInstructions, 1U); 5653 5654 // We calculate the interleave count using the following formula. 5655 // Subtract the number of loop invariants from the number of available 5656 // registers. These registers are used by all of the interleaved instances. 5657 // Next, divide the remaining registers by the number of registers that is 5658 // required by the loop, in order to estimate how many parallel instances 5659 // fit without causing spills. All of this is rounded down if necessary to be 5660 // a power of two. We want power of two interleave count to simplify any 5661 // addressing operations or alignment considerations. 5662 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5663 R.MaxLocalUsers); 5664 5665 // Don't count the induction variable as interleaved. 5666 if (EnableIndVarRegisterHeur) 5667 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5668 std::max(1U, (R.MaxLocalUsers - 1))); 5669 5670 // Clamp the interleave ranges to reasonable counts. 5671 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5672 5673 // Check if the user has overridden the max. 
5674 if (VF == 1) { 5675 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5676 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5677 } else { 5678 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5679 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5680 } 5681 5682 // If we did not calculate the cost for VF (because the user selected the VF) 5683 // then we calculate the cost of VF here. 5684 if (LoopCost == 0) 5685 LoopCost = expectedCost(VF).first; 5686 5687 // Clamp the calculated IC to be between the 1 and the max interleave count 5688 // that the target allows. 5689 if (IC > MaxInterleaveCount) 5690 IC = MaxInterleaveCount; 5691 else if (IC < 1) 5692 IC = 1; 5693 5694 // Interleave if we vectorized this loop and there is a reduction that could 5695 // benefit from interleaving. 5696 if (VF > 1 && Legal->getReductionVars()->size()) { 5697 DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5698 return IC; 5699 } 5700 5701 // Note that if we've already vectorized the loop we will have done the 5702 // runtime check and so interleaving won't require further checks. 5703 bool InterleavingRequiresRuntimePointerCheck = 5704 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5705 5706 // We want to interleave small loops in order to reduce the loop overhead and 5707 // potentially expose ILP opportunities. 5708 DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5709 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5710 // We assume that the cost overhead is 1 and we use the cost model 5711 // to estimate the cost of the loop and interleave until the cost of the 5712 // loop overhead is about 5% of the cost of the loop. 5713 unsigned SmallIC = 5714 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5715 5716 // Interleave until store/load ports (estimated by max interleave count) are 5717 // saturated. 
    // Spread the candidate interleave count across the loop's memory
    // operations to estimate when store/load ports saturate.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = (Legal->getReductionVars()->size() > 0);
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;
  RU.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction *, unsigned> IntervalMap;

  // Maps an instruction's RPO scheduling index back to the instruction.
  DenseMap<unsigned, Instruction *> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that have in-loop users (i.e. that start
  // a live interval).
  SmallSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  unsigned Index = 0;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    RU.NumInstructions += BB->size();
    for (Instruction &I : *BB) {
      IdxToInstr[Index++] = &I;

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction *, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register, clamped by the maximum safe
  // dependence distance (in bits) when one is known.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    // Token values occupy no vector registers.
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        // For the scalar case each open interval takes one register.
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }

      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.count(Inst))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                 << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(I))
        continue;

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
                   << VF << " For instruction: " << I << '\n');
    }

    // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar then some of the blocks are avoided due to CF.
    // When the code is vectorized we execute all code paths.
    if (VF == 1 && Legal->blockNeedsPredication(BB))
      BlockCost.first /= 2;

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
///
/// We look for a GEP which has one index that is an induction variable and all
/// other indices are loop invariant. If the stride of this access is also
/// within a small bound we decide that this address computation can likely be
/// merged into the addressing mode.
/// In all other cases, we identify the address computation as complex.
static bool isLikelyComplexAddressComputation(Value *Ptr,
                                              LoopVectorizationLegality *Legal,
                                              ScalarEvolution *SE,
                                              const Loop *TheLoop) {
  // Anything that is not a GEP is treated as complex.
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return true;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return true;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
  // can likely be merged into the address computation.
  unsigned MaxMergeDistance = 64;

  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
  if (!AddRec)
    return true;

  // Check the step is constant.
  const SCEV *Step = AddRec->getStepRecurrence(*SE);
  // Calculate the pointer stride and check if it is consecutive.
  const auto *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return true;

  const APInt &APStepVal = C->getAPInt();

  // Huge step value - give up.
  if (APStepVal.getBitWidth() > 64)
    return true;

  int64_t StepVal = APStepVal.getSExtValue();

  // NOTE(review): StepVal is signed while MaxMergeDistance is unsigned; the
  // comparison is performed in int64_t, so a large *negative* stride (e.g. a
  // reverse access with step -1024) reports "not complex" here. Presumably the
  // intent is to bound the step's magnitude — confirm whether std::abs(StepVal)
  // should be compared instead.
  return StepVal > MaxMergeDistance;
}

/// Returns true if either operand of \p I is a value that the legality
/// analysis recognized as a symbolic stride (see Legal->hasStride).
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  // The second member records whether the vectorized type legalizes to fewer
  // than VF parts, i.e. the instruction is not effectively scalarized.
  bool TypeNotScalarized =
      VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}

unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  // If the result was shrunk by the minimum-bit-width analysis, cost the
  // narrowed type instead of the declared one.
  if (VF > 1 && MinBWs.count(I))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    return TTI.getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorTy);

    // TODO: IF-converted IFs become selects.
    return 0;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueProperties Op1VP =
        TargetTransformInfo::OP_None;
    TargetTransformInfo::OperandValueProperties Op2VP =
        TargetTransformInfo::OP_None;
    Value *Op2 = I->getOperand(1);

    // Check for a splat of a constant or for a non uniform vector of constants.
    if (isa<ConstantInt>(Op2)) {
      ConstantInt *CInt = cast<ConstantInt>(Op2);
      if (CInt && CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
      // A constant vector that splats a single value is treated as a uniform
      // constant, possibly a power of two.
      if (SplatValue) {
        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
        if (CInt && CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      }
    }

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
                                      Op1VP, Op2VP);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    // A loop-invariant condition stays scalar; otherwise it is widened too.
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    // Compare at the narrowed width when operand 0 was shrunk by MinBWs.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    auto It = MinBWs.find(Op0AsInstruction);
    if (VF > 1 && It != MinBWs.end())
      ValTy = IntegerType::get(ValTy->getContext(), It->second);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    StoreInst *SI = dyn_cast<StoreInst>(I);
    LoadInst *LI = dyn_cast<LoadInst>(I);
    Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    unsigned AS =
        SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
    Value *Ptr = getPointerOperand(I);
    // We add the cost of address computation here instead of with the gep
    // instruction because only here we know whether the operation is
    // scalarized.
    if (VF == 1)
      return TTI.getAddressComputationCost(VectorTy) +
             TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (LI && Legal->isUniform(Ptr)) {
      // Scalar load + broadcast
      unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
      Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                  Alignment, AS);
      return Cost +
             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
    }

    // For an interleaved access, calculate the total cost of the whole
    // interleave group.
    if (Legal->isAccessInterleaved(I)) {
      auto Group = Legal->getInterleavedAccessGroup(I);
      assert(Group && "Fail to get an interleaved access group.");

      // Only calculate the cost once at the insert position.
      if (Group->getInsertPos() != I)
        return 0;

      unsigned InterleaveFactor = Group->getFactor();
      Type *WideVecTy =
          VectorType::get(VectorTy->getVectorElementType(),
                          VectorTy->getVectorNumElements() * InterleaveFactor);

      // Holds the indices of existing members in an interleaved load group.
      // An interleaved store group doesn't need this as it doesn't allow gaps.
      SmallVector<unsigned, 4> Indices;
      if (LI) {
        for (unsigned i = 0; i < InterleaveFactor; i++)
          if (Group->getMember(i))
            Indices.push_back(i);
      }

      // Calculate the cost of the whole interleaved group.
      unsigned Cost = TTI.getInterleavedMemoryOpCost(
          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
          Group->getAlignment(), AS);

      // Reversed groups additionally pay one reverse shuffle per member.
      if (Group->isReverse())
        Cost +=
            Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);

      // FIXME: The interleaved load group with a huge gap could be even more
      // expensive than scalar operations. Then we could ignore such group and
      // use scalar operations instead.
      return Cost;
    }

    // Scalarized loads/stores.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool UseGatherOrScatter =
        (ConsecutiveStride == 0) && Legal->isLegalGatherOrScatter(I);

    bool Reverse = ConsecutiveStride < 0;
    const DataLayout &DL = I->getModule()->getDataLayout();
    uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
    uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
    // Scalarize when the access is neither consecutive nor gather/scatter
    // legal, or when padding makes the scalar and vector-element sizes differ.
    if ((!ConsecutiveStride && !UseGatherOrScatter) ||
        ScalarAllocatedSize != VectorElementSize) {
      bool IsComplexComputation =
          isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
      unsigned Cost = 0;
      // The cost of extracting from the value vector and pointer vector.
      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
      for (unsigned i = 0; i < VF; ++i) {
        // The cost of extracting the pointer operand.
        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
        // In case of STORE, the cost of ExtractElement from the vector.
        // In case of LOAD, the cost of InsertElement into the returned
        // vector.
        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement
                                          : Instruction::InsertElement,
                                       VectorTy, i);
      }

      // The cost of the scalar loads/stores.
      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
      Cost += VF *
              TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                  Alignment, AS);
      return Cost;
    }

    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
    if (UseGatherOrScatter) {
      assert(ConsecutiveStride == 0 &&
             "Gather/Scatter are not used for consecutive stride");
      return Cost +
             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                        Legal->isMaskRequired(I), Alignment);
    }
    // Wide load/stores.
    if (Legal->isMaskRequired(I))
      Cost +=
          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
    else
      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (Reverse)
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variable.
    // The cost of these is the same as the scalar operation.
    if (I->getOpcode() == Instruction::Trunc &&
        Legal->isInductionVariable(I->getOperand(0)))
      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                  I->getOperand(0)->getType());

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
    if (VF > 1 && MinBWs.count(I)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
    // If the call maps to a vector intrinsic, take the cheaper of the library
    // call and the intrinsic lowering.
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
    return CallCost;
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    if (!RetTy->isVoidTy() && VF != 1) {
      unsigned InsCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
      unsigned ExtCost =
          TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);

      // The cost of inserting the results plus extracting each one of the
      // operands.
      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
    }

    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
    return Cost;
  }
  } // end of switch.
6329 } 6330 6331 char LoopVectorize::ID = 0; 6332 static const char lv_name[] = "Loop Vectorization"; 6333 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6334 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6335 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6336 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6337 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6338 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6339 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6340 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6341 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6342 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 6343 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6344 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 6345 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6346 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6347 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6348 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6349 6350 namespace llvm { 6351 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { 6352 return new LoopVectorize(NoUnrolling, AlwaysVectorize); 6353 } 6354 } 6355 6356 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6357 6358 // Check if the pointer operand of a load or store instruction is 6359 // consecutive. 6360 if (auto *Ptr = getPointerOperand(Inst)) 6361 return Legal->isConsecutivePtr(Ptr); 6362 return false; 6363 } 6364 6365 void LoopVectorizationCostModel::collectValuesToIgnore() { 6366 // Ignore ephemeral values. 6367 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6368 6369 // Ignore type-promoting instructions we identified during reduction 6370 // detection. 
  for (auto &Reduction : *Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }

  // Insert values known to be scalar into VecValuesToIgnore.
  for (auto *BB : TheLoop->getBlocks())
    for (auto &I : *BB)
      if (Legal->isScalarAfterVectorization(&I))
        VecValuesToIgnore.insert(&I);
}

// Unroller (VF == 1) scalarization: emit UF scalar clones of \p Instr, one per
// unroll part, wiring each clone's operands from previously computed values.
// When \p IfPredicateStore is set, each cloned store is recorded together with
// its edge-mask compare so it can be predicated afterwards.
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                             bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (Value *SrcOp : Instr->operands()) {
    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType());
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  VectorParts Cond;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:

    // Start an "if (pred) a[i] = ..." block.
    Value *Cmp = nullptr;
    if (IfPredicateStore) {
      // With UF copies the mask may still be a vector; use element 0.
      if (Cond[Part]->getType()->isVectorTy())
        Cond[Part] =
            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
                               ConstantInt::get(Cond[Part]->getType(), 1));
    }

    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op][Part];
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If we just cloned a new assumption, add it the assumption cache.
    if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
      if (II->getIntrinsicID() == Intrinsic::assume)
        AC->registerAssumption(II);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults[Part] = Cloned;

    // End if-block.
6472 if (IfPredicateStore) 6473 PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp)); 6474 } 6475 } 6476 6477 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { 6478 auto *SI = dyn_cast<StoreInst>(Instr); 6479 bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); 6480 6481 return scalarizeInstruction(Instr, IfPredicateStore); 6482 } 6483 6484 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6485 6486 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6487 6488 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6489 Instruction::BinaryOps BinOp) { 6490 // When unrolling and the VF is 1, we only need to add a simple scalar. 6491 Type *Ty = Val->getType(); 6492 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6493 6494 if (Ty->isFloatingPointTy()) { 6495 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6496 6497 // Floating point operations had to be 'fast' to enable the unrolling. 6498 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6499 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6500 } 6501 Constant *C = ConstantInt::get(Ty, StartIdx); 6502 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6503 } 6504 6505 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6506 SmallVector<Metadata *, 4> MDs; 6507 // Reserve first location for self reference to the LoopID metadata node. 6508 MDs.push_back(nullptr); 6509 bool IsUnrollMetadata = false; 6510 MDNode *LoopID = L->getLoopID(); 6511 if (LoopID) { 6512 // First find existing loop unrolling disable metadata. 
6513 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6514 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6515 if (MD) { 6516 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6517 IsUnrollMetadata = 6518 S && S->getString().startswith("llvm.loop.unroll.disable"); 6519 } 6520 MDs.push_back(LoopID->getOperand(i)); 6521 } 6522 } 6523 6524 if (!IsUnrollMetadata) { 6525 // Add runtime unroll disable metadata. 6526 LLVMContext &Context = L->getHeader()->getContext(); 6527 SmallVector<Metadata *, 1> DisableOperands; 6528 DisableOperands.push_back( 6529 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6530 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6531 MDs.push_back(DisableNode); 6532 MDNode *NewLoopID = MDNode::get(Context, MDs); 6533 // Set operand 0 to refer to the loop id itself. 6534 NewLoopID->replaceOperandWith(0, NewLoopID); 6535 L->setLoopID(NewLoopID); 6536 } 6537 } 6538 6539 bool LoopVectorizePass::processLoop(Loop *L) { 6540 assert(L->empty() && "Only process inner loops."); 6541 6542 #ifndef NDEBUG 6543 const std::string DebugLocStr = getDebugLocString(L); 6544 #endif /* NDEBUG */ 6545 6546 DEBUG(dbgs() << "\nLV: Checking a loop in \"" 6547 << L->getHeader()->getParent()->getName() << "\" from " 6548 << DebugLocStr << "\n"); 6549 6550 LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); 6551 6552 DEBUG(dbgs() << "LV: Loop hints:" 6553 << " force=" 6554 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 6555 ? "disabled" 6556 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 6557 ? 
"enabled" 6558 : "?")) 6559 << " width=" << Hints.getWidth() 6560 << " unroll=" << Hints.getInterleave() << "\n"); 6561 6562 // Function containing loop 6563 Function *F = L->getHeader()->getParent(); 6564 6565 // Looking at the diagnostic output is the only way to determine if a loop 6566 // was vectorized (other than looking at the IR or machine code), so it 6567 // is important to generate an optimization remark for each loop. Most of 6568 // these messages are generated by emitOptimizationRemarkAnalysis. Remarks 6569 // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are 6570 // less verbose reporting vectorized loops and unvectorized loops that may 6571 // benefit from vectorization, respectively. 6572 6573 if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { 6574 DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 6575 return false; 6576 } 6577 6578 // Check the loop for a trip count threshold: 6579 // do not vectorize loops with a tiny trip count. 6580 const unsigned TC = SE->getSmallConstantTripCount(L); 6581 if (TC > 0u && TC < TinyTripCountVectorThreshold) { 6582 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 6583 << "This loop is not worth vectorizing."); 6584 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 6585 DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 6586 else { 6587 DEBUG(dbgs() << "\n"); 6588 emitAnalysisDiag(L, Hints, *ORE, VectorizationReport() 6589 << "vectorization is not beneficial " 6590 "and is not explicitly forced"); 6591 return false; 6592 } 6593 } 6594 6595 PredicatedScalarEvolution PSE(*SE, *L); 6596 6597 // Check if it is legal to vectorize the loop. 
6598 LoopVectorizationRequirements Requirements(*ORE); 6599 LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, 6600 &Requirements, &Hints); 6601 if (!LVL.canVectorize()) { 6602 DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 6603 emitMissedWarning(F, L, Hints, ORE); 6604 return false; 6605 } 6606 6607 // Use the cost model. 6608 LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, 6609 &Hints); 6610 CM.collectValuesToIgnore(); 6611 6612 // Check the function attributes to find out if this function should be 6613 // optimized for size. 6614 bool OptForSize = 6615 Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); 6616 6617 // Compute the weighted frequency of this loop being executed and see if it 6618 // is less than 20% of the function entry baseline frequency. Note that we 6619 // always have a canonical loop here because we think we *can* vectorize. 6620 // FIXME: This is hidden behind a flag due to pervasive problems with 6621 // exactly what block frequency models. 6622 if (LoopVectorizeWithBlockFrequency) { 6623 BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); 6624 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 6625 LoopEntryFreq < ColdEntryFreq) 6626 OptForSize = true; 6627 } 6628 6629 // Check the function attributes to see if implicit floats are allowed. 6630 // FIXME: This check doesn't seem possibly correct -- what if the loop is 6631 // an integer loop and the vector instructions selected are purely integer 6632 // vector instructions? 
6633 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 6634 DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" 6635 "attribute is used.\n"); 6636 emitAnalysisDiag( 6637 L, Hints, *ORE, 6638 VectorizationReport() 6639 << "loop not vectorized due to NoImplicitFloat attribute"); 6640 emitMissedWarning(F, L, Hints, ORE); 6641 return false; 6642 } 6643 6644 // Check if the target supports potentially unsafe FP vectorization. 6645 // FIXME: Add a check for the type of safety issue (denormal, signaling) 6646 // for the target we're vectorizing for, to make sure none of the 6647 // additional fp-math flags can help. 6648 if (Hints.isPotentiallyUnsafe() && 6649 TTI->isFPVectorizationPotentiallyUnsafe()) { 6650 DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); 6651 emitAnalysisDiag(L, Hints, *ORE, 6652 VectorizationReport() 6653 << "loop not vectorized due to unsafe FP support."); 6654 emitMissedWarning(F, L, Hints, ORE); 6655 return false; 6656 } 6657 6658 // Select the optimal vectorization factor. 6659 const LoopVectorizationCostModel::VectorizationFactor VF = 6660 CM.selectVectorizationFactor(OptForSize); 6661 6662 // Select the interleave count. 6663 unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); 6664 6665 // Get user interleave count. 6666 unsigned UserIC = Hints.getInterleave(); 6667 6668 // Identify the diagnostic messages that should be produced. 
6669 std::string VecDiagMsg, IntDiagMsg; 6670 bool VectorizeLoop = true, InterleaveLoop = true; 6671 if (Requirements.doesNotMeet(F, L, Hints)) { 6672 DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 6673 "requirements.\n"); 6674 emitMissedWarning(F, L, Hints, ORE); 6675 return false; 6676 } 6677 6678 if (VF.Width == 1) { 6679 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 6680 VecDiagMsg = 6681 "the cost-model indicates that vectorization is not beneficial"; 6682 VectorizeLoop = false; 6683 } 6684 6685 if (IC == 1 && UserIC <= 1) { 6686 // Tell the user interleaving is not beneficial. 6687 DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 6688 IntDiagMsg = 6689 "the cost-model indicates that interleaving is not beneficial"; 6690 InterleaveLoop = false; 6691 if (UserIC == 1) 6692 IntDiagMsg += 6693 " and is explicitly disabled or interleave count is set to 1"; 6694 } else if (IC > 1 && UserIC == 1) { 6695 // Tell the user interleaving is beneficial, but it explicitly disabled. 6696 DEBUG(dbgs() 6697 << "LV: Interleaving is beneficial but is explicitly disabled."); 6698 IntDiagMsg = "the cost-model indicates that interleaving is beneficial " 6699 "but is explicitly disabled or interleave count is set to 1"; 6700 InterleaveLoop = false; 6701 } 6702 6703 // Override IC if user provided an interleave count. 6704 IC = UserIC > 0 ? UserIC : IC; 6705 6706 // Emit diagnostic messages, if any. 6707 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 6708 if (!VectorizeLoop && !InterleaveLoop) { 6709 // Do not vectorize or interleaving the loop. 
6710 ORE->emitOptimizationRemarkAnalysis(VAPassName, L, VecDiagMsg); 6711 ORE->emitOptimizationRemarkAnalysis(LV_NAME, L, IntDiagMsg); 6712 return false; 6713 } else if (!VectorizeLoop && InterleaveLoop) { 6714 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 6715 ORE->emitOptimizationRemarkAnalysis(VAPassName, L, VecDiagMsg); 6716 } else if (VectorizeLoop && !InterleaveLoop) { 6717 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " 6718 << DebugLocStr << '\n'); 6719 ORE->emitOptimizationRemarkAnalysis(LV_NAME, L, IntDiagMsg); 6720 } else if (VectorizeLoop && InterleaveLoop) { 6721 DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " 6722 << DebugLocStr << '\n'); 6723 DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 6724 } 6725 6726 if (!VectorizeLoop) { 6727 assert(IC > 1 && "interleave count should not be 1 or 0"); 6728 // If we decided that it is not legal to vectorize the loop, then 6729 // interleave it. 6730 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC); 6731 Unroller.vectorize(&LVL, CM.MinBWs); 6732 6733 ORE->emitOptimizationRemark(LV_NAME, L, 6734 Twine("interleaved loop (interleaved count: ") + 6735 Twine(IC) + ")"); 6736 } else { 6737 // If we decided that it is *legal* to vectorize the loop, then do it. 6738 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC); 6739 LB.vectorize(&LVL, CM.MinBWs); 6740 ++LoopsVectorized; 6741 6742 // Add metadata to disable runtime unrolling a scalar loop when there are 6743 // no runtime checks about strides and memory. A scalar loop that is 6744 // rarely used is not worth unrolling. 6745 if (!LB.areSafetyChecksAdded()) 6746 AddRuntimeUnrollDisableMetaData(L); 6747 6748 // Report the vectorization decision. 
6749 ORE->emitOptimizationRemark( 6750 LV_NAME, L, Twine("vectorized loop (vectorization width: ") + 6751 Twine(VF.Width) + ", interleaved count: " + Twine(IC) + 6752 ")"); 6753 } 6754 6755 // Mark the loop as already vectorized to avoid vectorizing again. 6756 Hints.setAlreadyVectorized(); 6757 6758 DEBUG(verifyFunction(*L->getHeader()->getParent())); 6759 return true; 6760 } 6761 6762 bool LoopVectorizePass::runImpl( 6763 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 6764 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 6765 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 6766 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 6767 OptimizationRemarkEmitter &ORE_) { 6768 6769 SE = &SE_; 6770 LI = &LI_; 6771 TTI = &TTI_; 6772 DT = &DT_; 6773 BFI = &BFI_; 6774 TLI = TLI_; 6775 AA = &AA_; 6776 AC = &AC_; 6777 GetLAA = &GetLAA_; 6778 DB = &DB_; 6779 ORE = &ORE_; 6780 6781 // Compute some weights outside of the loop over the loops. Compute this 6782 // using a BranchProbability to re-use its scaling math. 6783 const BranchProbability ColdProb(1, 5); // 20% 6784 ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; 6785 6786 // Don't attempt if 6787 // 1. the target claims to have no vector registers, and 6788 // 2. interleaving won't help ILP. 6789 // 6790 // The second condition is necessary because, even if the target has no 6791 // vector registers, loop vectorization may still enable scalar 6792 // interleaving. 6793 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) 6794 return false; 6795 6796 // Build up a worklist of inner-loops to vectorize. This is necessary as 6797 // the act of vectorizing or partially unrolling a loop creates new loops 6798 // and can invalidate iterators across the loops. 
6799 SmallVector<Loop *, 8> Worklist; 6800 6801 for (Loop *L : *LI) 6802 addInnerLoop(*L, Worklist); 6803 6804 LoopsAnalyzed += Worklist.size(); 6805 6806 // Now walk the identified inner loops. 6807 bool Changed = false; 6808 while (!Worklist.empty()) 6809 Changed |= processLoop(Worklist.pop_back_val()); 6810 6811 // Process each loop nest in the function. 6812 return Changed; 6813 6814 } 6815 6816 6817 PreservedAnalyses LoopVectorizePass::run(Function &F, 6818 FunctionAnalysisManager &AM) { 6819 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 6820 auto &LI = AM.getResult<LoopAnalysis>(F); 6821 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 6822 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 6823 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 6824 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); 6825 auto &AA = AM.getResult<AAManager>(F); 6826 auto &AC = AM.getResult<AssumptionAnalysis>(F); 6827 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 6828 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 6829 6830 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 6831 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 6832 [&](Loop &L) -> const LoopAccessInfo & { 6833 return LAM.getResult<LoopAccessAnalysis>(L); 6834 }; 6835 bool Changed = 6836 runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA, ORE); 6837 if (!Changed) 6838 return PreservedAnalyses::all(); 6839 PreservedAnalyses PA; 6840 PA.preserve<LoopAnalysis>(); 6841 PA.preserve<DominatorTreeAnalysis>(); 6842 PA.preserve<BasicAA>(); 6843 PA.preserve<GlobalsAA>(); 6844 return PA; 6845 } 6846