//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
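//
// As a simplified illustration of the widening described above (a conceptual
// sketch, not taken from any particular test), with VF=4 a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten so that each vector iteration processes four elements and the
// induction variable is incremented by four:
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i .. i+3] = b[i .. i+3] + <42, 42, 42, 42>;  // one wide add and store
//
// with any remaining iterations handled by a scalar epilogue loop (or by
// predication when the tail is folded).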
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
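///
/// As a simplified illustration of a gap (a conceptual sketch, not from a
/// particular test): in
///
///   for (i = 0; i < n; ++i) {
///     x += a[3 * i];      // member 0 of a factor-3 group
///     y += a[3 * i + 2];  // member 2; index 3 * i + 1 is never accessed
///   }
///
/// the group has a gap at 3 * i + 1, so a single wide load covering the group
/// must mask away the lanes of the gap (and, inside a predicated block, the
/// lanes of inactive iterations).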
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
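///
/// As a simplified illustration of widening (the values are chosen for the
/// example, not taken from a particular test): with VF = 4 and UF = 2, a
/// scalar
///
///   %add = add i32 %x, %y
///
/// in the original loop body is represented by two wide instructions per
/// vector iteration, one per unroll part:
///
///   %add.part0 = add <4 x i32> %x.part0, %y.part0
///   %add.part1 = add <4 x i32> %x.part1, %y.part1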
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use.
  /// Otherwise, if the use is scalar, we can use the existing scalar
  /// definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi.
  /// This is the second phase of vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions and converts them
  /// to a more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
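///
/// As a rough illustration (the numbers are invented for the example, not
/// produced by any real target): if one scalar iteration of the loop body
/// costs 8 units and the widened body at VF=4 costs 20 units, the per-lane
/// cost drops from 8 to 5 and VF=4 is preferred; if the widened body instead
/// cost 40 units, the model would keep the loop scalar (VF=1).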
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
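  ///
  /// As a simplified illustration (a conceptual sketch, not from a particular
  /// test): in
  ///
  ///   for (i = 0; i < n; ++i) {
  ///     sum += a[2 * i];     // member 0
  ///     sum += a[2 * i + 1]; // member 1
  ///   }
  ///
  /// both loads form one interleave group with factor 2; at VF = 4 the group
  /// can be vectorized as a single <8 x T> wide load followed by two
  /// shufflevectors that de-interleave the even and odd lanes.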
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been ruled
  /// out by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1341 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); 1342 1343 /// The cost computation for Gather/Scatter instruction. 1344 unsigned getGatherScatterCost(Instruction *I, unsigned VF); 1345 1346 /// The cost computation for widening instruction \p I with consecutive 1347 /// memory access. 1348 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); 1349 1350 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1351 /// Load: scalar load + broadcast. 1352 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1353 /// element) 1354 unsigned getUniformMemOpCost(Instruction *I, unsigned VF); 1355 1356 /// Estimate the overhead of scalarizing an instruction. This is a 1357 /// convenience wrapper for the type-based getScalarizationOverhead API. 1358 unsigned getScalarizationOverhead(Instruction *I, unsigned VF); 1359 1360 /// Returns whether the instruction is a load or store and will be emitted 1361 /// as a vector operation. 1362 bool isConsecutiveLoadOrStore(Instruction *I); 1363 1364 /// Returns true if an artificially high cost for emulated masked memrefs 1365 /// should be used. 1366 bool useEmulatedMaskMemRefHack(Instruction *I); 1367 1368 /// Map of scalar integer values to the smallest bitwidth they can be legally 1369 /// represented as. The vector equivalents of these values should be truncated 1370 /// to this type. 1371 MapVector<Instruction *, uint64_t> MinBWs; 1372 1373 /// A type representing the costs for instructions if they were to be 1374 /// scalarized rather than vectorized. The entries are Instruction-Cost 1375 /// pairs. 1376 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1377 1378 /// A set containing all BasicBlocks that are known to be present after 1379 /// vectorization as predicated blocks. 1380 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1381 1382 /// Records whether it is allowed to have the original scalar loop execute at 1383 /// least once. This may be needed as a fallback loop in case runtime 1384 /// aliasing/dependence checks fail, or to handle the tail/remainder 1385 /// iterations when the trip count is unknown or doesn't divide evenly by the VF, 1386 /// or as a peel-loop to handle gaps in interleave-groups. 1387 /// Under optsize and when the trip count is very small we don't allow any 1388 /// iterations to execute in the scalar loop. 1389 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1390 1391 /// All blocks of the loop are to be masked to fold the tail of scalar iterations. 1392 bool FoldTailByMasking = false; 1393 1394 /// A map holding scalar costs for different vectorization factors. The 1395 /// presence of a cost for an instruction in the mapping indicates that the 1396 /// instruction will be scalarized when vectorizing with the associated 1397 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1398 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1399 1400 /// Holds the instructions known to be uniform after vectorization. 1401 /// The data is collected per VF. 1402 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1403 1404 /// Holds the instructions known to be scalar after vectorization. 1405 /// The data is collected per VF. 1406 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1407 1408 /// Holds the instructions (address computations) that are forced to be 1409 /// scalarized.
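/// The data is collected per VF.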
1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1411 1412 /// Returns the expected difference in cost from scalarizing the expression 1413 /// feeding a predicated instruction \p PredInst. The instructions to 1414 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1415 /// non-negative return value implies the expression will be scalarized. 1416 /// Currently, only single-use chains are considered for scalarization. 1417 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1418 unsigned VF); 1419 1420 /// Collect the instructions that are uniform after vectorization. An 1421 /// instruction is uniform if we represent it with a single scalar value in 1422 /// the vectorized loop corresponding to each vector iteration. Examples of 1423 /// uniform instructions include pointer operands of consecutive or 1424 /// interleaved memory accesses. Note that although uniformity implies an 1425 /// instruction will be scalar, the reverse is not true. In general, a 1426 /// scalarized instruction will be represented by VF scalar values in the 1427 /// vectorized loop, each corresponding to an iteration of the original 1428 /// scalar loop. 1429 void collectLoopUniforms(unsigned VF); 1430 1431 /// Collect the instructions that are scalar after vectorization. An 1432 /// instruction is scalar if it is known to be uniform or will be scalarized 1433 /// during vectorization. Non-uniform scalarized instructions will be 1434 /// represented by VF values in the vectorized loop, each corresponding to an 1435 /// iteration of the original scalar loop. 1436 void collectLoopScalars(unsigned VF); 1437 1438 /// Keeps cost model vectorization decision and cost for instructions. 1439 /// Right now it is used for memory instructions only. 1440 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1441 std::pair<InstWidening, unsigned>>; 1442 1443 DecisionList WideningDecisions; 1444 1445 /// Returns true if \p V is expected to be vectorized and it needs to be 1446 /// extracted. 1447 bool needsExtract(Value *V, unsigned VF) const { 1448 Instruction *I = dyn_cast<Instruction>(V); 1449 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1450 return false; 1451 1452 // Assume we can vectorize V (and hence we need extraction) if the 1453 // scalars are not computed yet. This can happen, because it is called 1454 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1455 // the scalars are collected. That should be a safe assumption in most 1456 // cases, because we check if the operands have vectorizable types 1457 // beforehand in LoopVectorizationLegality. 1458 return Scalars.find(VF) == Scalars.end() || 1459 !isScalarAfterVectorization(I, VF); 1460 }; 1461 1462 /// Returns a range containing only operands needing to be extracted. 1463 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1464 unsigned VF) { 1465 return SmallVector<Value *, 4>(make_filter_range( 1466 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1467 } 1468 1469 public: 1470 /// The loop that we evaluate. 1471 Loop *TheLoop; 1472 1473 /// Predicated scalar evolution analysis. 1474 PredicatedScalarEvolution &PSE; 1475 1476 /// Loop Info analysis. 1477 LoopInfo *LI; 1478 1479 /// Vectorization legality. 1480 LoopVectorizationLegality *Legal; 1481 1482 /// Vector target information. 1483 const TargetTransformInfo &TTI; 1484 1485 /// Target Library Info. 
1486 const TargetLibraryInfo *TLI; 1487 1488 /// Demanded bits analysis. 1489 DemandedBits *DB; 1490 1491 /// Assumption cache. 1492 AssumptionCache *AC; 1493 1494 /// Interface to emit optimization remarks. 1495 OptimizationRemarkEmitter *ORE; 1496 1497 const Function *TheFunction; 1498 1499 /// Loop Vectorize Hint. 1500 const LoopVectorizeHints *Hints; 1501 1502 /// The interleave access information contains groups of interleaved accesses 1503 /// with the same stride and close to each other. 1504 InterleavedAccessInfo &InterleaveInfo; 1505 1506 /// Values to ignore in the cost model. 1507 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1508 1509 /// Values to ignore in the cost model when VF > 1. 1510 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1511 }; 1512 1513 } // end namespace llvm 1514 1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1516 // vectorization. The loop needs to be annotated with #pragma omp simd 1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1518 // vector length information is not provided, vectorization is not considered 1519 // explicit. Interleave hints are not allowed either. These limitations will be 1520 // relaxed in the future. 1521 // Please, note that we are currently forced to abuse the pragma 'clang 1522 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1524 // provides *explicit vectorization hints* (LV can bypass legal checks and 1525 // assume that vectorization is legal). However, both hints are implemented 1526 // using the same metadata (llvm.loop.vectorize, processed by 1527 // LoopVectorizeHints). This will be fixed in the future when the native IR 1528 // representation for pragma 'omp simd' is introduced. 1529 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1530 OptimizationRemarkEmitter *ORE) { 1531 assert(!OuterLp->empty() && "This is not an outer loop"); 1532 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1533 1534 // Only outer loops with an explicit vectorization hint are supported. 1535 // Unannotated outer loops are ignored. 1536 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1537 return false; 1538 1539 Function *Fn = OuterLp->getHeader()->getParent(); 1540 if (!Hints.allowVectorization(Fn, OuterLp, 1541 true /*VectorizeOnlyWhenForced*/)) { 1542 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1543 return false; 1544 } 1545 1546 if (Hints.getInterleave() > 1) { 1547 // TODO: Interleave support is future work. 1548 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1549 "outer loops.\n"); 1550 Hints.emitRemarkWithHints(); 1551 return false; 1552 } 1553 1554 return true; 1555 } 1556 1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1558 OptimizationRemarkEmitter *ORE, 1559 SmallVectorImpl<Loop *> &V) { 1560 // Collect inner loops and outer loops without irreducible control flow. For 1561 // now, only collect outer loops that have explicit vectorization hints. If we 1562 // are stress testing the VPlan H-CFG construction, we collect the outermost 1563 // loop of every loop nest. 
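// Note: L.empty() is true exactly when L has no sub-loops, i.e. L is an
// innermost loop.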
1564 if (L.empty() || VPlanBuildStressTest || 1565 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1566 LoopBlocksRPO RPOT(&L); 1567 RPOT.perform(LI); 1568 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1569 V.push_back(&L); 1570 // TODO: Collect inner loops inside marked outer loops in case 1571 // vectorization fails for the outer loop. Do not invoke 1572 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1573 // already known to be reducible. We can use an inherited attribute for 1574 // that. 1575 return; 1576 } 1577 } 1578 for (Loop *InnerL : L) 1579 collectSupportedLoops(*InnerL, LI, ORE, V); 1580 } 1581 1582 namespace { 1583 1584 /// The LoopVectorize Pass. 1585 struct LoopVectorize : public FunctionPass { 1586 /// Pass identification, replacement for typeid 1587 static char ID; 1588 1589 LoopVectorizePass Impl; 1590 1591 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1592 bool VectorizeOnlyWhenForced = false) 1593 : FunctionPass(ID) { 1594 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1595 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1596 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1597 } 1598 1599 bool runOnFunction(Function &F) override { 1600 if (skipFunction(F)) 1601 return false; 1602 1603 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1604 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1605 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1606 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1607 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1608 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1609 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1610 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1611 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1612 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1613 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1614 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1615 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1616 1617 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1618 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1619 1620 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1621 GetLAA, *ORE, PSI); 1622 } 1623 1624 void getAnalysisUsage(AnalysisUsage &AU) const override { 1625 AU.addRequired<AssumptionCacheTracker>(); 1626 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1627 AU.addRequired<DominatorTreeWrapperPass>(); 1628 AU.addRequired<LoopInfoWrapperPass>(); 1629 AU.addRequired<ScalarEvolutionWrapperPass>(); 1630 AU.addRequired<TargetTransformInfoWrapperPass>(); 1631 AU.addRequired<AAResultsWrapperPass>(); 1632 AU.addRequired<LoopAccessLegacyAnalysis>(); 1633 AU.addRequired<DemandedBitsWrapperPass>(); 1634 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1635 AU.addRequired<InjectTLIMappingsLegacy>(); 1636 1637 // We currently do not preserve loopinfo/dominator analyses with outer loop 1638 // vectorization. Until this is addressed, mark these analyses as preserved 1639 // only for non-VPlan-native path. 1640 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1641 if (!EnableVPlanNativePath) { 1642 AU.addPreserved<LoopInfoWrapperPass>(); 1643 AU.addPreserved<DominatorTreeWrapperPass>(); 1644 } 1645 1646 AU.addPreserved<BasicAAWrapperPass>(); 1647 AU.addPreserved<GlobalsAAWrapperPass>(); 1648 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1649 } 1650 }; 1651 1652 } // end anonymous namespace 1653 1654 //===----------------------------------------------------------------------===// 1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1656 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1657 //===----------------------------------------------------------------------===// 1658 1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1660 // We need to place the broadcast of invariant variables outside the loop, 1661 // but only if it's proven safe to do so. Else, broadcast will be inside 1662 // vector loop body. 1663 Instruction *Instr = dyn_cast<Instruction>(V); 1664 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1665 (!Instr || 1666 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1667 // Place the code for broadcasting invariant variables in the new preheader. 1668 IRBuilder<>::InsertPointGuard Guard(Builder); 1669 if (SafeToHoist) 1670 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1671 1672 // Broadcast the scalar into all locations in the vector. 1673 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1674 1675 return Shuf; 1676 } 1677 1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1679 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1680 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1681 "Expected either an induction phi-node or a truncate of it!"); 1682 Value *Start = II.getStartValue(); 1683 1684 // Construct the initial value of the vector IV in the vector loop preheader 1685 auto CurrIP = Builder.saveIP(); 1686 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1687 if (isa<TruncInst>(EntryVal)) { 1688 assert(Start->getType()->isIntegerTy() && 1689 "Truncation requires an integer type"); 1690 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1691 Step = Builder.CreateTrunc(Step, TruncType); 1692 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1693 } 1694 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1695 Value *SteppedStart = 1696 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1697 1698 // We create vector phi nodes for both integer and floating-point induction 1699 // variables. Here, we determine the kind of arithmetic we will perform. 1700 Instruction::BinaryOps AddOp; 1701 Instruction::BinaryOps MulOp; 1702 if (Step->getType()->isIntegerTy()) { 1703 AddOp = Instruction::Add; 1704 MulOp = Instruction::Mul; 1705 } else { 1706 AddOp = II.getInductionOpcode(); 1707 MulOp = Instruction::FMul; 1708 } 1709 1710 // Multiply the vectorization factor by the step using integer or 1711 // floating-point arithmetic as appropriate. 1712 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1713 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1714 1715 // Create a vector splat to use in the induction update. 1716 // 1717 // FIXME: If the step is non-constant, we create the vector splat with 1718 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1719 // handle a constant vector splat. 1720 Value *SplatVF = 1721 isa<Constant>(Mul) 1722 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1723 : Builder.CreateVectorSplat(VF, Mul); 1724 Builder.restoreIP(CurrIP); 1725 1726 // We may need to add the step a number of times, depending on the unroll 1727 // factor. The last of those goes into the PHI. 1728 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1729 &*LoopVectorBody->getFirstInsertionPt()); 1730 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1731 Instruction *LastInduction = VecInd; 1732 for (unsigned Part = 0; Part < UF; ++Part) { 1733 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1734 1735 if (isa<TruncInst>(EntryVal)) 1736 addMetadata(LastInduction, EntryVal); 1737 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1738 1739 LastInduction = cast<Instruction>(addFastMathFlag( 1740 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1741 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1742 } 1743 1744 // Move the last step to the end of the latch block. This ensures consistent 1745 // placement of all induction updates. 1746 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1747 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1748 auto *ICmp = cast<Instruction>(Br->getCondition()); 1749 LastInduction->moveBefore(ICmp); 1750 LastInduction->setName("vec.ind.next"); 1751 1752 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1753 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1754 } 1755 1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1757 return Cost->isScalarAfterVectorization(I, VF) || 1758 Cost->isProfitableToScalarize(I, VF); 1759 } 1760 1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1762 if (shouldScalarizeInstruction(IV)) 1763 return true; 1764 auto isScalarInst = [&](User *U) -> bool { 1765 auto *I = cast<Instruction>(U); 1766 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1767 }; 1768 return llvm::any_of(IV->users(), isScalarInst); 1769 } 1770 1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1772 const InductionDescriptor &ID, const Instruction *EntryVal, 1773 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1774 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1775 "Expected either an induction phi-node or a truncate of it!"); 1776 1777 // This induction variable is not the phi from the original loop but the 1778 // newly-created IV based on the proof that casted Phi is equal to the 1779 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1780 // re-uses the same InductionDescriptor that original IV uses but we don't 1781 // have to do any recording in this case - that is done when original IV is 1782 // processed. 1783 if (isa<TruncInst>(EntryVal)) 1784 return; 1785 1786 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1787 if (Casts.empty()) 1788 return; 1789 // Only the first Cast instruction in the Casts vector is of interest. 1790 // The rest of the Casts (if exist) have no uses outside the 1791 // induction update chain itself. 
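// For example, if the original IV has a cast %c = trunc %iv that was proven
// equal to %iv, the induction value created for %iv is also recorded as the
// value of %c, so users of the cast pick up the vectorized value.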
1792 Instruction *CastInst = *Casts.begin(); 1793 if (Lane < UINT_MAX) 1794 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1795 else 1796 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1797 } 1798 1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1800 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1801 "Primary induction variable must have an integer type"); 1802 1803 auto II = Legal->getInductionVars().find(IV); 1804 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1805 1806 auto ID = II->second; 1807 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1808 1809 // The scalar value to broadcast. This will be derived from the canonical 1810 // induction variable. 1811 Value *ScalarIV = nullptr; 1812 1813 // The value from the original loop to which we are mapping the new induction 1814 // variable. 1815 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1816 1817 // True if we have vectorized the induction variable. 1818 auto VectorizedIV = false; 1819 1820 // Determine if we want a scalar version of the induction variable. This is 1821 // true if the induction variable itself is not widened, or if it has at 1822 // least one user in the loop that is not widened. 1823 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1824 1825 // Generate code for the induction step. Note that induction steps are 1826 // required to be loop-invariant 1827 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1828 "Induction step should be loop invariant"); 1829 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1830 Value *Step = nullptr; 1831 if (PSE.getSE()->isSCEVable(IV->getType())) { 1832 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1833 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1834 LoopVectorPreHeader->getTerminator()); 1835 } else { 1836 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1837 } 1838 1839 // Try to create a new independent vector induction variable. If we can't 1840 // create the phi node, we will splat the scalar induction variable in each 1841 // loop iteration. 1842 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1843 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1844 VectorizedIV = true; 1845 } 1846 1847 // If we haven't yet vectorized the induction variable, or if we will create 1848 // a scalar one, we need to define the scalar induction variable and step 1849 // values. If we were given a truncation type, truncate the canonical 1850 // induction variable and step. Otherwise, derive these values from the 1851 // induction descriptor. 1852 if (!VectorizedIV || NeedsScalarIV) { 1853 ScalarIV = Induction; 1854 if (IV != OldInduction) { 1855 ScalarIV = IV->getType()->isIntegerTy() 1856 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1857 : Builder.CreateCast(Instruction::SIToFP, Induction, 1858 IV->getType()); 1859 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1860 ScalarIV->setName("offset.idx"); 1861 } 1862 if (Trunc) { 1863 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1864 assert(Step->getType()->isIntegerTy() && 1865 "Truncation requires an integer step"); 1866 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1867 Step = Builder.CreateTrunc(Step, TruncType); 1868 } 1869 } 1870 1871 // If we haven't yet vectorized the induction variable, splat the scalar 1872 // induction variable, and build the necessary step vectors. 1873 // TODO: Don't do it unless the vectorized IV is really required. 1874 if (!VectorizedIV) { 1875 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1876 for (unsigned Part = 0; Part < UF; ++Part) { 1877 Value *EntryPart = 1878 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1879 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1880 if (Trunc) 1881 addMetadata(EntryPart, Trunc); 1882 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1883 } 1884 } 1885 1886 // If an induction variable is only used for counting loop iterations or 1887 // calculating addresses, it doesn't need to be widened. Create scalar steps 1888 // that can be used by instructions we will later scalarize. Note that the 1889 // addition of the scalar steps will not increase the number of instructions 1890 // in the loop in the common case prior to InstCombine. We will be trading 1891 // one vector extract for each scalar step. 1892 if (NeedsScalarIV) 1893 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1894 } 1895 1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1897 Instruction::BinaryOps BinOp) { 1898 // Create and check the types. 1899 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1900 int VLen = Val->getType()->getVectorNumElements(); 1901 1902 Type *STy = Val->getType()->getScalarType(); 1903 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1904 "Induction Step must be an integer or FP"); 1905 assert(Step->getType() == STy && "Step has wrong type"); 1906 1907 SmallVector<Constant *, 8> Indices; 1908 1909 if (STy->isIntegerTy()) { 1910 // Create a vector of consecutive numbers from zero to VF. 1911 for (int i = 0; i < VLen; ++i) 1912 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1913 1914 // Add the consecutive indices to the vector value. 1915 Constant *Cv = ConstantVector::get(Indices); 1916 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1917 Step = Builder.CreateVectorSplat(VLen, Step); 1918 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1919 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1920 // which can be found from the original scalar operations. 1921 Step = Builder.CreateMul(Cv, Step); 1922 return Builder.CreateAdd(Val, Step, "induction"); 1923 } 1924 1925 // Floating point induction. 1926 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1927 "Binary Opcode should be specified for FP induction"); 1928 // Create a vector of consecutive numbers from zero to VF. 1929 for (int i = 0; i < VLen; ++i) 1930 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1931 1932 // Add the consecutive indices to the vector value. 
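// For example, a 4-wide fadd induction with StartIdx = 0 and step s yields
// Val + <0.0, 1.0, 2.0, 3.0> * s below.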
1933 Constant *Cv = ConstantVector::get(Indices); 1934 1935 Step = Builder.CreateVectorSplat(VLen, Step); 1936 1937 // Floating point operations had to be 'fast' to enable the induction. 1938 FastMathFlags Flags; 1939 Flags.setFast(); 1940 1941 Value *MulOp = Builder.CreateFMul(Cv, Step); 1942 if (isa<Instruction>(MulOp)) 1943 // Have to check, MulOp may be a constant 1944 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1945 1946 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1947 if (isa<Instruction>(BOp)) 1948 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1949 return BOp; 1950 } 1951 1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1953 Instruction *EntryVal, 1954 const InductionDescriptor &ID) { 1955 // We shouldn't have to build scalar steps if we aren't vectorizing. 1956 assert(VF > 1 && "VF should be greater than one"); 1957 1958 // Get the value type and ensure it and the step have the same integer type. 1959 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1960 assert(ScalarIVTy == Step->getType() && 1961 "Val and Step should have the same type"); 1962 1963 // We build scalar steps for both integer and floating-point induction 1964 // variables. Here, we determine the kind of arithmetic we will perform. 1965 Instruction::BinaryOps AddOp; 1966 Instruction::BinaryOps MulOp; 1967 if (ScalarIVTy->isIntegerTy()) { 1968 AddOp = Instruction::Add; 1969 MulOp = Instruction::Mul; 1970 } else { 1971 AddOp = ID.getInductionOpcode(); 1972 MulOp = Instruction::FMul; 1973 } 1974 1975 // Determine the number of scalars we need to generate for each unroll 1976 // iteration. If EntryVal is uniform, we only need to generate the first 1977 // lane. Otherwise, we generate all VF values. 1978 unsigned Lanes = 1979 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1980 : VF; 1981 // Compute the scalar steps and save the results in VectorLoopValueMap. 1982 for (unsigned Part = 0; Part < UF; ++Part) { 1983 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1984 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1985 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1986 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1987 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1988 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1989 } 1990 } 1991 } 1992 1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1994 assert(V != Induction && "The new induction variable should not be used."); 1995 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1996 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1997 1998 // If we have a stride that is replaced by one, do it here. Defer this for 1999 // the VPlan-native path until we start running Legal checks in that path. 2000 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2001 V = ConstantInt::get(V->getType(), 1); 2002 2003 // If we have a vector mapped to this value, return it. 2004 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2005 return VectorLoopValueMap.getVectorValue(V, Part); 2006 2007 // If the value has not been vectorized, check if it has been scalarized 2008 // instead. If it has been scalarized, and we actually need the value in 2009 // vector form, we will construct the vector values on demand. 
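// For example, a value scalarized into per-lane copies is packed back into a
// vector with insertelement instructions the first time a vector user asks
// for it, and the result is cached in VectorLoopValueMap.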
2010 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2011 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2012 2013 // If we've scalarized a value, that value should be an instruction. 2014 auto *I = cast<Instruction>(V); 2015 2016 // If we aren't vectorizing, we can just copy the scalar map values over to 2017 // the vector map. 2018 if (VF == 1) { 2019 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2020 return ScalarValue; 2021 } 2022 2023 // Get the last scalar instruction we generated for V and Part. If the value 2024 // is known to be uniform after vectorization, this corresponds to lane zero 2025 // of the Part unroll iteration. Otherwise, the last instruction is the one 2026 // we created for the last vector lane of the Part unroll iteration. 2027 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2028 auto *LastInst = cast<Instruction>( 2029 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2030 2031 // Set the insert point after the last scalarized instruction. This ensures 2032 // the insertelement sequence will directly follow the scalar definitions. 2033 auto OldIP = Builder.saveIP(); 2034 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2035 Builder.SetInsertPoint(&*NewIP); 2036 2037 // However, if we are vectorizing, we need to construct the vector values. 2038 // If the value is known to be uniform after vectorization, we can just 2039 // broadcast the scalar value corresponding to lane zero for each unroll 2040 // iteration. Otherwise, we construct the vector values using insertelement 2041 // instructions. Since the resulting vectors are stored in 2042 // VectorLoopValueMap, we will only generate the insertelements once. 2043 Value *VectorValue = nullptr; 2044 if (Cost->isUniformAfterVectorization(I, VF)) { 2045 VectorValue = getBroadcastInstrs(ScalarValue); 2046 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2047 } else { 2048 // Initialize packing with insertelements to start from undef. 2049 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2050 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2051 for (unsigned Lane = 0; Lane < VF; ++Lane) 2052 packScalarIntoVectorValue(V, {Part, Lane}); 2053 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2054 } 2055 Builder.restoreIP(OldIP); 2056 return VectorValue; 2057 } 2058 2059 // If this scalar is unknown, assume that it is a constant or that it is 2060 // loop invariant. Broadcast V and save the value for future uses. 2061 Value *B = getBroadcastInstrs(V); 2062 VectorLoopValueMap.setVectorValue(V, Part, B); 2063 return B; 2064 } 2065 2066 Value * 2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2068 const VPIteration &Instance) { 2069 // If the value is not an instruction contained in the loop, it should 2070 // already be scalar. 2071 if (OrigLoop->isLoopInvariant(V)) 2072 return V; 2073 2074 assert(Instance.Lane > 0 2075 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2076 : true && "Uniform values only have lane zero"); 2077 2078 // If the value from the original loop has not been vectorized, it is 2079 // represented by UF x VF scalar values in the new loop. Return the requested 2080 // scalar value. 2081 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2082 return VectorLoopValueMap.getScalarValue(V, Instance); 2083 2084 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2085 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2086 // vectorization factor is one), there is no need to generate an 2087 // extractelement instruction. 2088 auto *U = getOrCreateVectorValue(V, Instance.Part); 2089 if (!U->getType()->isVectorTy()) { 2090 assert(VF == 1 && "Value not scalarized has non-vector type"); 2091 return U; 2092 } 2093 2094 // Otherwise, the value from the original loop has been vectorized and is 2095 // represented by UF vector values. Extract and return the requested scalar 2096 // value from the appropriate vector lane. 2097 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2098 } 2099 2100 void InnerLoopVectorizer::packScalarIntoVectorValue( 2101 Value *V, const VPIteration &Instance) { 2102 assert(V != Induction && "The new induction variable should not be used."); 2103 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2104 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2105 2106 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2107 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2108 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2109 Builder.getInt32(Instance.Lane)); 2110 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2111 } 2112 2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2114 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2115 SmallVector<Constant *, 8> ShuffleMask; 2116 for (unsigned i = 0; i < VF; ++i) 2117 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2118 2119 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2120 ConstantVector::get(ShuffleMask), 2121 "reverse"); 2122 } 2123 2124 // Return whether we allow using masked interleave-groups (for dealing with 2125 // strided loads/stores that reside in predicated blocks, or for dealing 2126 // with gaps). 2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2128 // If an override option has been passed in for interleaved accesses, use it. 2129 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2130 return EnableMaskedInterleavedMemAccesses; 2131 2132 return TTI.enableMaskedInterleavedAccessVectorization(); 2133 } 2134 2135 // Try to vectorize the interleave group that \p Instr belongs to. 2136 // 2137 // E.g. Translate following interleaved load group (factor = 3): 2138 // for (i = 0; i < N; i+=3) { 2139 // R = Pic[i]; // Member of index 0 2140 // G = Pic[i+1]; // Member of index 1 2141 // B = Pic[i+2]; // Member of index 2 2142 // ... // do something to R, G, B 2143 // } 2144 // To: 2145 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2146 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2147 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2148 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2149 // 2150 // Or translate following interleaved store group (factor = 3): 2151 // for (i = 0; i < N; i+=3) { 2152 // ... 
do something to R, G, B 2153 // Pic[i] = R; // Member of index 0 2154 // Pic[i+1] = G; // Member of index 1 2155 // Pic[i+2] = B; // Member of index 2 2156 // } 2157 // To: 2158 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2159 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2160 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2161 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2162 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2163 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2164 VPTransformState &State, 2165 VPValue *Addr, 2166 VPValue *BlockInMask) { 2167 const InterleaveGroup<Instruction> *Group = 2168 Cost->getInterleavedAccessGroup(Instr); 2169 assert(Group && "Fail to get an interleaved access group."); 2170 2171 // Skip if current instruction is not the insert position. 2172 if (Instr != Group->getInsertPos()) 2173 return; 2174 2175 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2176 2177 // Prepare for the vector type of the interleaved load/store. 2178 Type *ScalarTy = getMemInstValueType(Instr); 2179 unsigned InterleaveFactor = Group->getFactor(); 2180 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2181 2182 // Prepare for the new pointers. 2183 SmallVector<Value *, 2> AddrParts; 2184 unsigned Index = Group->getIndex(Instr); 2185 2186 // TODO: extend the masked interleaved-group support to reversed access. 2187 assert((!BlockInMask || !Group->isReverse()) && 2188 "Reversed masked interleave-group not supported."); 2189 2190 // If the group is reverse, adjust the index to refer to the last vector lane 2191 // instead of the first. We adjust the index from the first vector lane, 2192 // rather than directly getting the pointer for lane VF - 1, because the 2193 // pointer operand of the interleaved access is supposed to be uniform. For 2194 // uniform instructions, we're only required to generate a value for the 2195 // first vector lane in each unroll iteration. 2196 if (Group->isReverse()) 2197 Index += (VF - 1) * Group->getFactor(); 2198 2199 for (unsigned Part = 0; Part < UF; Part++) { 2200 Value *AddrPart = State.get(Addr, {Part, 0}); 2201 setDebugLocFromInst(Builder, AddrPart); 2202 2203 // Notice current instruction could be any index. Need to adjust the address 2204 // to the member of index 0. 2205 // 2206 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2207 // b = A[i]; // Member of index 0 2208 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2209 // 2210 // E.g. A[i+1] = a; // Member of index 1 2211 // A[i] = b; // Member of index 0 2212 // A[i+2] = c; // Member of index 2 (Current instruction) 2213 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2214 2215 bool InBounds = false; 2216 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2217 InBounds = gep->isInBounds(); 2218 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2219 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2220 2221 // Cast to the vector pointer type. 
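// E.g. an i32* pointer becomes a <12 x i32>* for a factor-3 group with VF = 4.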
2222 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2223 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2224 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2225 } 2226 2227 setDebugLocFromInst(Builder, Instr); 2228 Value *UndefVec = UndefValue::get(VecTy); 2229 2230 Value *MaskForGaps = nullptr; 2231 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2232 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2233 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2234 } 2235 2236 // Vectorize the interleaved load group. 2237 if (isa<LoadInst>(Instr)) { 2238 // For each unroll part, create a wide load for the group. 2239 SmallVector<Value *, 2> NewLoads; 2240 for (unsigned Part = 0; Part < UF; Part++) { 2241 Instruction *NewLoad; 2242 if (BlockInMask || MaskForGaps) { 2243 assert(useMaskedInterleavedAccesses(*TTI) && 2244 "masked interleaved groups are not allowed."); 2245 Value *GroupMask = MaskForGaps; 2246 if (BlockInMask) { 2247 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2248 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2249 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2250 Value *ShuffledMask = Builder.CreateShuffleVector( 2251 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2252 GroupMask = MaskForGaps 2253 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2254 MaskForGaps) 2255 : ShuffledMask; 2256 } 2257 NewLoad = 2258 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2259 GroupMask, UndefVec, "wide.masked.vec"); 2260 } 2261 else 2262 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2263 Group->getAlign(), "wide.vec"); 2264 Group->addMetadata(NewLoad); 2265 NewLoads.push_back(NewLoad); 2266 } 2267 2268 // For each member in the group, shuffle out the appropriate data from the 2269 // wide loads. 2270 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2271 Instruction *Member = Group->getMember(I); 2272 2273 // Skip the gaps in the group. 2274 if (!Member) 2275 continue; 2276 2277 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2278 for (unsigned Part = 0; Part < UF; Part++) { 2279 Value *StridedVec = Builder.CreateShuffleVector( 2280 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2281 2282 // If this member has different type, cast the result type. 2283 if (Member->getType() != ScalarTy) { 2284 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2285 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2286 } 2287 2288 if (Group->isReverse()) 2289 StridedVec = reverseVector(StridedVec); 2290 2291 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2292 } 2293 } 2294 return; 2295 } 2296 2297 // The sub vector type for current instruction. 2298 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2299 2300 // Vectorize the interleaved store group. 2301 for (unsigned Part = 0; Part < UF; Part++) { 2302 // Collect the stored vector from each member. 
2303 SmallVector<Value *, 4> StoredVecs; 2304 for (unsigned i = 0; i < InterleaveFactor; i++) { 2305 // Interleaved store group doesn't allow a gap, so each index has a member 2306 Instruction *Member = Group->getMember(i); 2307 assert(Member && "Fail to get a member from an interleaved store group"); 2308 2309 Value *StoredVec = getOrCreateVectorValue( 2310 cast<StoreInst>(Member)->getValueOperand(), Part); 2311 if (Group->isReverse()) 2312 StoredVec = reverseVector(StoredVec); 2313 2314 // If this member has different type, cast it to a unified type. 2315 2316 if (StoredVec->getType() != SubVT) 2317 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2318 2319 StoredVecs.push_back(StoredVec); 2320 } 2321 2322 // Concatenate all vectors into a wide vector. 2323 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2324 2325 // Interleave the elements in the wide vector. 2326 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2327 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2328 "interleaved.vec"); 2329 2330 Instruction *NewStoreInstr; 2331 if (BlockInMask) { 2332 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2333 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2334 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2335 Value *ShuffledMask = Builder.CreateShuffleVector( 2336 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2337 NewStoreInstr = Builder.CreateMaskedStore( 2338 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2339 } 2340 else 2341 NewStoreInstr = 2342 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2343 2344 Group->addMetadata(NewStoreInstr); 2345 } 2346 } 2347 2348 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2349 VPTransformState &State, 2350 VPValue *Addr, 2351 VPValue *StoredValue, 2352 VPValue *BlockInMask) { 2353 // Attempt to issue a wide load. 2354 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2355 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2356 2357 assert((LI || SI) && "Invalid Load/Store instruction"); 2358 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2359 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2360 2361 LoopVectorizationCostModel::InstWidening Decision = 2362 Cost->getWideningDecision(Instr, VF); 2363 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2364 "CM decision should be taken at this point"); 2365 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2366 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2367 2368 Type *ScalarDataTy = getMemInstValueType(Instr); 2369 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2370 // An alignment of 0 means target abi alignment. We need to use the scalar's 2371 // target abi alignment in such a case. 2372 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2373 const Align Alignment = 2374 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2375 2376 // Determine if the pointer operand of the access is either consecutive or 2377 // reverse consecutive. 2378 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2379 bool ConsecutiveStride = 2380 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2381 bool CreateGatherScatter = 2382 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2383 2384 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2385 // gather/scatter. 
Otherwise Decision should have been to Scalarize. 2386 assert((ConsecutiveStride || CreateGatherScatter) && 2387 "The instruction should be scalarized"); 2388 (void)ConsecutiveStride; 2389 2390 VectorParts BlockInMaskParts(UF); 2391 bool isMaskRequired = BlockInMask; 2392 if (isMaskRequired) 2393 for (unsigned Part = 0; Part < UF; ++Part) 2394 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2395 2396 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2397 // Calculate the pointer for the specific unroll-part. 2398 GetElementPtrInst *PartPtr = nullptr; 2399 2400 bool InBounds = false; 2401 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2402 InBounds = gep->isInBounds(); 2403 2404 if (Reverse) { 2405 // If the address is consecutive but reversed, then the 2406 // wide store needs to start at the last vector element. 2407 PartPtr = cast<GetElementPtrInst>( 2408 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2409 PartPtr->setIsInBounds(InBounds); 2410 PartPtr = cast<GetElementPtrInst>( 2411 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2412 PartPtr->setIsInBounds(InBounds); 2413 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2414 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2415 } else { 2416 PartPtr = cast<GetElementPtrInst>( 2417 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2418 PartPtr->setIsInBounds(InBounds); 2419 } 2420 2421 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2422 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2423 }; 2424 2425 // Handle Stores: 2426 if (SI) { 2427 setDebugLocFromInst(Builder, SI); 2428 2429 for (unsigned Part = 0; Part < UF; ++Part) { 2430 Instruction *NewSI = nullptr; 2431 Value *StoredVal = State.get(StoredValue, Part); 2432 if (CreateGatherScatter) { 2433 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2434 Value *VectorGep = State.get(Addr, Part); 2435 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2436 MaskPart); 2437 } else { 2438 if (Reverse) { 2439 // If we store to reverse consecutive memory locations, then we need 2440 // to reverse the order of elements in the stored value. 2441 StoredVal = reverseVector(StoredVal); 2442 // We don't want to update the value in the map as it might be used in 2443 // another expression. So don't call resetVectorValue(StoredVal). 2444 } 2445 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2446 if (isMaskRequired) 2447 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2448 BlockInMaskParts[Part]); 2449 else 2450 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2451 } 2452 addMetadata(NewSI, SI); 2453 } 2454 return; 2455 } 2456 2457 // Handle loads. 2458 assert(LI && "Must have a load instruction"); 2459 setDebugLocFromInst(Builder, LI); 2460 for (unsigned Part = 0; Part < UF; ++Part) { 2461 Value *NewLI; 2462 if (CreateGatherScatter) { 2463 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2464 Value *VectorGep = State.get(Addr, Part); 2465 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2466 nullptr, "wide.masked.gather"); 2467 addMetadata(NewLI, LI); 2468 } else { 2469 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2470 if (isMaskRequired) 2471 NewLI = Builder.CreateMaskedLoad( 2472 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2473 "wide.masked.load"); 2474 else 2475 NewLI = 2476 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2477 2478 // Add metadata to the load, but set the mapped vector value to the reverse shuffle. 2479 addMetadata(NewLI, LI); 2480 if (Reverse) 2481 NewLI = reverseVector(NewLI); 2482 } 2483 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2484 } 2485 } 2486 2487 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2488 const VPIteration &Instance, 2489 bool IfPredicateInstr) { 2490 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2491 2492 setDebugLocFromInst(Builder, Instr); 2493 2494 // Does this instruction return a value? 2495 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2496 2497 Instruction *Cloned = Instr->clone(); 2498 if (!IsVoidRetTy) 2499 Cloned->setName(Instr->getName() + ".cloned"); 2500 2501 // Replace the operands of the cloned instruction with their scalar 2502 // equivalents in the new loop. 2503 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2504 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2505 Cloned->setOperand(op, NewOp); 2506 } 2507 addNewMetadata(Cloned, Instr); 2508 2509 // Place the cloned scalar in the new loop. 2510 Builder.Insert(Cloned); 2511 2512 // Add the cloned scalar to the scalar map entry. 2513 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2514 2515 // If we just cloned a new assumption, add it to the assumption cache. 2516 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2517 if (II->getIntrinsicID() == Intrinsic::assume) 2518 AC->registerAssumption(II); 2519 2520 // End if-block. 2521 if (IfPredicateInstr) 2522 PredicatedInstructions.push_back(Cloned); 2523 } 2524 2525 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2526 Value *End, Value *Step, 2527 Instruction *DL) { 2528 BasicBlock *Header = L->getHeader(); 2529 BasicBlock *Latch = L->getLoopLatch(); 2530 // As we're just creating this loop, it's possible no latch exists 2531 // yet. If so, use the header as this will be a single block loop. 2532 if (!Latch) 2533 Latch = Header; 2534 2535 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2536 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2537 setDebugLocFromInst(Builder, OldInst); 2538 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2539 2540 Builder.SetInsertPoint(Latch->getTerminator()); 2541 setDebugLocFromInst(Builder, OldInst); 2542 2543 // Create i+1 and fill the PHINode. 2544 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2545 Induction->addIncoming(Start, L->getLoopPreheader()); 2546 Induction->addIncoming(Next, Latch); 2547 // Create the compare. 2548 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2549 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2550 2551 // Now we have two terminators. Remove the old one from the block.
2552 Latch->getTerminator()->eraseFromParent(); 2553 2554 return Induction; 2555 } 2556 2557 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2558 if (TripCount) 2559 return TripCount; 2560 2561 assert(L && "Create Trip Count for null loop."); 2562 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2563 // Find the loop boundaries. 2564 ScalarEvolution *SE = PSE.getSE(); 2565 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2566 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2567 "Invalid loop count"); 2568 2569 Type *IdxTy = Legal->getWidestInductionType(); 2570 assert(IdxTy && "No type for induction"); 2571 2572 // The exit count might have the type of i64 while the phi is i32. This can 2573 // happen if we have an induction variable that is sign extended before the 2574 // compare. The only way that we get a backedge taken count is that the 2575 // induction variable was signed and as such will not overflow. In such a case 2576 // truncation is legal. 2577 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2578 IdxTy->getPrimitiveSizeInBits()) 2579 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2580 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2581 2582 // Get the total trip count from the count by adding 1. 2583 const SCEV *ExitCount = SE->getAddExpr( 2584 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2585 2586 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2587 2588 // Expand the trip count and place the new instructions in the preheader. 2589 // Notice that the pre-header does not change, only the loop body. 2590 SCEVExpander Exp(*SE, DL, "induction"); 2591 2592 // Count holds the overall loop count (N). 2593 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2594 L->getLoopPreheader()->getTerminator()); 2595 2596 if (TripCount->getType()->isPointerTy()) 2597 TripCount = 2598 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2599 L->getLoopPreheader()->getTerminator()); 2600 2601 return TripCount; 2602 } 2603 2604 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2605 if (VectorTripCount) 2606 return VectorTripCount; 2607 2608 Value *TC = getOrCreateTripCount(L); 2609 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2610 2611 Type *Ty = TC->getType(); 2612 Constant *Step = ConstantInt::get(Ty, VF * UF); 2613 2614 // If the tail is to be folded by masking, round the number of iterations N 2615 // up to a multiple of Step instead of rounding down. This is done by first 2616 // adding Step-1 and then rounding down. Note that it's ok if this addition 2617 // overflows: the vector induction variable will eventually wrap to zero given 2618 // that it starts at zero and its Step is a power of two; the loop will then 2619 // exit, with the last early-exit vector comparison also producing all-true. 2620 if (Cost->foldTailByMasking()) { 2621 assert(isPowerOf2_32(VF * UF) && 2622 "VF*UF must be a power of 2 when folding tail by masking"); 2623 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2624 } 2625 2626 // Now we need to generate the expression for the part of the loop that the 2627 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2628 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2629 // is equal to the vectorization factor (number of SIMD elements) times the 2630 // unroll factor (number of SIMD instructions). 2631 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2632 2633 // If there is a non-reversed interleaved group that may speculatively access 2634 // memory out-of-bounds, we need to ensure that there will be at least one 2635 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2636 // the trip count, we set the remainder to be equal to the step. If the step 2637 // does not evenly divide the trip count, no adjustment is necessary since 2638 // there will already be scalar iterations. Note that the minimum iterations 2639 // check ensures that N >= Step. 2640 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2641 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2642 R = Builder.CreateSelect(IsZero, Step, R); 2643 } 2644 2645 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2646 2647 return VectorTripCount; 2648 } 2649 2650 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2651 const DataLayout &DL) { 2652 // Verify that V is a vector type with same number of elements as DstVTy. 2653 unsigned VF = DstVTy->getNumElements(); 2654 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2655 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2656 Type *SrcElemTy = SrcVecTy->getElementType(); 2657 Type *DstElemTy = DstVTy->getElementType(); 2658 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2659 "Vector elements must have same size"); 2660 2661 // Do a direct cast if element types are castable. 2662 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2663 return Builder.CreateBitOrPointerCast(V, DstVTy); 2664 } 2665 // V cannot be directly casted to desired vector type. 2666 // May happen when V is a floating point vector but DstVTy is a vector of 2667 // pointers or vice-versa. Handle this using a two-step bitcast using an 2668 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2669 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2670 "Only one type should be a pointer type"); 2671 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2672 "Only one type should be a floating point type"); 2673 Type *IntTy = 2674 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2675 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2676 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2677 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2678 } 2679 2680 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2681 BasicBlock *Bypass) { 2682 Value *Count = getOrCreateTripCount(L); 2683 // Reuse existing vector loop preheader for TC checks. 2684 // Note that new preheader block is generated for vector loop. 2685 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2686 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2687 2688 // Generate code to check if the loop's trip count is less than VF * UF, or 2689 // equal to it in case a scalar epilogue is required; this implies that the 2690 // vector trip count is zero. This check also covers the case where adding one 2691 // to the backedge-taken count overflowed leading to an incorrect trip count 2692 // of zero. In this case we will also jump to the scalar loop. 2693 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2694 : ICmpInst::ICMP_ULT; 2695 2696 // If tail is to be folded, vector loop takes care of all iterations. 2697 Value *CheckMinIters = Builder.getFalse(); 2698 if (!Cost->foldTailByMasking()) 2699 CheckMinIters = Builder.CreateICmp( 2700 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2701 "min.iters.check"); 2702 2703 // Create new preheader for vector loop. 2704 LoopVectorPreHeader = 2705 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2706 "vector.ph"); 2707 2708 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2709 DT->getNode(Bypass)->getIDom()) && 2710 "TC check is expected to dominate Bypass"); 2711 2712 // Update dominator for Bypass & LoopExit. 2713 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2714 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2715 2716 ReplaceInstWithInst( 2717 TCCheckBlock->getTerminator(), 2718 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2719 LoopBypassBlocks.push_back(TCCheckBlock); 2720 } 2721 2722 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2723 // Reuse existing vector loop preheader for SCEV checks. 2724 // Note that new preheader block is generated for vector loop. 2725 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2726 2727 // Generate the code to check that the SCEV assumptions that we made. 2728 // We want the new basic block to start at the first instruction in a 2729 // sequence of instructions that form a check. 2730 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2731 "scev.check"); 2732 Value *SCEVCheck = Exp.expandCodeForPredicate( 2733 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2734 2735 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2736 if (C->isZero()) 2737 return; 2738 2739 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2740 "Cannot SCEV check stride or overflow when optimizing for size"); 2741 2742 SCEVCheckBlock->setName("vector.scevcheck"); 2743 // Create new preheader for vector loop. 2744 LoopVectorPreHeader = 2745 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2746 nullptr, "vector.ph"); 2747 2748 // Update dominator only if this is first RT check. 2749 if (LoopBypassBlocks.empty()) { 2750 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2751 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2752 } 2753 2754 ReplaceInstWithInst( 2755 SCEVCheckBlock->getTerminator(), 2756 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2757 LoopBypassBlocks.push_back(SCEVCheckBlock); 2758 AddedSafetyChecks = true; 2759 } 2760 2761 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2762 // VPlan-native path does not do any analysis for runtime checks currently. 2763 if (EnableVPlanNativePath) 2764 return; 2765 2766 // Reuse existing vector loop preheader for runtime memory checks. 2767 // Note that new preheader block is generated for vector loop. 2768 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2769 2770 // Generate the code that checks in runtime if arrays overlap. We put the 2771 // checks into a separate block to make the more common case of few elements 2772 // faster. 
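// Illustrative sketch only (the precise checks come from LoopAccessInfo, not
// from this function): for two pointer ranges [A.start, A.end) and
// [B.start, B.end) accessed in the loop, the emitted condition is conceptually
//   %no.conflict = or (icmp ule %A.end, %B.start), (icmp ule %B.end, %A.start)
// negated into MemRuntimeCheck, so the branch created below falls back to the
// scalar loop (Bypass) whenever the two ranges may overlap.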
2773 Instruction *FirstCheckInst; 2774 Instruction *MemRuntimeCheck; 2775 std::tie(FirstCheckInst, MemRuntimeCheck) = 2776 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2777 if (!MemRuntimeCheck) 2778 return; 2779 2780 if (MemCheckBlock->getParent()->hasOptSize()) { 2781 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2782 "Cannot emit memory checks when optimizing for size, unless forced " 2783 "to vectorize."); 2784 ORE->emit([&]() { 2785 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2786 L->getStartLoc(), L->getHeader()) 2787 << "Code-size may be reduced by not forcing " 2788 "vectorization, or by source-code modifications " 2789 "eliminating the need for runtime checks " 2790 "(e.g., adding 'restrict')."; 2791 }); 2792 } 2793 2794 MemCheckBlock->setName("vector.memcheck"); 2795 // Create new preheader for vector loop. 2796 LoopVectorPreHeader = 2797 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2798 "vector.ph"); 2799 2800 // Update dominator only if this is first RT check. 2801 if (LoopBypassBlocks.empty()) { 2802 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2803 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2804 } 2805 2806 ReplaceInstWithInst( 2807 MemCheckBlock->getTerminator(), 2808 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2809 LoopBypassBlocks.push_back(MemCheckBlock); 2810 AddedSafetyChecks = true; 2811 2812 // We currently don't use LoopVersioning for the actual loop cloning but we 2813 // still use it to add the noalias metadata. 2814 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2815 PSE.getSE()); 2816 LVer->prepareNoAliasMetadata(); 2817 } 2818 2819 Value *InnerLoopVectorizer::emitTransformedIndex( 2820 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2821 const InductionDescriptor &ID) const { 2822 2823 SCEVExpander Exp(*SE, DL, "induction"); 2824 auto Step = ID.getStep(); 2825 auto StartValue = ID.getStartValue(); 2826 assert(Index->getType() == Step->getType() && 2827 "Index type does not match StepValue type"); 2828 2829 // Note: the IR at this point is broken. We cannot use SE to create any new 2830 // SCEV and then expand it, hoping that SCEV's simplification will give us 2831 // more optimal code. Unfortunately, attempting to do so on invalid IR may 2832 // lead to various SCEV crashes. So all we can do is use the builder and rely 2833 // on InstCombine for future simplifications. Here we handle only some trivial 2834 // cases.
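// For example (illustrative): for an integer induction described by
// { StartValue = 7, Step = 3 }, an Index of %i is transformed below into
// 7 + %i * 3; the CreateAdd/CreateMul helpers that follow merely fold the
// trivial add-of-zero and multiply-by-one cases without consulting SCEV.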
2835 auto CreateAdd = [&B](Value *X, Value *Y) { 2836 assert(X->getType() == Y->getType() && "Types don't match!"); 2837 if (auto *CX = dyn_cast<ConstantInt>(X)) 2838 if (CX->isZero()) 2839 return Y; 2840 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2841 if (CY->isZero()) 2842 return X; 2843 return B.CreateAdd(X, Y); 2844 }; 2845 2846 auto CreateMul = [&B](Value *X, Value *Y) { 2847 assert(X->getType() == Y->getType() && "Types don't match!"); 2848 if (auto *CX = dyn_cast<ConstantInt>(X)) 2849 if (CX->isOne()) 2850 return Y; 2851 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2852 if (CY->isOne()) 2853 return X; 2854 return B.CreateMul(X, Y); 2855 }; 2856 2857 switch (ID.getKind()) { 2858 case InductionDescriptor::IK_IntInduction: { 2859 assert(Index->getType() == StartValue->getType() && 2860 "Index type does not match StartValue type"); 2861 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2862 return B.CreateSub(StartValue, Index); 2863 auto *Offset = CreateMul( 2864 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2865 return CreateAdd(StartValue, Offset); 2866 } 2867 case InductionDescriptor::IK_PtrInduction: { 2868 assert(isa<SCEVConstant>(Step) && 2869 "Expected constant step for pointer induction"); 2870 return B.CreateGEP( 2871 StartValue->getType()->getPointerElementType(), StartValue, 2872 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2873 &*B.GetInsertPoint()))); 2874 } 2875 case InductionDescriptor::IK_FpInduction: { 2876 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2877 auto InductionBinOp = ID.getInductionBinOp(); 2878 assert(InductionBinOp && 2879 (InductionBinOp->getOpcode() == Instruction::FAdd || 2880 InductionBinOp->getOpcode() == Instruction::FSub) && 2881 "Original bin op should be defined for FP induction"); 2882 2883 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2884 2885 // Floating point operations had to be 'fast' to enable the induction. 2886 FastMathFlags Flags; 2887 Flags.setFast(); 2888 2889 Value *MulExp = B.CreateFMul(StepValue, Index); 2890 if (isa<Instruction>(MulExp)) 2891 // We have to check, the MulExp may be a constant. 2892 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2893 2894 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2895 "induction"); 2896 if (isa<Instruction>(BOp)) 2897 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2898 2899 return BOp; 2900 } 2901 case InductionDescriptor::IK_NoInduction: 2902 return nullptr; 2903 } 2904 llvm_unreachable("invalid enum"); 2905 } 2906 2907 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2908 /* 2909 In this function we generate a new loop. The new loop will contain 2910 the vectorized instructions while the old loop will continue to run the 2911 scalar remainder. 2912 2913 [ ] <-- loop iteration number check. 2914 / | 2915 / v 2916 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2917 | / | 2918 | / v 2919 || [ ] <-- vector pre header. 2920 |/ | 2921 | v 2922 | [ ] \ 2923 | [ ]_| <-- vector loop. 2924 | | 2925 | v 2926 | -[ ] <--- middle-block. 2927 | / | 2928 | / v 2929 -|- >[ ] <--- new preheader. 2930 | | 2931 | v 2932 | [ ] \ 2933 | [ ]_| <-- old scalar loop to handle remainder. 2934 \ | 2935 \ v 2936 >[ ] <-- exit block. 2937 ... 2938 */ 2939 2940 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2941 2942 // Some loops have a single integer induction variable, while other loops 2943 // don't. 
One example is c++ iterators that often have multiple pointer 2944 // induction variables. In the code below we also support a case where we 2945 // don't have a single induction variable. 2946 // 2947 // We try to obtain an induction variable from the original loop as hard 2948 // as possible. However if we don't find one that: 2949 // - is an integer 2950 // - counts from zero, stepping by one 2951 // - is the size of the widest induction variable type 2952 // then we create a new one. 2953 OldInduction = Legal->getPrimaryInduction(); 2954 Type *IdxTy = Legal->getWidestInductionType(); 2955 2956 // Split the single block loop into the two loop structure described above. 2957 LoopScalarBody = OrigLoop->getHeader(); 2958 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2959 LoopExitBlock = OrigLoop->getExitBlock(); 2960 assert(LoopExitBlock && "Must have an exit block"); 2961 assert(LoopVectorPreHeader && "Invalid loop structure"); 2962 2963 LoopMiddleBlock = 2964 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2965 LI, nullptr, "middle.block"); 2966 LoopScalarPreHeader = 2967 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2968 nullptr, "scalar.ph"); 2969 // We intentionally don't let SplitBlock to update LoopInfo since 2970 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2971 // LoopVectorBody is explicitly added to the correct place few lines later. 2972 LoopVectorBody = 2973 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2974 nullptr, nullptr, "vector.body"); 2975 2976 // Update dominator for loop exit. 2977 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2978 2979 // Create and register the new vector loop. 2980 Loop *Lp = LI->AllocateLoop(); 2981 Loop *ParentLoop = OrigLoop->getParentLoop(); 2982 2983 // Insert the new loop into the loop nest and register the new basic blocks 2984 // before calling any utilities such as SCEV that require valid LoopInfo. 2985 if (ParentLoop) { 2986 ParentLoop->addChildLoop(Lp); 2987 } else { 2988 LI->addTopLevelLoop(Lp); 2989 } 2990 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2991 2992 // Find the loop boundaries. 2993 Value *Count = getOrCreateTripCount(Lp); 2994 2995 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2996 2997 // Now, compare the new count to zero. If it is zero skip the vector loop and 2998 // jump to the scalar loop. This check also covers the case where the 2999 // backedge-taken count is uint##_max: adding one to it will overflow leading 3000 // to an incorrect trip count of zero. In this (rare) case we will also jump 3001 // to the scalar loop. 3002 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3003 3004 // Generate the code to check any assumptions that we've made for SCEV 3005 // expressions. 3006 emitSCEVChecks(Lp, LoopScalarPreHeader); 3007 3008 // Generate the code that checks in runtime if arrays overlap. We put the 3009 // checks into a separate block to make the more common case of few elements 3010 // faster. 3011 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3012 3013 // Generate the induction variable. 3014 // The loop step is equal to the vectorization factor (num of SIMD elements) 3015 // times the unroll factor (num of SIMD instructions). 
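// For instance (illustrative): with VF = 4 and UF = 2 the step below is 8, so
// the canonical induction variable produced here visits 0, 8, 16, ... and the
// vector loop exits once it reaches the vector trip count (CountRoundDown).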
3016 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3017 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3018 Induction = 3019 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3020 getDebugLocFromInstOrOperands(OldInduction)); 3021 3022 // We are going to resume the execution of the scalar loop. 3023 // Go over all of the induction variables that we found and fix the 3024 // PHIs that are left in the scalar version of the loop. 3025 // The starting values of PHI nodes depend on the counter of the last 3026 // iteration in the vectorized loop. 3027 // If we come from a bypass edge then we need to start from the original 3028 // start value. 3029 3030 // This variable saves the new starting index for the scalar loop. It is used 3031 // to test if there are any tail iterations left once the vector loop has 3032 // completed. 3033 for (auto &InductionEntry : Legal->getInductionVars()) { 3034 PHINode *OrigPhi = InductionEntry.first; 3035 InductionDescriptor II = InductionEntry.second; 3036 3037 // Create phi nodes to merge from the backedge-taken check block. 3038 PHINode *BCResumeVal = 3039 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3040 LoopScalarPreHeader->getTerminator()); 3041 // Copy original phi DL over to the new one. 3042 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3043 Value *&EndValue = IVEndValues[OrigPhi]; 3044 if (OrigPhi == OldInduction) { 3045 // We know what the end value is. 3046 EndValue = CountRoundDown; 3047 } else { 3048 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3049 Type *StepType = II.getStep()->getType(); 3050 Instruction::CastOps CastOp = 3051 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3052 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3053 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3054 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3055 EndValue->setName("ind.end"); 3056 } 3057 3058 // The new PHI merges the original incoming value, in case of a bypass, 3059 // or the value at the end of the vectorized loop. 3060 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3061 3062 // Fix the scalar body counter (PHI node). 3063 // The old induction's phi node in the scalar body needs the truncated 3064 // value. 3065 for (BasicBlock *BB : LoopBypassBlocks) 3066 BCResumeVal->addIncoming(II.getStartValue(), BB); 3067 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3068 } 3069 3070 // We need the OrigLoop (scalar loop part) latch terminator to help 3071 // produce correct debug info for the middle block BB instructions. 3072 // The legality check stage guarantees that the loop will have a single 3073 // latch. 3074 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3075 "Scalar loop latch terminator isn't a branch"); 3076 BranchInst *ScalarLatchBr = 3077 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3078 3079 // Add a check in the middle block to see if we have completed 3080 // all of the iterations in the first vector loop. 3081 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3082 // If tail is to be folded, we know we don't need to run the remainder. 
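// For example (illustrative): with N = 10 and VF * UF = 4, CountRoundDown is
// 8, the equality below is false and the scalar loop runs the remaining 2
// iterations; with N = 8 the compare is true and the remainder is skipped.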
3083 Value *CmpN = Builder.getTrue(); 3084 if (!Cost->foldTailByMasking()) { 3085 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3086 CountRoundDown, "cmp.n", 3087 LoopMiddleBlock->getTerminator()); 3088 3089 // Here we use the same DebugLoc as the scalar loop latch branch instead 3090 // of the corresponding compare because they may have ended up with 3091 // different line numbers and we want to avoid awkward line stepping while 3092 // debugging. Eg. if the compare has got a line number inside the loop. 3093 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3094 } 3095 3096 BranchInst *BrInst = 3097 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3098 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3099 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3100 3101 // Get ready to start creating new instructions into the vectorized body. 3102 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3103 "Inconsistent vector loop preheader"); 3104 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3105 3106 Optional<MDNode *> VectorizedLoopID = 3107 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3108 LLVMLoopVectorizeFollowupVectorized}); 3109 if (VectorizedLoopID.hasValue()) { 3110 Lp->setLoopID(VectorizedLoopID.getValue()); 3111 3112 // Do not setAlreadyVectorized if loop attributes have been defined 3113 // explicitly. 3114 return LoopVectorPreHeader; 3115 } 3116 3117 // Keep all loop hints from the original loop on the vector loop (we'll 3118 // replace the vectorizer-specific hints below). 3119 if (MDNode *LID = OrigLoop->getLoopID()) 3120 Lp->setLoopID(LID); 3121 3122 LoopVectorizeHints Hints(Lp, true, *ORE); 3123 Hints.setAlreadyVectorized(); 3124 3125 #ifdef EXPENSIVE_CHECKS 3126 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3127 LI->verify(*DT); 3128 #endif 3129 3130 return LoopVectorPreHeader; 3131 } 3132 3133 // Fix up external users of the induction variable. At this point, we are 3134 // in LCSSA form, with all external PHIs that use the IV having one input value, 3135 // coming from the remainder loop. We need those PHIs to also have a correct 3136 // value for the IV when arriving directly from the middle block. 3137 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3138 const InductionDescriptor &II, 3139 Value *CountRoundDown, Value *EndValue, 3140 BasicBlock *MiddleBlock) { 3141 // There are two kinds of external IV usages - those that use the value 3142 // computed in the last iteration (the PHI) and those that use the penultimate 3143 // value (the value that feeds into the phi from the loop latch). 3144 // We allow both, but they, obviously, have different values. 3145 3146 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3147 3148 DenseMap<Value *, Value *> MissingVals; 3149 3150 // An external user of the last iteration's value should see the value that 3151 // the remainder loop uses to initialize its own IV. 3152 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3153 for (User *U : PostInc->users()) { 3154 Instruction *UI = cast<Instruction>(U); 3155 if (!OrigLoop->contains(UI)) { 3156 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3157 MissingVals[UI] = EndValue; 3158 } 3159 } 3160 3161 // An external user of the penultimate value need to see EndValue - Step. 3162 // The simplest way to get this is to recompute it from the constituent SCEVs, 3163 // that is Start + (Step * (CRD - 1)). 
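// For example (illustrative): for an induction with Start = 0 and Step = 1
// and a vector trip count CRD = 8, the last value is 8 and the penultimate
// value reconstructed below is 0 + 1 * (8 - 1) = 7.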
3164 for (User *U : OrigPhi->users()) { 3165 auto *UI = cast<Instruction>(U); 3166 if (!OrigLoop->contains(UI)) { 3167 const DataLayout &DL = 3168 OrigLoop->getHeader()->getModule()->getDataLayout(); 3169 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3170 3171 IRBuilder<> B(MiddleBlock->getTerminator()); 3172 Value *CountMinusOne = B.CreateSub( 3173 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3174 Value *CMO = 3175 !II.getStep()->getType()->isIntegerTy() 3176 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3177 II.getStep()->getType()) 3178 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3179 CMO->setName("cast.cmo"); 3180 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3181 Escape->setName("ind.escape"); 3182 MissingVals[UI] = Escape; 3183 } 3184 } 3185 3186 for (auto &I : MissingVals) { 3187 PHINode *PHI = cast<PHINode>(I.first); 3188 // One corner case we have to handle is two IVs "chasing" each-other, 3189 // that is %IV2 = phi [...], [ %IV1, %latch ] 3190 // In this case, if IV1 has an external use, we need to avoid adding both 3191 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3192 // don't already have an incoming value for the middle block. 3193 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3194 PHI->addIncoming(I.second, MiddleBlock); 3195 } 3196 } 3197 3198 namespace { 3199 3200 struct CSEDenseMapInfo { 3201 static bool canHandle(const Instruction *I) { 3202 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3203 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3204 } 3205 3206 static inline Instruction *getEmptyKey() { 3207 return DenseMapInfo<Instruction *>::getEmptyKey(); 3208 } 3209 3210 static inline Instruction *getTombstoneKey() { 3211 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3212 } 3213 3214 static unsigned getHashValue(const Instruction *I) { 3215 assert(canHandle(I) && "Unknown instruction!"); 3216 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3217 I->value_op_end())); 3218 } 3219 3220 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3221 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3222 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3223 return LHS == RHS; 3224 return LHS->isIdenticalTo(RHS); 3225 } 3226 }; 3227 3228 } // end anonymous namespace 3229 3230 ///Perform cse of induction variable instructions. 3231 static void cse(BasicBlock *BB) { 3232 // Perform simple cse. 3233 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3234 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3235 Instruction *In = &*I++; 3236 3237 if (!CSEDenseMapInfo::canHandle(In)) 3238 continue; 3239 3240 // Check if we can replace this instruction with any of the 3241 // visited instructions. 3242 if (Instruction *V = CSEMap.lookup(In)) { 3243 In->replaceAllUsesWith(V); 3244 In->eraseFromParent(); 3245 continue; 3246 } 3247 3248 CSEMap[In] = In; 3249 } 3250 } 3251 3252 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3253 unsigned VF, 3254 bool &NeedToScalarize) { 3255 Function *F = CI->getCalledFunction(); 3256 Type *ScalarRetTy = CI->getType(); 3257 SmallVector<Type *, 4> Tys, ScalarTys; 3258 for (auto &ArgOp : CI->arg_operands()) 3259 ScalarTys.push_back(ArgOp->getType()); 3260 3261 // Estimate cost of scalarized vector call. 
The source operands are assumed 3262 // to be vectors, so we need to extract individual elements from there, 3263 // execute VF scalar calls, and then gather the result into the vector return 3264 // value. 3265 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3266 if (VF == 1) 3267 return ScalarCallCost; 3268 3269 // Compute corresponding vector type for return value and arguments. 3270 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3271 for (Type *ScalarTy : ScalarTys) 3272 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3273 3274 // Compute costs of unpacking argument values for the scalar calls and 3275 // packing the return values to a vector. 3276 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3277 3278 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3279 3280 // If we can't emit a vector call for this function, then the currently found 3281 // cost is the cost we need to return. 3282 NeedToScalarize = true; 3283 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3284 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3285 3286 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3287 return Cost; 3288 3289 // If the corresponding vector cost is cheaper, return its cost. 3290 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3291 if (VectorCallCost < Cost) { 3292 NeedToScalarize = false; 3293 return VectorCallCost; 3294 } 3295 return Cost; 3296 } 3297 3298 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3299 unsigned VF) { 3300 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3301 assert(ID && "Expected intrinsic call!"); 3302 3303 FastMathFlags FMF; 3304 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3305 FMF = FPMO->getFastMathFlags(); 3306 3307 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3308 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3309 } 3310 3311 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3312 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3313 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3314 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3315 } 3316 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3317 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3318 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3319 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3320 } 3321 3322 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3323 // For every instruction `I` in MinBWs, truncate the operands, create a 3324 // truncated version of `I` and reextend its result. InstCombine runs 3325 // later and will remove any ext/trunc pairs. 3326 SmallPtrSet<Value *, 4> Erased; 3327 for (const auto &KV : Cost->getMinimalBitwidths()) { 3328 // If the value wasn't vectorized, we must maintain the original scalar 3329 // type. The absence of the value from VectorLoopValueMap indicates that it 3330 // wasn't vectorized. 
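// Illustrative example (shorthand IR, VF = 4): if MinBWs records that an i32
// add only needs 8 bits,
//   %a = add <4 x i32> %x, %y
// is rewritten below roughly as
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
// and InstCombine later removes redundant trunc/ext pairs.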
3331 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3332 continue; 3333 for (unsigned Part = 0; Part < UF; ++Part) { 3334 Value *I = getOrCreateVectorValue(KV.first, Part); 3335 if (Erased.find(I) != Erased.end() || I->use_empty() || 3336 !isa<Instruction>(I)) 3337 continue; 3338 Type *OriginalTy = I->getType(); 3339 Type *ScalarTruncatedTy = 3340 IntegerType::get(OriginalTy->getContext(), KV.second); 3341 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3342 OriginalTy->getVectorNumElements()); 3343 if (TruncatedTy == OriginalTy) 3344 continue; 3345 3346 IRBuilder<> B(cast<Instruction>(I)); 3347 auto ShrinkOperand = [&](Value *V) -> Value * { 3348 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3349 if (ZI->getSrcTy() == TruncatedTy) 3350 return ZI->getOperand(0); 3351 return B.CreateZExtOrTrunc(V, TruncatedTy); 3352 }; 3353 3354 // The actual instruction modification depends on the instruction type, 3355 // unfortunately. 3356 Value *NewI = nullptr; 3357 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3358 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3359 ShrinkOperand(BO->getOperand(1))); 3360 3361 // Any wrapping introduced by shrinking this operation shouldn't be 3362 // considered undefined behavior. So, we can't unconditionally copy 3363 // arithmetic wrapping flags to NewI. 3364 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3365 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3366 NewI = 3367 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3368 ShrinkOperand(CI->getOperand(1))); 3369 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3370 NewI = B.CreateSelect(SI->getCondition(), 3371 ShrinkOperand(SI->getTrueValue()), 3372 ShrinkOperand(SI->getFalseValue())); 3373 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3374 switch (CI->getOpcode()) { 3375 default: 3376 llvm_unreachable("Unhandled cast!"); 3377 case Instruction::Trunc: 3378 NewI = ShrinkOperand(CI->getOperand(0)); 3379 break; 3380 case Instruction::SExt: 3381 NewI = B.CreateSExtOrTrunc( 3382 CI->getOperand(0), 3383 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3384 break; 3385 case Instruction::ZExt: 3386 NewI = B.CreateZExtOrTrunc( 3387 CI->getOperand(0), 3388 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3389 break; 3390 } 3391 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3392 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3393 auto *O0 = B.CreateZExtOrTrunc( 3394 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3395 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3396 auto *O1 = B.CreateZExtOrTrunc( 3397 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3398 3399 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3400 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3401 // Don't do anything with the operands, just extend the result. 
3402 continue; 3403 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3404 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3405 auto *O0 = B.CreateZExtOrTrunc( 3406 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3407 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3408 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3409 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3410 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3411 auto *O0 = B.CreateZExtOrTrunc( 3412 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3413 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3414 } else { 3415 // If we don't know what to do, be conservative and don't do anything. 3416 continue; 3417 } 3418 3419 // Lastly, extend the result. 3420 NewI->takeName(cast<Instruction>(I)); 3421 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3422 I->replaceAllUsesWith(Res); 3423 cast<Instruction>(I)->eraseFromParent(); 3424 Erased.insert(I); 3425 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3426 } 3427 } 3428 3429 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3430 for (const auto &KV : Cost->getMinimalBitwidths()) { 3431 // If the value wasn't vectorized, we must maintain the original scalar 3432 // type. The absence of the value from VectorLoopValueMap indicates that it 3433 // wasn't vectorized. 3434 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3435 continue; 3436 for (unsigned Part = 0; Part < UF; ++Part) { 3437 Value *I = getOrCreateVectorValue(KV.first, Part); 3438 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3439 if (Inst && Inst->use_empty()) { 3440 Value *NewI = Inst->getOperand(0); 3441 Inst->eraseFromParent(); 3442 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3443 } 3444 } 3445 } 3446 } 3447 3448 void InnerLoopVectorizer::fixVectorizedLoop() { 3449 // Insert truncates and extends for any truncated instructions as hints to 3450 // InstCombine. 3451 if (VF > 1) 3452 truncateToMinimalBitwidths(); 3453 3454 // Fix widened non-induction PHIs by setting up the PHI operands. 3455 if (OrigPHIsToFix.size()) { 3456 assert(EnableVPlanNativePath && 3457 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3458 fixNonInductionPHIs(); 3459 } 3460 3461 // At this point every instruction in the original loop is widened to a 3462 // vector form. Now we need to fix the recurrences in the loop. These PHI 3463 // nodes are currently empty because we did not want to introduce cycles. 3464 // This is the second stage of vectorizing recurrences. 3465 fixCrossIterationPHIs(); 3466 3467 // Forget the original basic block. 3468 PSE.getSE()->forgetLoop(OrigLoop); 3469 3470 // Fix-up external users of the induction variables. 3471 for (auto &Entry : Legal->getInductionVars()) 3472 fixupIVUsers(Entry.first, Entry.second, 3473 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3474 IVEndValues[Entry.first], LoopMiddleBlock); 3475 3476 fixLCSSAPHIs(); 3477 for (Instruction *PI : PredicatedInstructions) 3478 sinkScalarOperands(&*PI); 3479 3480 // Remove redundant induction instructions. 3481 cse(LoopVectorBody); 3482 3483 // Set/update profile weights for the vector and remainder loops as original 3484 // loop iterations are now distributed among them. Note that original loop 3485 // represented by LoopScalarBody becomes remainder loop after vectorization. 
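// For instance (illustrative): if profiling estimated roughly 1000 iterations
// for the original loop and VF * UF = 8, about 1000 / 8 header executions are
// attributed to the vector loop and the remaining iterations to the scalar
// remainder loop.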
3486 // 3487 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3488 // end up getting slightly roughened result but that should be OK since 3489 // profile is not inherently precise anyway. Note also possible bypass of 3490 // vector code caused by legality checks is ignored, assigning all the weight 3491 // to the vector loop, optimistically. 3492 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), 3493 LI->getLoopFor(LoopVectorBody), 3494 LI->getLoopFor(LoopScalarBody), VF * UF); 3495 } 3496 3497 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3498 // In order to support recurrences we need to be able to vectorize Phi nodes. 3499 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3500 // stage #2: We now need to fix the recurrences by adding incoming edges to 3501 // the currently empty PHI nodes. At this point every instruction in the 3502 // original loop is widened to a vector form so we can use them to construct 3503 // the incoming edges. 3504 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3505 // Handle first-order recurrences and reductions that need to be fixed. 3506 if (Legal->isFirstOrderRecurrence(&Phi)) 3507 fixFirstOrderRecurrence(&Phi); 3508 else if (Legal->isReductionVariable(&Phi)) 3509 fixReduction(&Phi); 3510 } 3511 } 3512 3513 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3514 // This is the second phase of vectorizing first-order recurrences. An 3515 // overview of the transformation is described below. Suppose we have the 3516 // following loop. 3517 // 3518 // for (int i = 0; i < n; ++i) 3519 // b[i] = a[i] - a[i - 1]; 3520 // 3521 // There is a first-order recurrence on "a". For this loop, the shorthand 3522 // scalar IR looks like: 3523 // 3524 // scalar.ph: 3525 // s_init = a[-1] 3526 // br scalar.body 3527 // 3528 // scalar.body: 3529 // i = phi [0, scalar.ph], [i+1, scalar.body] 3530 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3531 // s2 = a[i] 3532 // b[i] = s2 - s1 3533 // br cond, scalar.body, ... 3534 // 3535 // In this example, s1 is a recurrence because it's value depends on the 3536 // previous iteration. In the first phase of vectorization, we created a 3537 // temporary value for s1. We now complete the vectorization and produce the 3538 // shorthand vector IR shown below (for VF = 4, UF = 1). 3539 // 3540 // vector.ph: 3541 // v_init = vector(..., ..., ..., a[-1]) 3542 // br vector.body 3543 // 3544 // vector.body 3545 // i = phi [0, vector.ph], [i+4, vector.body] 3546 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3547 // v2 = a[i, i+1, i+2, i+3]; 3548 // v3 = vector(v1(3), v2(0, 1, 2)) 3549 // b[i, i+1, i+2, i+3] = v2 - v3 3550 // br cond, vector.body, middle.block 3551 // 3552 // middle.block: 3553 // x = v2(3) 3554 // br scalar.ph 3555 // 3556 // scalar.ph: 3557 // s_init = phi [x, middle.block], [a[-1], otherwise] 3558 // br scalar.body 3559 // 3560 // After execution completes the vector loop, we extract the next value of 3561 // the recurrence (x) to use as the initial value in the scalar loop. 3562 3563 // Get the original loop preheader and single loop latch. 3564 auto *Preheader = OrigLoop->getLoopPreheader(); 3565 auto *Latch = OrigLoop->getLoopLatch(); 3566 3567 // Get the initial and previous values of the scalar recurrence. 3568 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3569 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3570 3571 // Create a vector from the initial value. 
3572 auto *VectorInit = ScalarInit; 3573 if (VF > 1) { 3574 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3575 VectorInit = Builder.CreateInsertElement( 3576 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3577 Builder.getInt32(VF - 1), "vector.recur.init"); 3578 } 3579 3580 // We constructed a temporary phi node in the first phase of vectorization. 3581 // This phi node will eventually be deleted. 3582 Builder.SetInsertPoint( 3583 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3584 3585 // Create a phi node for the new recurrence. The current value will either be 3586 // the initial value inserted into a vector or loop-varying vector value. 3587 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3588 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3589 3590 // Get the vectorized previous value of the last part UF - 1. It appears last 3591 // among all unrolled iterations, due to the order of their construction. 3592 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3593 3594 // Find and set the insertion point after the previous value if it is an 3595 // instruction. 3596 BasicBlock::iterator InsertPt; 3597 // Note that the previous value may have been constant-folded so it is not 3598 // guaranteed to be an instruction in the vector loop. 3599 // FIXME: Loop invariant values do not form recurrences. We should deal with 3600 // them earlier. 3601 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3602 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3603 else { 3604 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3605 if (isa<PHINode>(PreviousLastPart)) 3606 // If the previous value is a phi node, we should insert after all the phi 3607 // nodes in the block containing the PHI to avoid breaking basic block 3608 // verification. Note that the basic block may be different to 3609 // LoopVectorBody, in case we predicate the loop. 3610 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3611 else 3612 InsertPt = ++PreviousInst->getIterator(); 3613 } 3614 Builder.SetInsertPoint(&*InsertPt); 3615 3616 // We will construct a vector for the recurrence by combining the values for 3617 // the current and previous iterations. This is the required shuffle mask. 3618 SmallVector<Constant *, 8> ShuffleMask(VF); 3619 ShuffleMask[0] = Builder.getInt32(VF - 1); 3620 for (unsigned I = 1; I < VF; ++I) 3621 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3622 3623 // The vector from which to take the initial value for the current iteration 3624 // (actual or unrolled). Initially, this is the vector phi node. 3625 Value *Incoming = VecPhi; 3626 3627 // Shuffle the current and previous vector and update the vector parts. 3628 for (unsigned Part = 0; Part < UF; ++Part) { 3629 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3630 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3631 auto *Shuffle = 3632 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3633 ConstantVector::get(ShuffleMask)) 3634 : Incoming; 3635 PhiPart->replaceAllUsesWith(Shuffle); 3636 cast<Instruction>(PhiPart)->eraseFromParent(); 3637 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3638 Incoming = PreviousPart; 3639 } 3640 3641 // Fix the latch value of the new recurrence in the vector loop. 3642 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3643 3644 // Extract the last vector element in the middle block. 
This will be the 3645 // initial value for the recurrence when jumping to the scalar loop. 3646 auto *ExtractForScalar = Incoming; 3647 if (VF > 1) { 3648 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3649 ExtractForScalar = Builder.CreateExtractElement( 3650 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3651 } 3652 // Extract the second last element in the middle block if the 3653 // Phi is used outside the loop. We need to extract the phi itself 3654 // and not the last element (the phi update in the current iteration). This 3655 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3656 // when the scalar loop is not run at all. 3657 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3658 if (VF > 1) 3659 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3660 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3661 // When loop is unrolled without vectorizing, initialize 3662 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3663 // `Incoming`. This is analogous to the vectorized case above: extracting the 3664 // second last element when VF > 1. 3665 else if (UF > 1) 3666 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3667 3668 // Fix the initial value of the original recurrence in the scalar loop. 3669 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3670 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3671 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3672 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3673 Start->addIncoming(Incoming, BB); 3674 } 3675 3676 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3677 Phi->setName("scalar.recur"); 3678 3679 // Finally, fix users of the recurrence outside the loop. The users will need 3680 // either the last value of the scalar recurrence or the last value of the 3681 // vector recurrence we extracted in the middle block. Since the loop is in 3682 // LCSSA form, we just need to find all the phi nodes for the original scalar 3683 // recurrence in the exit block, and then add an edge for the middle block. 3684 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3685 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3686 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3687 } 3688 } 3689 } 3690 3691 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3692 Constant *Zero = Builder.getInt32(0); 3693 3694 // Get it's reduction variable descriptor. 3695 assert(Legal->isReductionVariable(Phi) && 3696 "Unable to find the reduction variable"); 3697 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3698 3699 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3700 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3701 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3702 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3703 RdxDesc.getMinMaxRecurrenceKind(); 3704 setDebugLocFromInst(Builder, ReductionStartValue); 3705 3706 // We need to generate a reduction vector from the incoming scalar. 3707 // To do so, we need to generate the 'identity' vector and override 3708 // one of the elements with the incoming scalar reduction. We need 3709 // to do it in the vector-loop preheader. 3710 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3711 3712 // This is the vector-clone of the value that leaves the loop. 
3713 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3714 3715 // Find the reduction identity variable. Zero for addition, or, xor, 3716 // one for multiplication, -1 for And. 3717 Value *Identity; 3718 Value *VectorStart; 3719 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3720 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3721 // MinMax reduction have the start value as their identify. 3722 if (VF == 1) { 3723 VectorStart = Identity = ReductionStartValue; 3724 } else { 3725 VectorStart = Identity = 3726 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3727 } 3728 } else { 3729 // Handle other reduction kinds: 3730 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3731 RK, VecTy->getScalarType()); 3732 if (VF == 1) { 3733 Identity = Iden; 3734 // This vector is the Identity vector where the first element is the 3735 // incoming scalar reduction. 3736 VectorStart = ReductionStartValue; 3737 } else { 3738 Identity = ConstantVector::getSplat({VF, false}, Iden); 3739 3740 // This vector is the Identity vector where the first element is the 3741 // incoming scalar reduction. 3742 VectorStart = 3743 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3744 } 3745 } 3746 3747 // Wrap flags are in general invalid after vectorization, clear them. 3748 clearReductionWrapFlags(RdxDesc); 3749 3750 // Fix the vector-loop phi. 3751 3752 // Reductions do not have to start at zero. They can start with 3753 // any loop invariant values. 3754 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3755 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3756 3757 for (unsigned Part = 0; Part < UF; ++Part) { 3758 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3759 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3760 // Make sure to add the reduction start value only to the 3761 // first unroll part. 3762 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3763 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3764 cast<PHINode>(VecRdxPhi) 3765 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3766 } 3767 3768 // Before each round, move the insertion point right between 3769 // the PHIs and the values we are going to write. 3770 // This allows us to write both PHINodes and the extractelement 3771 // instructions. 3772 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3773 3774 setDebugLocFromInst(Builder, LoopExitInst); 3775 3776 // If tail is folded by masking, the vector value to leave the loop should be 3777 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3778 // instead of the former. 3779 if (Cost->foldTailByMasking()) { 3780 for (unsigned Part = 0; Part < UF; ++Part) { 3781 Value *VecLoopExitInst = 3782 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3783 Value *Sel = nullptr; 3784 for (User *U : VecLoopExitInst->users()) { 3785 if (isa<SelectInst>(U)) { 3786 assert(!Sel && "Reduction exit feeding two selects"); 3787 Sel = U; 3788 } else 3789 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3790 } 3791 assert(Sel && "Reduction exit feeds no select"); 3792 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3793 } 3794 } 3795 3796 // If the vector reduction can be performed in a smaller type, we truncate 3797 // then extend the loop exit value to enable InstCombine to evaluate the 3798 // entire expression in the smaller type. 
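// For example (illustrative, VF = 4): an i32 add reduction whose descriptor
// says it fits in i8 has each unrolled exit value rewritten below as a trunc
// to <4 x i8> followed by a sext/zext back to <4 x i32> (depending on
// signedness), letting InstCombine narrow the whole reduction chain.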
3799 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3800 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3801 Builder.SetInsertPoint( 3802 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3803 VectorParts RdxParts(UF); 3804 for (unsigned Part = 0; Part < UF; ++Part) { 3805 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3806 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3807 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3808 : Builder.CreateZExt(Trunc, VecTy); 3809 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3810 UI != RdxParts[Part]->user_end();) 3811 if (*UI != Trunc) { 3812 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3813 RdxParts[Part] = Extnd; 3814 } else { 3815 ++UI; 3816 } 3817 } 3818 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3819 for (unsigned Part = 0; Part < UF; ++Part) { 3820 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3821 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3822 } 3823 } 3824 3825 // Reduce all of the unrolled parts into a single vector. 3826 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3827 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3828 3829 // The middle block terminator has already been assigned a DebugLoc here (the 3830 // OrigLoop's single latch terminator). We want the whole middle block to 3831 // appear to execute on this line because: (a) it is all compiler generated, 3832 // (b) these instructions are always executed after evaluating the latch 3833 // conditional branch, and (c) other passes may add new predecessors which 3834 // terminate on this line. This is the easiest way to ensure we don't 3835 // accidentally cause an extra step back into the loop while debugging. 3836 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3837 for (unsigned Part = 1; Part < UF; ++Part) { 3838 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3839 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3840 // Floating point operations had to be 'fast' to enable the reduction. 3841 ReducedPartRdx = addFastMathFlag( 3842 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3843 ReducedPartRdx, "bin.rdx"), 3844 RdxDesc.getFastMathFlags()); 3845 else 3846 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3847 RdxPart); 3848 } 3849 3850 if (VF > 1) { 3851 bool NoNaN = Legal->hasFunNoNaNAttr(); 3852 ReducedPartRdx = 3853 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3854 // If the reduction can be performed in a smaller type, we need to extend 3855 // the reduction to the wider type before we branch to the original loop. 3856 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3857 ReducedPartRdx = 3858 RdxDesc.isSigned() 3859 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3860 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3861 } 3862 3863 // Create a phi node that merges control-flow from the backedge-taken check 3864 // block and the middle block. 
3865 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3866 LoopScalarPreHeader->getTerminator()); 3867 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3868 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3869 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3870 3871 // Now, we need to fix the users of the reduction variable 3872 // inside and outside of the scalar remainder loop. 3873 // We know that the loop is in LCSSA form. We need to update the 3874 // PHI nodes in the exit blocks. 3875 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3876 // All PHINodes need to have a single entry edge, or two if 3877 // we already fixed them. 3878 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3879 3880 // We found a reduction value exit-PHI. Update it with the 3881 // incoming bypass edge. 3882 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3883 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3884 } // end of the LCSSA phi scan. 3885 3886 // Fix the scalar loop reduction variable with the incoming reduction sum 3887 // from the vector body and from the backedge value. 3888 int IncomingEdgeBlockIdx = 3889 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3890 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3891 // Pick the other block. 3892 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3893 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3894 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3895 } 3896 3897 void InnerLoopVectorizer::clearReductionWrapFlags( 3898 RecurrenceDescriptor &RdxDesc) { 3899 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3900 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3901 RK != RecurrenceDescriptor::RK_IntegerMult) 3902 return; 3903 3904 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3905 assert(LoopExitInstr && "null loop exit instruction"); 3906 SmallVector<Instruction *, 8> Worklist; 3907 SmallPtrSet<Instruction *, 8> Visited; 3908 Worklist.push_back(LoopExitInstr); 3909 Visited.insert(LoopExitInstr); 3910 3911 while (!Worklist.empty()) { 3912 Instruction *Cur = Worklist.pop_back_val(); 3913 if (isa<OverflowingBinaryOperator>(Cur)) 3914 for (unsigned Part = 0; Part < UF; ++Part) { 3915 Value *V = getOrCreateVectorValue(Cur, Part); 3916 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3917 } 3918 3919 for (User *U : Cur->users()) { 3920 Instruction *UI = cast<Instruction>(U); 3921 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3922 Visited.insert(UI).second) 3923 Worklist.push_back(UI); 3924 } 3925 } 3926 } 3927 3928 void InnerLoopVectorizer::fixLCSSAPHIs() { 3929 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3930 if (LCSSAPhi.getNumIncomingValues() == 1) { 3931 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3932 // Non-instruction incoming values will have only one value. 3933 unsigned LastLane = 0; 3934 if (isa<Instruction>(IncomingValue)) 3935 LastLane = Cost->isUniformAfterVectorization( 3936 cast<Instruction>(IncomingValue), VF) 3937 ? 0 3938 : VF - 1; 3939 // Can be a loop invariant incoming value or the last scalar value to be 3940 // extracted from the vectorized loop. 
3941 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3942 Value *lastIncomingValue = 3943 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3944 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3945 } 3946 } 3947 } 3948 3949 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3950 // The basic block and loop containing the predicated instruction. 3951 auto *PredBB = PredInst->getParent(); 3952 auto *VectorLoop = LI->getLoopFor(PredBB); 3953 3954 // Initialize a worklist with the operands of the predicated instruction. 3955 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3956 3957 // Holds instructions that we need to analyze again. An instruction may be 3958 // reanalyzed if we don't yet know if we can sink it or not. 3959 SmallVector<Instruction *, 8> InstsToReanalyze; 3960 3961 // Returns true if a given use occurs in the predicated block. Phi nodes use 3962 // their operands in their corresponding predecessor blocks. 3963 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3964 auto *I = cast<Instruction>(U.getUser()); 3965 BasicBlock *BB = I->getParent(); 3966 if (auto *Phi = dyn_cast<PHINode>(I)) 3967 BB = Phi->getIncomingBlock( 3968 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3969 return BB == PredBB; 3970 }; 3971 3972 // Iteratively sink the scalarized operands of the predicated instruction 3973 // into the block we created for it. When an instruction is sunk, its 3974 // operands are then added to the worklist. The algorithm ends after a full 3975 // pass through the worklist fails to sink a single instruction. 3976 bool Changed; 3977 do { 3978 // Add the instructions that need to be reanalyzed to the worklist, and 3979 // reset the changed indicator. 3980 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3981 InstsToReanalyze.clear(); 3982 Changed = false; 3983 3984 while (!Worklist.empty()) { 3985 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3986 3987 // We can't sink an instruction if it is a phi node, is already in the 3988 // predicated block, is not in the loop, or may have side effects. 3989 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 3990 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 3991 continue; 3992 3993 // It's legal to sink the instruction if all its uses occur in the 3994 // predicated block. Otherwise, there's nothing to do yet, and we may 3995 // need to reanalyze the instruction. 3996 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3997 InstsToReanalyze.push_back(I); 3998 continue; 3999 } 4000 4001 // Move the instruction to the beginning of the predicated block, and add 4002 // its operands to the worklist. 4003 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4004 Worklist.insert(I->op_begin(), I->op_end()); 4005 4006 // The sinking may have enabled other instructions to be sunk, so we will 4007 // need to iterate.
4008 Changed = true; 4009 } 4010 } while (Changed); 4011 } 4012 4013 void InnerLoopVectorizer::fixNonInductionPHIs() { 4014 for (PHINode *OrigPhi : OrigPHIsToFix) { 4015 PHINode *NewPhi = 4016 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4017 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4018 4019 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4020 predecessors(OrigPhi->getParent())); 4021 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4022 predecessors(NewPhi->getParent())); 4023 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4024 "Scalar and Vector BB should have the same number of predecessors"); 4025 4026 // The insertion point in Builder may be invalidated by the time we get 4027 // here. Force the Builder insertion point to something valid so that we do 4028 // not run into issues during insertion point restore in 4029 // getOrCreateVectorValue calls below. 4030 Builder.SetInsertPoint(NewPhi); 4031 4032 // The predecessor order is preserved and we can rely on mapping between 4033 // scalar and vector block predecessors. 4034 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4035 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4036 4037 // When looking up the new scalar/vector values to fix up, use incoming 4038 // values from original phi. 4039 Value *ScIncV = 4040 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4041 4042 // Scalar incoming value may need a broadcast 4043 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4044 NewPhi->addIncoming(NewIncV, NewPredBB); 4045 } 4046 } 4047 } 4048 4049 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4050 unsigned VF, bool IsPtrLoopInvariant, 4051 SmallBitVector &IsIndexLoopInvariant) { 4052 // Construct a vector GEP by widening the operands of the scalar GEP as 4053 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4054 // results in a vector of pointers when at least one operand of the GEP 4055 // is vector-typed. Thus, to keep the representation compact, we only use 4056 // vector-typed operands for loop-varying values. 4057 4058 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4059 // If we are vectorizing, but the GEP has only loop-invariant operands, 4060 // the GEP we build (by only using vector-typed operands for 4061 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4062 // produce a vector of pointers, we need to either arbitrarily pick an 4063 // operand to broadcast, or broadcast a clone of the original GEP. 4064 // Here, we broadcast a clone of the original. 4065 // 4066 // TODO: If at some point we decide to scalarize instructions having 4067 // loop-invariant operands, this special case will no longer be 4068 // required. We would add the scalarization decision to 4069 // collectLoopScalars() and teach getVectorValue() to broadcast 4070 // the lane-zero scalar value. 4071 auto *Clone = Builder.Insert(GEP->clone()); 4072 for (unsigned Part = 0; Part < UF; ++Part) { 4073 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4074 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4075 addMetadata(EntryPart, GEP); 4076 } 4077 } else { 4078 // If the GEP has at least one loop-varying operand, we are sure to 4079 // produce a vector of pointers. But if we are only unrolling, we want 4080 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4081 // produce with the code below will be scalar (if VF == 1) or vector 4082 // (otherwise). 
Note that for the unroll-only case, we still maintain 4083 // values in the vector mapping with initVector, as we do for other 4084 // instructions. 4085 for (unsigned Part = 0; Part < UF; ++Part) { 4086 // The pointer operand of the new GEP. If it's loop-invariant, we 4087 // won't broadcast it. 4088 auto *Ptr = IsPtrLoopInvariant 4089 ? GEP->getPointerOperand() 4090 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4091 4092 // Collect all the indices for the new GEP. If any index is 4093 // loop-invariant, we won't broadcast it. 4094 SmallVector<Value *, 4> Indices; 4095 for (auto Index : enumerate(GEP->indices())) { 4096 Value *User = Index.value().get(); 4097 if (IsIndexLoopInvariant[Index.index()]) 4098 Indices.push_back(User); 4099 else 4100 Indices.push_back(getOrCreateVectorValue(User, Part)); 4101 } 4102 4103 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4104 // but it should be a vector, otherwise. 4105 auto *NewGEP = 4106 GEP->isInBounds() 4107 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4108 Indices) 4109 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4110 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4111 "NewGEP is not a pointer vector"); 4112 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4113 addMetadata(NewGEP, GEP); 4114 } 4115 } 4116 } 4117 4118 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4119 unsigned VF) { 4120 PHINode *P = cast<PHINode>(PN); 4121 if (EnableVPlanNativePath) { 4122 // Currently we enter here in the VPlan-native path for non-induction 4123 // PHIs where all control flow is uniform. We simply widen these PHIs. 4124 // Create a vector phi with no operands - the vector phi operands will be 4125 // set at the end of vector code generation. 4126 Type *VecTy = 4127 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4128 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4129 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4130 OrigPHIsToFix.push_back(P); 4131 4132 return; 4133 } 4134 4135 assert(PN->getParent() == OrigLoop->getHeader() && 4136 "Non-header phis should have been handled elsewhere"); 4137 4138 // In order to support recurrences we need to be able to vectorize Phi nodes. 4139 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4140 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4141 // this value when we vectorize all of the instructions that use the PHI. 4142 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4143 for (unsigned Part = 0; Part < UF; ++Part) { 4144 // This is phase one of vectorizing PHIs. 4145 Type *VecTy = 4146 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4147 Value *EntryPart = PHINode::Create( 4148 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4149 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4150 } 4151 return; 4152 } 4153 4154 setDebugLocFromInst(Builder, P); 4155 4156 // This PHINode must be an induction variable. 4157 // Make sure that we know about it. 4158 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4159 4160 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4161 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4162 4163 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4164 // which can be found from the original scalar operations. 
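  // As a hedged sketch (the values and value names here are hypothetical, not
  // taken from a particular test): for a pointer induction with VF = 4 and
  // UF = 1, the IK_PtrInduction case below emits one scalar GEP per lane
  // rather than a single vector GEP, roughly
  //   %idx.0     = add i64 %ptr.ind, 0
  //   %next.gep  = getelementptr i32, i32* %start, i64 %idx.0
  //   %idx.1     = add i64 %ptr.ind, 1
  //   %next.gep1 = getelementptr i32, i32* %start, i64 %idx.1
  //   ...
  // and only lane 0 is emitted when the phi is uniform after vectorization.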
4165 switch (II.getKind()) { 4166 case InductionDescriptor::IK_NoInduction: 4167 llvm_unreachable("Unknown induction"); 4168 case InductionDescriptor::IK_IntInduction: 4169 case InductionDescriptor::IK_FpInduction: 4170 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4171 case InductionDescriptor::IK_PtrInduction: { 4172 // Handle the pointer induction variable case. 4173 assert(P->getType()->isPointerTy() && "Unexpected type."); 4174 // This is the normalized GEP that starts counting at zero. 4175 Value *PtrInd = Induction; 4176 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4177 // Determine the number of scalars we need to generate for each unroll 4178 // iteration. If the instruction is uniform, we only need to generate the 4179 // first lane. Otherwise, we generate all VF values. 4180 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4181 // These are the scalar results. Notice that we don't generate vector GEPs 4182 // because scalar GEPs result in better code. 4183 for (unsigned Part = 0; Part < UF; ++Part) { 4184 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4185 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4186 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4187 Value *SclrGep = 4188 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4189 SclrGep->setName("next.gep"); 4190 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4191 } 4192 } 4193 return; 4194 } 4195 } 4196 } 4197 4198 /// A helper function for checking whether an integer division-related 4199 /// instruction may divide by zero (in which case it must be predicated if 4200 /// executed conditionally in the scalar code). 4201 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4202 /// Non-zero divisors that are non compile-time constants will not be 4203 /// converted into multiplication, so we will still end up scalarizing 4204 /// the division, but can do so w/o predication. 4205 static bool mayDivideByZero(Instruction &I) { 4206 assert((I.getOpcode() == Instruction::UDiv || 4207 I.getOpcode() == Instruction::SDiv || 4208 I.getOpcode() == Instruction::URem || 4209 I.getOpcode() == Instruction::SRem) && 4210 "Unexpected instruction"); 4211 Value *Divisor = I.getOperand(1); 4212 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4213 return !CInt || CInt->isZero(); 4214 } 4215 4216 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4217 switch (I.getOpcode()) { 4218 case Instruction::Br: 4219 case Instruction::PHI: 4220 case Instruction::GetElementPtr: 4221 llvm_unreachable("This instruction is handled by a different recipe."); 4222 case Instruction::UDiv: 4223 case Instruction::SDiv: 4224 case Instruction::SRem: 4225 case Instruction::URem: 4226 case Instruction::Add: 4227 case Instruction::FAdd: 4228 case Instruction::Sub: 4229 case Instruction::FSub: 4230 case Instruction::FNeg: 4231 case Instruction::Mul: 4232 case Instruction::FMul: 4233 case Instruction::FDiv: 4234 case Instruction::FRem: 4235 case Instruction::Shl: 4236 case Instruction::LShr: 4237 case Instruction::AShr: 4238 case Instruction::And: 4239 case Instruction::Or: 4240 case Instruction::Xor: { 4241 // Just widen unops and binops. 
4242 setDebugLocFromInst(Builder, &I); 4243 4244 for (unsigned Part = 0; Part < UF; ++Part) { 4245 SmallVector<Value *, 2> Ops; 4246 for (Value *Op : I.operands()) 4247 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4248 4249 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4250 4251 if (auto *VecOp = dyn_cast<Instruction>(V)) 4252 VecOp->copyIRFlags(&I); 4253 4254 // Use this vector value for all users of the original instruction. 4255 VectorLoopValueMap.setVectorValue(&I, Part, V); 4256 addMetadata(V, &I); 4257 } 4258 4259 break; 4260 } 4261 case Instruction::Select: { 4262 // Widen selects. 4263 // If the selector is loop invariant we can create a select 4264 // instruction with a scalar condition. Otherwise, use vector-select. 4265 auto *SE = PSE.getSE(); 4266 bool InvariantCond = 4267 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4268 setDebugLocFromInst(Builder, &I); 4269 4270 // The condition can be loop invariant but still defined inside the 4271 // loop. This means that we can't just use the original 'cond' value. 4272 // We have to take the 'vectorized' value and pick the first lane. 4273 // Instcombine will make this a no-op. 4274 4275 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4276 4277 for (unsigned Part = 0; Part < UF; ++Part) { 4278 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4279 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4280 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4281 Value *Sel = 4282 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4283 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4284 addMetadata(Sel, &I); 4285 } 4286 4287 break; 4288 } 4289 4290 case Instruction::ICmp: 4291 case Instruction::FCmp: { 4292 // Widen compares. Generate vector compares. 4293 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4294 auto *Cmp = cast<CmpInst>(&I); 4295 setDebugLocFromInst(Builder, Cmp); 4296 for (unsigned Part = 0; Part < UF; ++Part) { 4297 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4298 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4299 Value *C = nullptr; 4300 if (FCmp) { 4301 // Propagate fast math flags. 4302 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4303 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4304 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4305 } else { 4306 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4307 } 4308 VectorLoopValueMap.setVectorValue(&I, Part, C); 4309 addMetadata(C, &I); 4310 } 4311 4312 break; 4313 } 4314 4315 case Instruction::ZExt: 4316 case Instruction::SExt: 4317 case Instruction::FPToUI: 4318 case Instruction::FPToSI: 4319 case Instruction::FPExt: 4320 case Instruction::PtrToInt: 4321 case Instruction::IntToPtr: 4322 case Instruction::SIToFP: 4323 case Instruction::UIToFP: 4324 case Instruction::Trunc: 4325 case Instruction::FPTrunc: 4326 case Instruction::BitCast: { 4327 auto *CI = cast<CastInst>(&I); 4328 setDebugLocFromInst(Builder, CI); 4329 4330 /// Vectorize casts. 4331 Type *DestTy = 4332 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4333 4334 for (unsigned Part = 0; Part < UF; ++Part) { 4335 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4336 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4337 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4338 addMetadata(Cast, &I); 4339 } 4340 break; 4341 } 4342 4343 case Instruction::Call: { 4344 // Ignore dbg intrinsics. 
4345 if (isa<DbgInfoIntrinsic>(I)) 4346 break; 4347 setDebugLocFromInst(Builder, &I); 4348 4349 Module *M = I.getParent()->getParent()->getParent(); 4350 auto *CI = cast<CallInst>(&I); 4351 4352 SmallVector<Type *, 4> Tys; 4353 for (Value *ArgOperand : CI->arg_operands()) 4354 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4355 4356 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4357 4358 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4359 // version of the instruction. 4360 // Is it beneficial to perform intrinsic call compared to lib call? 4361 bool NeedToScalarize = false; 4362 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4363 bool UseVectorIntrinsic = 4364 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4365 assert((UseVectorIntrinsic || !NeedToScalarize) && 4366 "Instruction should be scalarized elsewhere."); 4367 4368 for (unsigned Part = 0; Part < UF; ++Part) { 4369 SmallVector<Value *, 4> Args; 4370 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4371 Value *Arg = CI->getArgOperand(i); 4372 // Some intrinsics have a scalar argument - don't replace it with a 4373 // vector. 4374 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4375 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4376 Args.push_back(Arg); 4377 } 4378 4379 Function *VectorF; 4380 if (UseVectorIntrinsic) { 4381 // Use vector version of the intrinsic. 4382 Type *TysForDecl[] = {CI->getType()}; 4383 if (VF > 1) 4384 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4385 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4386 } else { 4387 // Use vector version of the function call. 4388 const VFShape Shape = 4389 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4390 #ifndef NDEBUG 4391 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4392 assert(std::find_if(Infos.begin(), Infos.end(), 4393 [&Shape](const VFInfo &Info) { 4394 return Info.Shape == Shape; 4395 }) != Infos.end() && 4396 "Vector function shape is missing from the database."); 4397 #endif 4398 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4399 } 4400 assert(VectorF && "Can't create vector function."); 4401 4402 SmallVector<OperandBundleDef, 1> OpBundles; 4403 CI->getOperandBundlesAsDefs(OpBundles); 4404 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4405 4406 if (isa<FPMathOperator>(V)) 4407 V->copyFastMathFlags(CI); 4408 4409 VectorLoopValueMap.setVectorValue(&I, Part, V); 4410 addMetadata(V, &I); 4411 } 4412 4413 break; 4414 } 4415 4416 default: 4417 // This instruction is not vectorized by simple widening. 4418 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4419 llvm_unreachable("Unhandled instruction!"); 4420 } // end of switch. 4421 } 4422 4423 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4424 // We should not collect Scalars more than once per VF. Right now, this 4425 // function is called from collectUniformsAndScalars(), which already does 4426 // this check. Collecting Scalars for VF=1 does not make any sense. 4427 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4428 "This function should not be visited twice for the same VF"); 4429 4430 SmallSetVector<Instruction *, 8> Worklist; 4431 4432 // These sets are used to seed the analysis with pointers used by memory 4433 // accesses that will remain scalar. 
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
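  // As a hedged example (hypothetical source loop): in
  //   for (i = 0; i < n; ++i) A[i] += B[i];
  // the getelementptrs feeding the consecutive loads and stores are used only
  // as addresses, so evaluatePtrUse places them in ScalarPtrs and they remain
  // scalar address computations even though the loads and stores themselves
  // are widened.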
4503 for (auto *BB : TheLoop->blocks()) 4504 for (auto &I : *BB) { 4505 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4506 evaluatePtrUse(Load, Load->getPointerOperand()); 4507 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4508 evaluatePtrUse(Store, Store->getPointerOperand()); 4509 evaluatePtrUse(Store, Store->getValueOperand()); 4510 } 4511 } 4512 for (auto *I : ScalarPtrs) 4513 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4514 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4515 Worklist.insert(I); 4516 } 4517 4518 // (3) Add to the worklist all pointer induction variables and their update 4519 // instructions. 4520 // 4521 // TODO: Once we are able to vectorize pointer induction variables we should 4522 // no longer insert them into the worklist here. 4523 auto *Latch = TheLoop->getLoopLatch(); 4524 for (auto &Induction : Legal->getInductionVars()) { 4525 auto *Ind = Induction.first; 4526 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4527 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4528 continue; 4529 Worklist.insert(Ind); 4530 Worklist.insert(IndUpdate); 4531 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4532 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4533 << "\n"); 4534 } 4535 4536 // Insert the forced scalars. 4537 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4538 // induction variable when the PHI user is scalarized. 4539 auto ForcedScalar = ForcedScalars.find(VF); 4540 if (ForcedScalar != ForcedScalars.end()) 4541 for (auto *I : ForcedScalar->second) 4542 Worklist.insert(I); 4543 4544 // Expand the worklist by looking through any bitcasts and getelementptr 4545 // instructions we've already identified as scalar. This is similar to the 4546 // expansion step in collectLoopUniforms(); however, here we're only 4547 // expanding to include additional bitcasts and getelementptr instructions. 4548 unsigned Idx = 0; 4549 while (Idx != Worklist.size()) { 4550 Instruction *Dst = Worklist[Idx++]; 4551 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4552 continue; 4553 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4554 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4555 auto *J = cast<Instruction>(U); 4556 return !TheLoop->contains(J) || Worklist.count(J) || 4557 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4558 isScalarUse(J, Src)); 4559 })) { 4560 Worklist.insert(Src); 4561 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4562 } 4563 } 4564 4565 // An induction variable will remain scalar if all users of the induction 4566 // variable and induction variable update remain scalar. 4567 for (auto &Induction : Legal->getInductionVars()) { 4568 auto *Ind = Induction.first; 4569 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4570 4571 // We already considered pointer induction variables, so there's no reason 4572 // to look at their users again. 4573 // 4574 // TODO: Once we are able to vectorize pointer induction variables we 4575 // should no longer skip over them here. 4576 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4577 continue; 4578 4579 // Determine if all users of the induction variable are scalar after 4580 // vectorization. 
4581 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4582 auto *I = cast<Instruction>(U); 4583 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4584 }); 4585 if (!ScalarInd) 4586 continue; 4587 4588 // Determine if all users of the induction variable update instruction are 4589 // scalar after vectorization. 4590 auto ScalarIndUpdate = 4591 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4592 auto *I = cast<Instruction>(U); 4593 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4594 }); 4595 if (!ScalarIndUpdate) 4596 continue; 4597 4598 // The induction variable and its update instruction will remain scalar. 4599 Worklist.insert(Ind); 4600 Worklist.insert(IndUpdate); 4601 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4602 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4603 << "\n"); 4604 } 4605 4606 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4607 } 4608 4609 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4610 if (!blockNeedsPredication(I->getParent())) 4611 return false; 4612 switch(I->getOpcode()) { 4613 default: 4614 break; 4615 case Instruction::Load: 4616 case Instruction::Store: { 4617 if (!Legal->isMaskRequired(I)) 4618 return false; 4619 auto *Ptr = getLoadStorePointerOperand(I); 4620 auto *Ty = getMemInstValueType(I); 4621 // We have already decided how to vectorize this instruction, get that 4622 // result. 4623 if (VF > 1) { 4624 InstWidening WideningDecision = getWideningDecision(I, VF); 4625 assert(WideningDecision != CM_Unknown && 4626 "Widening decision should be ready at this moment"); 4627 return WideningDecision == CM_Scalarize; 4628 } 4629 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4630 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4631 isLegalMaskedGather(Ty, Alignment)) 4632 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4633 isLegalMaskedScatter(Ty, Alignment)); 4634 } 4635 case Instruction::UDiv: 4636 case Instruction::SDiv: 4637 case Instruction::SRem: 4638 case Instruction::URem: 4639 return mayDivideByZero(*I); 4640 } 4641 return false; 4642 } 4643 4644 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4645 unsigned VF) { 4646 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4647 assert(getWideningDecision(I, VF) == CM_Unknown && 4648 "Decision should not be set yet."); 4649 auto *Group = getInterleavedAccessGroup(I); 4650 assert(Group && "Must have a group."); 4651 4652 // If the instruction's allocated size doesn't equal it's type size, it 4653 // requires padding and will be scalarized. 4654 auto &DL = I->getModule()->getDataLayout(); 4655 auto *ScalarTy = getMemInstValueType(I); 4656 if (hasIrregularType(ScalarTy, DL, VF)) 4657 return false; 4658 4659 // Check if masking is required. 4660 // A Group may need masking for one of two reasons: it resides in a block that 4661 // needs predication, or it was decided to use masking to deal with gaps. 
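  // As a hedged example of the second case (hypothetical access pattern): a
  // group that loads A[3*i] and A[3*i+1] with interleave factor 3 has a gap,
  // so its last wide load may touch memory past the end of A. If a scalar
  // epilogue is not allowed to keep those final iterations out of the vector
  // loop, the group can only be widened when executed under a mask.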
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Initialize the entry for this VF so that, even if no uniform value is
  // found, we do not analyze this VF again; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of the current loop are out
  // of scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region in which only a single instance out of VF would be
  // formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
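  // As a hedged illustration (hypothetical source loop): in
  //   for (i = 0; i < n; ++i) if (c[i]) sum += x / d[i];
  // the division executes under a per-lane predicate and is scalarized into up
  // to VF guarded copies; treating it as uniform would leave a single
  // predicated copy standing in for all VF lanes.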
4740 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4741 if (isScalarWithPredication(I, VF)) { 4742 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4743 << *I << "\n"); 4744 return; 4745 } 4746 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4747 Worklist.insert(I); 4748 }; 4749 4750 // Start with the conditional branch. If the branch condition is an 4751 // instruction contained in the loop that is only used by the branch, it is 4752 // uniform. 4753 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4754 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4755 addToWorklistIfAllowed(Cmp); 4756 4757 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4758 // are pointers that are treated like consecutive pointers during 4759 // vectorization. The pointer operands of interleaved accesses are an 4760 // example. 4761 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4762 4763 // Holds pointer operands of instructions that are possibly non-uniform. 4764 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4765 4766 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4767 InstWidening WideningDecision = getWideningDecision(I, VF); 4768 assert(WideningDecision != CM_Unknown && 4769 "Widening decision should be ready at this moment"); 4770 4771 return (WideningDecision == CM_Widen || 4772 WideningDecision == CM_Widen_Reverse || 4773 WideningDecision == CM_Interleave); 4774 }; 4775 // Iterate over the instructions in the loop, and collect all 4776 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4777 // that a consecutive-like pointer operand will be scalarized, we collect it 4778 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4779 // getelementptr instruction can be used by both vectorized and scalarized 4780 // memory instructions. For example, if a loop loads and stores from the same 4781 // location, but the store is conditional, the store will be scalarized, and 4782 // the getelementptr won't remain uniform. 4783 for (auto *BB : TheLoop->blocks()) 4784 for (auto &I : *BB) { 4785 // If there's no pointer operand, there's nothing to do. 4786 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4787 if (!Ptr) 4788 continue; 4789 4790 // True if all users of Ptr are memory accesses that have Ptr as their 4791 // pointer operand. 4792 auto UsersAreMemAccesses = 4793 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4794 return getLoadStorePointerOperand(U) == Ptr; 4795 }); 4796 4797 // Ensure the memory instruction will not be scalarized or used by 4798 // gather/scatter, making its pointer operand non-uniform. If the pointer 4799 // operand is used by any instruction other than a memory access, we 4800 // conservatively assume the pointer operand may be non-uniform. 4801 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4802 PossibleNonUniformPtrs.insert(Ptr); 4803 4804 // If the memory instruction will be vectorized and its pointer operand 4805 // is consecutive-like, or interleaving - the pointer operand should 4806 // remain uniform. 4807 else 4808 ConsecutiveLikePtrs.insert(Ptr); 4809 } 4810 4811 // Add to the Worklist all consecutive and consecutive-like pointers that 4812 // aren't also identified as possibly non-uniform. 
4813 for (auto *V : ConsecutiveLikePtrs) 4814 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4815 addToWorklistIfAllowed(V); 4816 4817 // Expand Worklist in topological order: whenever a new instruction 4818 // is added , its users should be already inside Worklist. It ensures 4819 // a uniform instruction will only be used by uniform instructions. 4820 unsigned idx = 0; 4821 while (idx != Worklist.size()) { 4822 Instruction *I = Worklist[idx++]; 4823 4824 for (auto OV : I->operand_values()) { 4825 // isOutOfScope operands cannot be uniform instructions. 4826 if (isOutOfScope(OV)) 4827 continue; 4828 // First order recurrence Phi's should typically be considered 4829 // non-uniform. 4830 auto *OP = dyn_cast<PHINode>(OV); 4831 if (OP && Legal->isFirstOrderRecurrence(OP)) 4832 continue; 4833 // If all the users of the operand are uniform, then add the 4834 // operand into the uniform worklist. 4835 auto *OI = cast<Instruction>(OV); 4836 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4837 auto *J = cast<Instruction>(U); 4838 return Worklist.count(J) || 4839 (OI == getLoadStorePointerOperand(J) && 4840 isUniformDecision(J, VF)); 4841 })) 4842 addToWorklistIfAllowed(OI); 4843 } 4844 } 4845 4846 // Returns true if Ptr is the pointer operand of a memory access instruction 4847 // I, and I is known to not require scalarization. 4848 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4849 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4850 }; 4851 4852 // For an instruction to be added into Worklist above, all its users inside 4853 // the loop should also be in Worklist. However, this condition cannot be 4854 // true for phi nodes that form a cyclic dependence. We must process phi 4855 // nodes separately. An induction variable will remain uniform if all users 4856 // of the induction variable and induction variable update remain uniform. 4857 // The code below handles both pointer and non-pointer induction variables. 4858 for (auto &Induction : Legal->getInductionVars()) { 4859 auto *Ind = Induction.first; 4860 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4861 4862 // Determine if all users of the induction variable are uniform after 4863 // vectorization. 4864 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4865 auto *I = cast<Instruction>(U); 4866 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4867 isVectorizedMemAccessUse(I, Ind); 4868 }); 4869 if (!UniformInd) 4870 continue; 4871 4872 // Determine if all users of the induction variable update instruction are 4873 // uniform after vectorization. 4874 auto UniformIndUpdate = 4875 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4876 auto *I = cast<Instruction>(U); 4877 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4878 isVectorizedMemAccessUse(I, IndUpdate); 4879 }); 4880 if (!UniformIndUpdate) 4881 continue; 4882 4883 // The induction variable and its update instruction will remain uniform. 4884 addToWorklistIfAllowed(Ind); 4885 addToWorklistIfAllowed(IndUpdate); 4886 } 4887 4888 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4889 } 4890 4891 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4892 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4893 4894 if (Legal->getRuntimePointerChecking()->Need) { 4895 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4896 "runtime pointer checks needed. 
Enable vectorization of this " 4897 "loop with '#pragma clang loop vectorize(enable)' when " 4898 "compiling with -Os/-Oz", 4899 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4900 return true; 4901 } 4902 4903 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4904 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4905 "runtime SCEV checks needed. Enable vectorization of this " 4906 "loop with '#pragma clang loop vectorize(enable)' when " 4907 "compiling with -Os/-Oz", 4908 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4909 return true; 4910 } 4911 4912 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4913 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4914 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4915 "runtime stride == 1 checks needed. Enable vectorization of " 4916 "this loop with '#pragma clang loop vectorize(enable)' when " 4917 "compiling with -Os/-Oz", 4918 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4919 return true; 4920 } 4921 4922 return false; 4923 } 4924 4925 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4926 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4927 // TODO: It may by useful to do since it's still likely to be dynamically 4928 // uniform if the target can skip. 4929 reportVectorizationFailure( 4930 "Not inserting runtime ptr check for divergent target", 4931 "runtime pointer checks needed. Not enabled for divergent target", 4932 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4933 return None; 4934 } 4935 4936 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4937 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4938 if (TC == 1) { 4939 reportVectorizationFailure("Single iteration (non) loop", 4940 "loop trip count is one, irrelevant for vectorization", 4941 "SingleIterationLoop", ORE, TheLoop); 4942 return None; 4943 } 4944 4945 switch (ScalarEpilogueStatus) { 4946 case CM_ScalarEpilogueAllowed: 4947 return computeFeasibleMaxVF(TC); 4948 case CM_ScalarEpilogueNotNeededUsePredicate: 4949 LLVM_DEBUG( 4950 dbgs() << "LV: vector predicate hint/switch found.\n" 4951 << "LV: Not allowing scalar epilogue, creating predicated " 4952 << "vector loop.\n"); 4953 break; 4954 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4955 // fallthrough as a special case of OptForSize 4956 case CM_ScalarEpilogueNotAllowedOptSize: 4957 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4958 LLVM_DEBUG( 4959 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4960 else 4961 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4962 << "count.\n"); 4963 4964 // Bail if runtime checks are required, which are not good when optimising 4965 // for size. 4966 if (runtimeChecksRequired()) 4967 return None; 4968 break; 4969 } 4970 4971 // Now try the tail folding 4972 4973 // Invalidate interleave groups that require an epilogue if we can't mask 4974 // the interleave-group. 4975 if (!useMaskedInterleavedAccesses(TTI)) 4976 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4977 4978 unsigned MaxVF = computeFeasibleMaxVF(TC); 4979 if (TC > 0 && TC % MaxVF == 0) { 4980 // Accept MaxVF if we do not have a tail. 
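    // For example, a known trip count of 64 with MaxVF = 8 gives 64 % 8 == 0,
    // so the vector loop covers every iteration and neither a scalar tail nor
    // tail folding is needed (the numbers are illustrative only).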
4981 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4982 return MaxVF; 4983 } 4984 4985 // If we don't know the precise trip count, or if the trip count that we 4986 // found modulo the vectorization factor is not zero, try to fold the tail 4987 // by masking. 4988 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4989 if (Legal->prepareToFoldTailByMasking()) { 4990 FoldTailByMasking = true; 4991 return MaxVF; 4992 } 4993 4994 if (TC == 0) { 4995 reportVectorizationFailure( 4996 "Unable to calculate the loop count due to complex control flow", 4997 "unable to calculate the loop count due to complex control flow", 4998 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4999 return None; 5000 } 5001 5002 reportVectorizationFailure( 5003 "Cannot optimize for size and vectorize at the same time.", 5004 "cannot optimize for size and vectorize at the same time. " 5005 "Enable vectorization of this loop with '#pragma clang loop " 5006 "vectorize(enable)' when compiling with -Os/-Oz", 5007 "NoTailLoopWithOptForSize", ORE, TheLoop); 5008 return None; 5009 } 5010 5011 unsigned 5012 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5013 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5014 unsigned SmallestType, WidestType; 5015 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5016 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5017 5018 // Get the maximum safe dependence distance in bits computed by LAA. 5019 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5020 // the memory accesses that is most restrictive (involved in the smallest 5021 // dependence distance). 5022 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5023 5024 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5025 5026 unsigned MaxVectorSize = WidestRegister / WidestType; 5027 5028 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5029 << " / " << WidestType << " bits.\n"); 5030 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5031 << WidestRegister << " bits.\n"); 5032 5033 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5034 " into one vector!"); 5035 if (MaxVectorSize == 0) { 5036 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5037 MaxVectorSize = 1; 5038 return MaxVectorSize; 5039 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5040 isPowerOf2_32(ConstTripCount)) { 5041 // We need to clamp the VF to be the ConstTripCount. There is no point in 5042 // choosing a higher viable VF as done in the loop below. 5043 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5044 << ConstTripCount << "\n"); 5045 MaxVectorSize = ConstTripCount; 5046 return MaxVectorSize; 5047 } 5048 5049 unsigned MaxVF = MaxVectorSize; 5050 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5051 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5052 // Collect all viable vectorization factors larger than the default MaxVF 5053 // (i.e. MaxVectorSize). 5054 SmallVector<unsigned, 8> VFs; 5055 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5056 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5057 VFs.push_back(VS); 5058 5059 // For each VF calculate its register usage. 5060 auto RUs = calculateRegisterUsage(VFs); 5061 5062 // Select the largest VF which doesn't require more registers than existing 5063 // ones. 
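    // As a hedged example (hypothetical target parameters): with 256-bit
    // registers, a widest type of i64 and a smallest type of i8, the default
    // MaxVF is 256 / 64 = 4 and the candidate VFs collected above are 8, 16
    // and 32; the loop below keeps the largest candidate whose estimated
    // register usage still fits the target's register file.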
5064 for (int i = RUs.size() - 1; i >= 0; --i) { 5065 bool Selected = true; 5066 for (auto& pair : RUs[i].MaxLocalUsers) { 5067 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5068 if (pair.second > TargetNumRegisters) 5069 Selected = false; 5070 } 5071 if (Selected) { 5072 MaxVF = VFs[i]; 5073 break; 5074 } 5075 } 5076 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5077 if (MaxVF < MinVF) { 5078 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5079 << ") with target's minimum: " << MinVF << '\n'); 5080 MaxVF = MinVF; 5081 } 5082 } 5083 } 5084 return MaxVF; 5085 } 5086 5087 VectorizationFactor 5088 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5089 float Cost = expectedCost(1).first; 5090 const float ScalarCost = Cost; 5091 unsigned Width = 1; 5092 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5093 5094 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5095 if (ForceVectorization && MaxVF > 1) { 5096 // Ignore scalar width, because the user explicitly wants vectorization. 5097 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5098 // evaluation. 5099 Cost = std::numeric_limits<float>::max(); 5100 } 5101 5102 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5103 // Notice that the vector loop needs to be executed less times, so 5104 // we need to divide the cost of the vector loops by the width of 5105 // the vector elements. 5106 VectorizationCostTy C = expectedCost(i); 5107 float VectorCost = C.first / (float)i; 5108 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5109 << " costs: " << (int)VectorCost << ".\n"); 5110 if (!C.second && !ForceVectorization) { 5111 LLVM_DEBUG( 5112 dbgs() << "LV: Not considering vector loop of width " << i 5113 << " because it will not generate any vector instructions.\n"); 5114 continue; 5115 } 5116 if (VectorCost < Cost) { 5117 Cost = VectorCost; 5118 Width = i; 5119 } 5120 } 5121 5122 if (!EnableCondStoresVectorization && NumPredStores) { 5123 reportVectorizationFailure("There are conditional stores.", 5124 "store that is conditionally executed prevents vectorization", 5125 "ConditionalStore", ORE, TheLoop); 5126 Width = 1; 5127 Cost = ScalarCost; 5128 } 5129 5130 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5131 << "LV: Vectorization seems to be not beneficial, " 5132 << "but was forced by a user.\n"); 5133 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5134 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5135 return Factor; 5136 } 5137 5138 std::pair<unsigned, unsigned> 5139 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5140 unsigned MinWidth = -1U; 5141 unsigned MaxWidth = 8; 5142 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5143 5144 // For each block. 5145 for (BasicBlock *BB : TheLoop->blocks()) { 5146 // For each instruction in the loop. 5147 for (Instruction &I : BB->instructionsWithoutDebug()) { 5148 Type *T = I.getType(); 5149 5150 // Skip ignored values. 5151 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5152 continue; 5153 5154 // Only examine Loads, Stores and PHINodes. 5155 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5156 continue; 5157 5158 // Examine PHI nodes that are reduction variables. Update the type to 5159 // account for the recurrence type. 
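      // As a hedged example (hypothetical types): an add reduction computed in
      // i32 whose value is known to need only 8 bits has a recurrence type of
      // i8, so it contributes 8 rather than 32 bits to the smallest/widest
      // type calculation below.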
5160 if (auto *PN = dyn_cast<PHINode>(&I)) { 5161 if (!Legal->isReductionVariable(PN)) 5162 continue; 5163 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5164 T = RdxDesc.getRecurrenceType(); 5165 } 5166 5167 // Examine the stored values. 5168 if (auto *ST = dyn_cast<StoreInst>(&I)) 5169 T = ST->getValueOperand()->getType(); 5170 5171 // Ignore loaded pointer types and stored pointer types that are not 5172 // vectorizable. 5173 // 5174 // FIXME: The check here attempts to predict whether a load or store will 5175 // be vectorized. We only know this for certain after a VF has 5176 // been selected. Here, we assume that if an access can be 5177 // vectorized, it will be. We should also look at extending this 5178 // optimization to non-pointer types. 5179 // 5180 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5181 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5182 continue; 5183 5184 MinWidth = std::min(MinWidth, 5185 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5186 MaxWidth = std::max(MaxWidth, 5187 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5188 } 5189 } 5190 5191 return {MinWidth, MaxWidth}; 5192 } 5193 5194 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5195 unsigned LoopCost) { 5196 // -- The interleave heuristics -- 5197 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5198 // There are many micro-architectural considerations that we can't predict 5199 // at this level. For example, frontend pressure (on decode or fetch) due to 5200 // code size, or the number and capabilities of the execution ports. 5201 // 5202 // We use the following heuristics to select the interleave count: 5203 // 1. If the code has reductions, then we interleave to break the cross 5204 // iteration dependency. 5205 // 2. If the loop is really small, then we interleave to reduce the loop 5206 // overhead. 5207 // 3. We don't interleave if we think that we will spill registers to memory 5208 // due to the increased register pressure. 5209 5210 if (!isScalarEpilogueAllowed()) 5211 return 1; 5212 5213 // We used the distance for the interleave count. 5214 if (Legal->getMaxSafeDepDistBytes() != -1U) 5215 return 1; 5216 5217 // Do not interleave loops with a relatively small known or estimated trip 5218 // count. 5219 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5220 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5221 return 1; 5222 5223 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5224 // We divide by these constants so assume that we have at least one 5225 // instruction that uses at least one register. 5226 for (auto& pair : R.MaxLocalUsers) { 5227 pair.second = std::max(pair.second, 1U); 5228 } 5229 5230 // We calculate the interleave count using the following formula. 5231 // Subtract the number of loop invariants from the number of available 5232 // registers. These registers are used by all of the interleaved instances. 5233 // Next, divide the remaining registers by the number of registers that is 5234 // required by the loop, in order to estimate how many parallel instances 5235 // fit without causing spills. All of this is rounded down if necessary to be 5236 // a power of two. We want power of two interleave count to simplify any 5237 // addressing operations or alignment considerations. 
5238 // We also want power of two interleave counts to ensure that the induction 5239 // variable of the vector loop wraps to zero, when tail is folded by masking; 5240 // this currently happens when OptForSize, in which case IC is set to 1 above. 5241 unsigned IC = UINT_MAX; 5242 5243 for (auto& pair : R.MaxLocalUsers) { 5244 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5245 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5246 << " registers of " 5247 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5248 if (VF == 1) { 5249 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5250 TargetNumRegisters = ForceTargetNumScalarRegs; 5251 } else { 5252 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5253 TargetNumRegisters = ForceTargetNumVectorRegs; 5254 } 5255 unsigned MaxLocalUsers = pair.second; 5256 unsigned LoopInvariantRegs = 0; 5257 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5258 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5259 5260 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5261 // Don't count the induction variable as interleaved. 5262 if (EnableIndVarRegisterHeur) { 5263 TmpIC = 5264 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5265 std::max(1U, (MaxLocalUsers - 1))); 5266 } 5267 5268 IC = std::min(IC, TmpIC); 5269 } 5270 5271 // Clamp the interleave ranges to reasonable counts. 5272 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5273 5274 // Check if the user has overridden the max. 5275 if (VF == 1) { 5276 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5277 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5278 } else { 5279 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5280 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5281 } 5282 5283 // If trip count is known or estimated compile time constant, limit the 5284 // interleave count to be less than the trip count divided by VF. 5285 if (BestKnownTC) { 5286 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5287 } 5288 5289 // If we did not calculate the cost for VF (because the user selected the VF) 5290 // then we calculate the cost of VF here. 5291 if (LoopCost == 0) 5292 LoopCost = expectedCost(VF).first; 5293 5294 assert(LoopCost && "Non-zero loop cost expected"); 5295 5296 // Clamp the calculated IC to be between the 1 and the max interleave count 5297 // that the target and trip count allows. 5298 if (IC > MaxInterleaveCount) 5299 IC = MaxInterleaveCount; 5300 else if (IC < 1) 5301 IC = 1; 5302 5303 // Interleave if we vectorized this loop and there is a reduction that could 5304 // benefit from interleaving. 5305 if (VF > 1 && !Legal->getReductionVars().empty()) { 5306 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5307 return IC; 5308 } 5309 5310 // Note that if we've already vectorized the loop we will have done the 5311 // runtime check and so interleaving won't require further checks. 5312 bool InterleavingRequiresRuntimePointerCheck = 5313 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5314 5315 // We want to interleave small loops in order to reduce the loop overhead and 5316 // potentially expose ILP opportunities. 
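  // As a hedged numeric example: if the loop body costs 5 and the SmallLoopCost
  // threshold is, say, 20, the code below settles on roughly
  // PowerOf2Floor(20 / 5) = 4 interleaved copies, so the fixed per-iteration
  // overhead (induction update and branch) is amortized over four iterations.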
5317 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5318 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5319 // We assume that the cost overhead is 1 and we use the cost model 5320 // to estimate the cost of the loop and interleave until the cost of the 5321 // loop overhead is about 5% of the cost of the loop. 5322 unsigned SmallIC = 5323 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5324 5325 // Interleave until store/load ports (estimated by max interleave count) are 5326 // saturated. 5327 unsigned NumStores = Legal->getNumStores(); 5328 unsigned NumLoads = Legal->getNumLoads(); 5329 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5330 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5331 5332 // If we have a scalar reduction (vector reductions are already dealt with 5333 // by this point), we can increase the critical path length if the loop 5334 // we're interleaving is inside another loop. Limit, by default to 2, so the 5335 // critical path only gets increased by one reduction operation. 5336 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5337 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5338 SmallIC = std::min(SmallIC, F); 5339 StoresIC = std::min(StoresIC, F); 5340 LoadsIC = std::min(LoadsIC, F); 5341 } 5342 5343 if (EnableLoadStoreRuntimeInterleave && 5344 std::max(StoresIC, LoadsIC) > SmallIC) { 5345 LLVM_DEBUG( 5346 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5347 return std::max(StoresIC, LoadsIC); 5348 } 5349 5350 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5351 return SmallIC; 5352 } 5353 5354 // Interleave if this is a large loop (small loops are already dealt with by 5355 // this point) that could benefit from interleaving. 5356 bool HasReductions = !Legal->getReductionVars().empty(); 5357 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5358 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5359 return IC; 5360 } 5361 5362 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5363 return 1; 5364 } 5365 5366 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5367 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5368 // This function calculates the register usage by measuring the highest number 5369 // of values that are alive at a single location. Obviously, this is a very 5370 // rough estimation. We scan the loop in a topological order in order and 5371 // assign a number to each instruction. We use RPO to ensure that defs are 5372 // met before their users. We assume that each instruction that has in-loop 5373 // users starts an interval. We record every time that an in-loop value is 5374 // used, so we have a list of the first and last occurrences of each 5375 // instruction. Next, we transpose this data structure into a multi map that 5376 // holds the list of intervals that *end* at a specific location. This multi 5377 // map allows us to perform a linear search. We scan the instructions linearly 5378 // and record each time that a new interval starts, by placing it in a set. 5379 // If we find this value in the multi-map then we remove it from the set. 5380 // The max register usage is the maximum size of the set. 5381 // We also search for instructions that are defined outside the loop, but are 5382 // used inside the loop. 
We need this number separately from the max-interval 5383 // usage number because when we unroll, loop-invariant values do not take 5384 // more register. 5385 LoopBlocksDFS DFS(TheLoop); 5386 DFS.perform(LI); 5387 5388 RegisterUsage RU; 5389 5390 // Each 'key' in the map opens a new interval. The values 5391 // of the map are the index of the 'last seen' usage of the 5392 // instruction that is the key. 5393 using IntervalMap = DenseMap<Instruction *, unsigned>; 5394 5395 // Maps instruction to its index. 5396 SmallVector<Instruction *, 64> IdxToInstr; 5397 // Marks the end of each interval. 5398 IntervalMap EndPoint; 5399 // Saves the list of instruction indices that are used in the loop. 5400 SmallPtrSet<Instruction *, 8> Ends; 5401 // Saves the list of values that are used in the loop but are 5402 // defined outside the loop, such as arguments and constants. 5403 SmallPtrSet<Value *, 8> LoopInvariants; 5404 5405 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5406 for (Instruction &I : BB->instructionsWithoutDebug()) { 5407 IdxToInstr.push_back(&I); 5408 5409 // Save the end location of each USE. 5410 for (Value *U : I.operands()) { 5411 auto *Instr = dyn_cast<Instruction>(U); 5412 5413 // Ignore non-instruction values such as arguments, constants, etc. 5414 if (!Instr) 5415 continue; 5416 5417 // If this instruction is outside the loop then record it and continue. 5418 if (!TheLoop->contains(Instr)) { 5419 LoopInvariants.insert(Instr); 5420 continue; 5421 } 5422 5423 // Overwrite previous end points. 5424 EndPoint[Instr] = IdxToInstr.size(); 5425 Ends.insert(Instr); 5426 } 5427 } 5428 } 5429 5430 // Saves the list of intervals that end with the index in 'key'. 5431 using InstrList = SmallVector<Instruction *, 2>; 5432 DenseMap<unsigned, InstrList> TransposeEnds; 5433 5434 // Transpose the EndPoints to a list of values that end at each index. 5435 for (auto &Interval : EndPoint) 5436 TransposeEnds[Interval.second].push_back(Interval.first); 5437 5438 SmallPtrSet<Instruction *, 8> OpenIntervals; 5439 5440 // Get the size of the widest register. 5441 unsigned MaxSafeDepDist = -1U; 5442 if (Legal->getMaxSafeDepDistBytes() != -1U) 5443 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5444 unsigned WidestRegister = 5445 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5446 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5447 5448 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5449 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5450 5451 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5452 5453 // A lambda that gets the register usage for the given type and VF. 5454 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5455 if (Ty->isTokenTy()) 5456 return 0U; 5457 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5458 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5459 }; 5460 5461 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5462 Instruction *I = IdxToInstr[i]; 5463 5464 // Remove all of the instructions that end at this location. 5465 InstrList &List = TransposeEnds[i]; 5466 for (Instruction *ToRemove : List) 5467 OpenIntervals.erase(ToRemove); 5468 5469 // Ignore instructions that are never used within the loop. 5470 if (Ends.find(I) == Ends.end()) 5471 continue; 5472 5473 // Skip ignored values. 5474 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5475 continue; 5476 5477 // For each VF find the maximum usage of registers. 
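    // As a hedged example (hypothetical type and register width): with 256-bit
    // vector registers, an open interval of type i32 counts as one register at
    // VF = 8 (8 * 32 / 256) and as two at VF = 16, while values that stay
    // scalar, and every value when VF = 1, count as a single register of the
    // scalar class.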
5478 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5479 // Count the number of live intervals. 5480 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5481 5482 if (VFs[j] == 1) { 5483 for (auto Inst : OpenIntervals) { 5484 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5485 if (RegUsage.find(ClassID) == RegUsage.end()) 5486 RegUsage[ClassID] = 1; 5487 else 5488 RegUsage[ClassID] += 1; 5489 } 5490 } else { 5491 collectUniformsAndScalars(VFs[j]); 5492 for (auto Inst : OpenIntervals) { 5493 // Skip ignored values for VF > 1. 5494 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5495 continue; 5496 if (isScalarAfterVectorization(Inst, VFs[j])) { 5497 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5498 if (RegUsage.find(ClassID) == RegUsage.end()) 5499 RegUsage[ClassID] = 1; 5500 else 5501 RegUsage[ClassID] += 1; 5502 } else { 5503 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5504 if (RegUsage.find(ClassID) == RegUsage.end()) 5505 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5506 else 5507 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5508 } 5509 } 5510 } 5511 5512 for (auto& pair : RegUsage) { 5513 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5514 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5515 else 5516 MaxUsages[j][pair.first] = pair.second; 5517 } 5518 } 5519 5520 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5521 << OpenIntervals.size() << '\n'); 5522 5523 // Add the current instruction to the list of open intervals. 5524 OpenIntervals.insert(I); 5525 } 5526 5527 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5528 SmallMapVector<unsigned, unsigned, 4> Invariant; 5529 5530 for (auto Inst : LoopInvariants) { 5531 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5532 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5533 if (Invariant.find(ClassID) == Invariant.end()) 5534 Invariant[ClassID] = Usage; 5535 else 5536 Invariant[ClassID] += Usage; 5537 } 5538 5539 LLVM_DEBUG({ 5540 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5541 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5542 << " item\n"; 5543 for (const auto &pair : MaxUsages[i]) { 5544 dbgs() << "LV(REG): RegisterClass: " 5545 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5546 << " registers\n"; 5547 } 5548 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5549 << " item\n"; 5550 for (const auto &pair : Invariant) { 5551 dbgs() << "LV(REG): RegisterClass: " 5552 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5553 << " registers\n"; 5554 } 5555 }); 5556 5557 RU.LoopInvariantRegs = Invariant; 5558 RU.MaxLocalUsers = MaxUsages[i]; 5559 RUs[i] = RU; 5560 } 5561 5562 return RUs; 5563 } 5564 5565 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5566 // TODO: Cost model for emulated masked load/store is completely 5567 // broken. This hack guides the cost model to use an artificially 5568 // high enough value to practically disable vectorization with such 5569 // operations, except where previously deployed legality hack allowed 5570 // using very low cost values. This is to avoid regressions coming simply 5571 // from moving "masked load/store" check from legality to cost model. 5572 // Masked Load/Gather emulation was previously never allowed. 5573 // Limited number of Masked Store/Scatter emulation was allowed. 
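// Illustrative summary of the current behavior (not a guarantee): emulated
// masked loads are always routed through this hack, while emulated masked
// stores are only routed through it once the number of predicated stores
// exceeds the NumberOfStoresToPredicate threshold. Instructions selected here
// later receive the artificially high cost used in
// getMemInstScalarizationCost, which in practice disables vectorization of
// the surrounding loop.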
5574 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5575 return isa<LoadInst>(I) ||
5576 (isa<StoreInst>(I) &&
5577 NumPredStores > NumberOfStoresToPredicate);
5578 }
5579
5580 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5581 // If we aren't vectorizing the loop, or if we've already collected the
5582 // instructions to scalarize, there's nothing to do. Collection may already
5583 // have occurred if we have a user-selected VF and are now computing the
5584 // expected cost for interleaving.
5585 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5586 return;
5587
5588 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5589 // not profitable to scalarize any instructions, the presence of VF in the
5590 // map will indicate that we've analyzed it already.
5591 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5592
5593 // Find all the instructions that are scalar with predication in the loop and
5594 // determine if it would be better not to if-convert the blocks they are in.
5595 // If so, we also record the instructions to scalarize.
5596 for (BasicBlock *BB : TheLoop->blocks()) {
5597 if (!blockNeedsPredication(BB))
5598 continue;
5599 for (Instruction &I : *BB)
5600 if (isScalarWithPredication(&I)) {
5601 ScalarCostsTy ScalarCosts;
5602 // Do not apply discount logic if hacked cost is needed
5603 // for emulated masked memrefs.
5604 if (!useEmulatedMaskMemRefHack(&I) &&
5605 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5606 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5607 // Remember that BB will remain after vectorization.
5608 PredicatedBBsAfterVectorization.insert(BB);
5609 }
5610 }
5611 }
5612
5613 int LoopVectorizationCostModel::computePredInstDiscount(
5614 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5615 unsigned VF) {
5616 assert(!isUniformAfterVectorization(PredInst, VF) &&
5617 "Instruction marked uniform-after-vectorization will be predicated");
5618
5619 // Initialize the discount to zero, meaning that the scalar version and the
5620 // vector version cost the same.
5621 int Discount = 0;
5622
5623 // Holds instructions to analyze. The instructions we visit are mapped in
5624 // ScalarCosts. Those instructions are the ones that would be scalarized if
5625 // we find that the scalar version costs less.
5626 SmallVector<Instruction *, 8> Worklist;
5627
5628 // Returns true if the given instruction can be scalarized.
5629 auto canBeScalarized = [&](Instruction *I) -> bool {
5630 // We only attempt to scalarize instructions forming a single-use chain
5631 // from the original predicated block that would otherwise be vectorized.
5632 // Although not strictly necessary, we give up on instructions we know will
5633 // already be scalar to avoid traversing chains that are unlikely to be
5634 // beneficial.
5635 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5636 isScalarAfterVectorization(I, VF))
5637 return false;
5638
5639 // If the instruction is scalar with predication, it will be analyzed
5640 // separately. We ignore it within the context of PredInst.
5641 if (isScalarWithPredication(I))
5642 return false;
5643
5644 // If any of the instruction's operands are uniform after vectorization,
5645 // the instruction cannot be scalarized. This prevents, for example, a
5646 // masked load from being scalarized.
5647 // 5648 // We assume we will only emit a value for lane zero of an instruction 5649 // marked uniform after vectorization, rather than VF identical values. 5650 // Thus, if we scalarize an instruction that uses a uniform, we would 5651 // create uses of values corresponding to the lanes we aren't emitting code 5652 // for. This behavior can be changed by allowing getScalarValue to clone 5653 // the lane zero values for uniforms rather than asserting. 5654 for (Use &U : I->operands()) 5655 if (auto *J = dyn_cast<Instruction>(U.get())) 5656 if (isUniformAfterVectorization(J, VF)) 5657 return false; 5658 5659 // Otherwise, we can scalarize the instruction. 5660 return true; 5661 }; 5662 5663 // Compute the expected cost discount from scalarizing the entire expression 5664 // feeding the predicated instruction. We currently only consider expressions 5665 // that are single-use instruction chains. 5666 Worklist.push_back(PredInst); 5667 while (!Worklist.empty()) { 5668 Instruction *I = Worklist.pop_back_val(); 5669 5670 // If we've already analyzed the instruction, there's nothing to do. 5671 if (ScalarCosts.find(I) != ScalarCosts.end()) 5672 continue; 5673 5674 // Compute the cost of the vector instruction. Note that this cost already 5675 // includes the scalarization overhead of the predicated instruction. 5676 unsigned VectorCost = getInstructionCost(I, VF).first; 5677 5678 // Compute the cost of the scalarized instruction. This cost is the cost of 5679 // the instruction as if it wasn't if-converted and instead remained in the 5680 // predicated block. We will scale this cost by block probability after 5681 // computing the scalarization overhead. 5682 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5683 5684 // Compute the scalarization overhead of needed insertelement instructions 5685 // and phi nodes. 5686 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5687 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5688 true, false); 5689 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5690 } 5691 5692 // Compute the scalarization overhead of needed extractelement 5693 // instructions. For each of the instruction's operands, if the operand can 5694 // be scalarized, add it to the worklist; otherwise, account for the 5695 // overhead. 5696 for (Use &U : I->operands()) 5697 if (auto *J = dyn_cast<Instruction>(U.get())) { 5698 assert(VectorType::isValidElementType(J->getType()) && 5699 "Instruction has non-scalar type"); 5700 if (canBeScalarized(J)) 5701 Worklist.push_back(J); 5702 else if (needsExtract(J, VF)) 5703 ScalarCost += TTI.getScalarizationOverhead( 5704 ToVectorTy(J->getType(),VF), false, true); 5705 } 5706 5707 // Scale the total scalar cost by block probability. 5708 ScalarCost /= getReciprocalPredBlockProb(); 5709 5710 // Compute the discount. A non-negative discount means the vector version 5711 // of the instruction costs more, and scalarizing would be beneficial. 5712 Discount += VectorCost - ScalarCost; 5713 ScalarCosts[I] = ScalarCost; 5714 } 5715 5716 return Discount; 5717 } 5718 5719 LoopVectorizationCostModel::VectorizationCostTy 5720 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5721 VectorizationCostTy Cost; 5722 5723 // For each block. 5724 for (BasicBlock *BB : TheLoop->blocks()) { 5725 VectorizationCostTy BlockCost; 5726 5727 // For each instruction in the old loop. 5728 for (Instruction &I : BB->instructionsWithoutDebug()) { 5729 // Skip ignored values. 
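// For example (illustrative): the ephemeral values feeding assumptions and
// the casts recorded by collectValuesToIgnore for reductions and inductions
// fall into these sets; the former are skipped for every VF, the latter only
// when VF > 1, so they still contribute to the scalar cost.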
5730 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5731 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5732 continue; 5733 5734 VectorizationCostTy C = getInstructionCost(&I, VF); 5735 5736 // Check if we should override the cost. 5737 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5738 C.first = ForceTargetInstructionCost; 5739 5740 BlockCost.first += C.first; 5741 BlockCost.second |= C.second; 5742 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5743 << " for VF " << VF << " For instruction: " << I 5744 << '\n'); 5745 } 5746 5747 // If we are vectorizing a predicated block, it will have been 5748 // if-converted. This means that the block's instructions (aside from 5749 // stores and instructions that may divide by zero) will now be 5750 // unconditionally executed. For the scalar case, we may not always execute 5751 // the predicated block. Thus, scale the block's cost by the probability of 5752 // executing it. 5753 if (VF == 1 && blockNeedsPredication(BB)) 5754 BlockCost.first /= getReciprocalPredBlockProb(); 5755 5756 Cost.first += BlockCost.first; 5757 Cost.second |= BlockCost.second; 5758 } 5759 5760 return Cost; 5761 } 5762 5763 /// Gets Address Access SCEV after verifying that the access pattern 5764 /// is loop invariant except the induction variable dependence. 5765 /// 5766 /// This SCEV can be sent to the Target in order to estimate the address 5767 /// calculation cost. 5768 static const SCEV *getAddressAccessSCEV( 5769 Value *Ptr, 5770 LoopVectorizationLegality *Legal, 5771 PredicatedScalarEvolution &PSE, 5772 const Loop *TheLoop) { 5773 5774 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5775 if (!Gep) 5776 return nullptr; 5777 5778 // We are looking for a gep with all loop invariant indices except for one 5779 // which should be an induction variable. 5780 auto SE = PSE.getSE(); 5781 unsigned NumOperands = Gep->getNumOperands(); 5782 for (unsigned i = 1; i < NumOperands; ++i) { 5783 Value *Opd = Gep->getOperand(i); 5784 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5785 !Legal->isInductionVariable(Opd)) 5786 return nullptr; 5787 } 5788 5789 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5790 return PSE.getSCEV(Ptr); 5791 } 5792 5793 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5794 return Legal->hasStride(I->getOperand(0)) || 5795 Legal->hasStride(I->getOperand(1)); 5796 } 5797 5798 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5799 unsigned VF) { 5800 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5801 Type *ValTy = getMemInstValueType(I); 5802 auto SE = PSE.getSE(); 5803 5804 unsigned AS = getLoadStoreAddressSpace(I); 5805 Value *Ptr = getLoadStorePointerOperand(I); 5806 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5807 5808 // Figure out whether the access is strided and get the stride value 5809 // if it's known in compile time 5810 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5811 5812 // Get the cost of the scalar memory instruction and address computation. 5813 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5814 5815 // Don't pass *I here, since it is scalar but will actually be part of a 5816 // vectorized loop where the user of it is a vectorized instruction. 
5817 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5818 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5819 Alignment, AS); 5820 5821 // Get the overhead of the extractelement and insertelement instructions 5822 // we might create due to scalarization. 5823 Cost += getScalarizationOverhead(I, VF); 5824 5825 // If we have a predicated store, it may not be executed for each vector 5826 // lane. Scale the cost by the probability of executing the predicated 5827 // block. 5828 if (isPredicatedInst(I)) { 5829 Cost /= getReciprocalPredBlockProb(); 5830 5831 if (useEmulatedMaskMemRefHack(I)) 5832 // Artificially setting to a high enough value to practically disable 5833 // vectorization with such operations. 5834 Cost = 3000000; 5835 } 5836 5837 return Cost; 5838 } 5839 5840 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5841 unsigned VF) { 5842 Type *ValTy = getMemInstValueType(I); 5843 Type *VectorTy = ToVectorTy(ValTy, VF); 5844 Value *Ptr = getLoadStorePointerOperand(I); 5845 unsigned AS = getLoadStoreAddressSpace(I); 5846 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5847 5848 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5849 "Stride should be 1 or -1 for consecutive memory access"); 5850 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5851 unsigned Cost = 0; 5852 if (Legal->isMaskRequired(I)) 5853 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5854 Alignment ? Alignment->value() : 0, AS); 5855 else 5856 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5857 5858 bool Reverse = ConsecutiveStride < 0; 5859 if (Reverse) 5860 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5861 return Cost; 5862 } 5863 5864 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5865 unsigned VF) { 5866 Type *ValTy = getMemInstValueType(I); 5867 Type *VectorTy = ToVectorTy(ValTy, VF); 5868 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5869 unsigned AS = getLoadStoreAddressSpace(I); 5870 if (isa<LoadInst>(I)) { 5871 return TTI.getAddressComputationCost(ValTy) + 5872 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5873 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5874 } 5875 StoreInst *SI = cast<StoreInst>(I); 5876 5877 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5878 return TTI.getAddressComputationCost(ValTy) + 5879 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5880 (isLoopInvariantStoreValue 5881 ? 0 5882 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5883 VF - 1)); 5884 } 5885 5886 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5887 unsigned VF) { 5888 Type *ValTy = getMemInstValueType(I); 5889 Type *VectorTy = ToVectorTy(ValTy, VF); 5890 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5891 Value *Ptr = getLoadStorePointerOperand(I); 5892 5893 return TTI.getAddressComputationCost(VectorTy) + 5894 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5895 Legal->isMaskRequired(I), 5896 Alignment ? 
Alignment->value() : 0, I); 5897 } 5898 5899 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5900 unsigned VF) { 5901 Type *ValTy = getMemInstValueType(I); 5902 Type *VectorTy = ToVectorTy(ValTy, VF); 5903 unsigned AS = getLoadStoreAddressSpace(I); 5904 5905 auto Group = getInterleavedAccessGroup(I); 5906 assert(Group && "Fail to get an interleaved access group."); 5907 5908 unsigned InterleaveFactor = Group->getFactor(); 5909 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5910 5911 // Holds the indices of existing members in an interleaved load group. 5912 // An interleaved store group doesn't need this as it doesn't allow gaps. 5913 SmallVector<unsigned, 4> Indices; 5914 if (isa<LoadInst>(I)) { 5915 for (unsigned i = 0; i < InterleaveFactor; i++) 5916 if (Group->getMember(i)) 5917 Indices.push_back(i); 5918 } 5919 5920 // Calculate the cost of the whole interleaved group. 5921 bool UseMaskForGaps = 5922 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5923 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5924 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5925 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5926 5927 if (Group->isReverse()) { 5928 // TODO: Add support for reversed masked interleaved access. 5929 assert(!Legal->isMaskRequired(I) && 5930 "Reverse masked interleaved access not supported."); 5931 Cost += Group->getNumMembers() * 5932 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5933 } 5934 return Cost; 5935 } 5936 5937 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5938 unsigned VF) { 5939 // Calculate scalar cost only. Vectorization cost should be ready at this 5940 // moment. 5941 if (VF == 1) { 5942 Type *ValTy = getMemInstValueType(I); 5943 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5944 unsigned AS = getLoadStoreAddressSpace(I); 5945 5946 return TTI.getAddressComputationCost(ValTy) + 5947 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5948 } 5949 return getWideningCost(I, VF); 5950 } 5951 5952 LoopVectorizationCostModel::VectorizationCostTy 5953 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5954 // If we know that this instruction will remain uniform, check the cost of 5955 // the scalar version. 5956 if (isUniformAfterVectorization(I, VF)) 5957 VF = 1; 5958 5959 if (VF > 1 && isProfitableToScalarize(I, VF)) 5960 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5961 5962 // Forced scalars do not have any scalarization overhead. 
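// For example (illustrative): an address computation that was forced to stay
// scalar for VF = 4 is charged four times its scalar cost below, and no
// insertelement/extractelement overhead is added on top of that.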
5963 auto ForcedScalar = ForcedScalars.find(VF); 5964 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5965 auto InstSet = ForcedScalar->second; 5966 if (InstSet.find(I) != InstSet.end()) 5967 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5968 } 5969 5970 Type *VectorTy; 5971 unsigned C = getInstructionCost(I, VF, VectorTy); 5972 5973 bool TypeNotScalarized = 5974 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5975 return VectorizationCostTy(C, TypeNotScalarized); 5976 } 5977 5978 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5979 unsigned VF) { 5980 5981 if (VF == 1) 5982 return 0; 5983 5984 unsigned Cost = 0; 5985 Type *RetTy = ToVectorTy(I->getType(), VF); 5986 if (!RetTy->isVoidTy() && 5987 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5988 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5989 5990 // Some targets keep addresses scalar. 5991 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5992 return Cost; 5993 5994 // Some targets support efficient element stores. 5995 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5996 return Cost; 5997 5998 // Collect operands to consider. 5999 CallInst *CI = dyn_cast<CallInst>(I); 6000 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6001 6002 // Skip operands that do not require extraction/scalarization and do not incur 6003 // any overhead. 6004 return Cost + TTI.getOperandsScalarizationOverhead( 6005 filterExtractingOperands(Ops, VF), VF); 6006 } 6007 6008 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6009 if (VF == 1) 6010 return; 6011 NumPredStores = 0; 6012 for (BasicBlock *BB : TheLoop->blocks()) { 6013 // For each instruction in the old loop. 6014 for (Instruction &I : *BB) { 6015 Value *Ptr = getLoadStorePointerOperand(&I); 6016 if (!Ptr) 6017 continue; 6018 6019 // TODO: We should generate better code and update the cost model for 6020 // predicated uniform stores. Today they are treated as any other 6021 // predicated store (see added test cases in 6022 // invariant-store-vectorization.ll). 6023 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6024 NumPredStores++; 6025 6026 if (Legal->isUniform(Ptr) && 6027 // Conditional loads and stores should be scalarized and predicated. 6028 // isScalarWithPredication cannot be used here since masked 6029 // gather/scatters are not considered scalar with predication. 6030 !Legal->blockNeedsPredication(I.getParent())) { 6031 // TODO: Avoid replicating loads and stores instead of 6032 // relying on instcombine to remove them. 6033 // Load: Scalar load + broadcast 6034 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6035 unsigned Cost = getUniformMemOpCost(&I, VF); 6036 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6037 continue; 6038 } 6039 6040 // We assume that widening is the best solution when possible. 6041 if (memoryInstructionCanBeWidened(&I, VF)) { 6042 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6043 int ConsecutiveStride = 6044 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6045 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6046 "Expected consecutive stride."); 6047 InstWidening Decision = 6048 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6049 setWideningDecision(&I, VF, Decision, Cost); 6050 continue; 6051 } 6052 6053 // Choose between Interleaving, Gather/Scatter or Scalarization. 
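// A sketch of the comparison below, with made-up costs:
//   InterleaveCost = 8, GatherScatterCost = 12, ScalarizationCost = 20
//     -> CM_Interleave (a tie with gather/scatter would also favor
//        interleaving, since that test uses '<=').
//   InterleaveCost = UINT_MAX, GatherScatterCost = 20, ScalarizationCost = 20
//     -> CM_Scalarize (gather/scatter must be strictly cheaper than
//        scalarization to be chosen).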
6054 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6055 unsigned NumAccesses = 1;
6056 if (isAccessInterleaved(&I)) {
6057 auto Group = getInterleavedAccessGroup(&I);
6058 assert(Group && "Fail to get an interleaved access group.");
6059
6060 // Make one decision for the whole group.
6061 if (getWideningDecision(&I, VF) != CM_Unknown)
6062 continue;
6063
6064 NumAccesses = Group->getNumMembers();
6065 if (interleavedAccessCanBeWidened(&I, VF))
6066 InterleaveCost = getInterleaveGroupCost(&I, VF);
6067 }
6068
6069 unsigned GatherScatterCost =
6070 isLegalGatherOrScatter(&I)
6071 ? getGatherScatterCost(&I, VF) * NumAccesses
6072 : std::numeric_limits<unsigned>::max();
6073
6074 unsigned ScalarizationCost =
6075 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6076
6077 // Choose the best solution for the current VF,
6078 // record this decision and use it during vectorization.
6079 unsigned Cost;
6080 InstWidening Decision;
6081 if (InterleaveCost <= GatherScatterCost &&
6082 InterleaveCost < ScalarizationCost) {
6083 Decision = CM_Interleave;
6084 Cost = InterleaveCost;
6085 } else if (GatherScatterCost < ScalarizationCost) {
6086 Decision = CM_GatherScatter;
6087 Cost = GatherScatterCost;
6088 } else {
6089 Decision = CM_Scalarize;
6090 Cost = ScalarizationCost;
6091 }
6092 // If the instruction belongs to an interleave group, the whole group
6093 // receives the same decision. The whole group receives the cost, but
6094 // the cost will actually be assigned to one instruction.
6095 if (auto Group = getInterleavedAccessGroup(&I))
6096 setWideningDecision(Group, VF, Decision, Cost);
6097 else
6098 setWideningDecision(&I, VF, Decision, Cost);
6099 }
6100 }
6101
6102 // Make sure that any load of an address and any other address computation
6103 // remains scalar unless there is gather/scatter support. This avoids
6104 // inevitable extracts into address registers, and also has the benefit of
6105 // activating LSR more, since that pass can't optimize vectorized
6106 // addresses.
6107 if (TTI.prefersVectorizedAddressing())
6108 return;
6109
6110 // Start with all scalar pointer uses.
6111 SmallPtrSet<Instruction *, 8> AddrDefs;
6112 for (BasicBlock *BB : TheLoop->blocks())
6113 for (Instruction &I : *BB) {
6114 Instruction *PtrDef =
6115 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6116 if (PtrDef && TheLoop->contains(PtrDef) &&
6117 getWideningDecision(&I, VF) != CM_GatherScatter)
6118 AddrDefs.insert(PtrDef);
6119 }
6120
6121 // Add all instructions used to generate the addresses.
6122 SmallVector<Instruction *, 4> Worklist;
6123 for (auto *I : AddrDefs)
6124 Worklist.push_back(I);
6125 while (!Worklist.empty()) {
6126 Instruction *I = Worklist.pop_back_val();
6127 for (auto &Op : I->operands())
6128 if (auto *InstOp = dyn_cast<Instruction>(Op))
6129 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6130 AddrDefs.insert(InstOp).second)
6131 Worklist.push_back(InstOp);
6132 }
6133
6134 for (auto *I : AddrDefs) {
6135 if (isa<LoadInst>(I)) {
6136 // Setting the desired widening decision should ideally be handled
6137 // by cost functions, but since this involves the task of finding out
6138 // if the loaded register is involved in an address computation, it is
6139 // instead changed here when we know this is the case.
6140 InstWidening Decision = getWideningDecision(I, VF);
6141 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6142 // Scalarize a widened load of address.
6143 setWideningDecision(I, VF, CM_Scalarize, 6144 (VF * getMemoryInstructionCost(I, 1))); 6145 else if (auto Group = getInterleavedAccessGroup(I)) { 6146 // Scalarize an interleave group of address loads. 6147 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6148 if (Instruction *Member = Group->getMember(I)) 6149 setWideningDecision(Member, VF, CM_Scalarize, 6150 (VF * getMemoryInstructionCost(Member, 1))); 6151 } 6152 } 6153 } else 6154 // Make sure I gets scalarized and a cost estimate without 6155 // scalarization overhead. 6156 ForcedScalars[VF].insert(I); 6157 } 6158 } 6159 6160 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6161 unsigned VF, 6162 Type *&VectorTy) { 6163 Type *RetTy = I->getType(); 6164 if (canTruncateToMinimalBitwidth(I, VF)) 6165 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6166 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6167 auto SE = PSE.getSE(); 6168 6169 // TODO: We need to estimate the cost of intrinsic calls. 6170 switch (I->getOpcode()) { 6171 case Instruction::GetElementPtr: 6172 // We mark this instruction as zero-cost because the cost of GEPs in 6173 // vectorized code depends on whether the corresponding memory instruction 6174 // is scalarized or not. Therefore, we handle GEPs with the memory 6175 // instruction cost. 6176 return 0; 6177 case Instruction::Br: { 6178 // In cases of scalarized and predicated instructions, there will be VF 6179 // predicated blocks in the vectorized loop. Each branch around these 6180 // blocks requires also an extract of its vector compare i1 element. 6181 bool ScalarPredicatedBB = false; 6182 BranchInst *BI = cast<BranchInst>(I); 6183 if (VF > 1 && BI->isConditional() && 6184 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6185 PredicatedBBsAfterVectorization.end() || 6186 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6187 PredicatedBBsAfterVectorization.end())) 6188 ScalarPredicatedBB = true; 6189 6190 if (ScalarPredicatedBB) { 6191 // Return cost for branches around scalarized and predicated blocks. 6192 Type *Vec_i1Ty = 6193 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6194 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6195 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6196 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6197 // The back-edge branch will remain, as will all scalar branches. 6198 return TTI.getCFInstrCost(Instruction::Br); 6199 else 6200 // This branch will be eliminated by if-conversion. 6201 return 0; 6202 // Note: We currently assume zero cost for an unconditional branch inside 6203 // a predicated block since it will become a fall-through, although we 6204 // may decide in the future to call TTI for all branches. 6205 } 6206 case Instruction::PHI: { 6207 auto *Phi = cast<PHINode>(I); 6208 6209 // First-order recurrences are replaced by vector shuffles inside the loop. 6210 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6211 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6212 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6213 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6214 6215 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6216 // converted into select instructions. We require N - 1 selects per phi 6217 // node, where N is the number of incoming values. 
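// For example (illustrative): a phi merging three predicated paths becomes
// two vector selects, so its cost is 2 * the cost of a select on the widened
// phi type at this VF.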
6218 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6219 return (Phi->getNumIncomingValues() - 1) * 6220 TTI.getCmpSelInstrCost( 6221 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6222 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6223 6224 return TTI.getCFInstrCost(Instruction::PHI); 6225 } 6226 case Instruction::UDiv: 6227 case Instruction::SDiv: 6228 case Instruction::URem: 6229 case Instruction::SRem: 6230 // If we have a predicated instruction, it may not be executed for each 6231 // vector lane. Get the scalarization cost and scale this amount by the 6232 // probability of executing the predicated block. If the instruction is not 6233 // predicated, we fall through to the next case. 6234 if (VF > 1 && isScalarWithPredication(I)) { 6235 unsigned Cost = 0; 6236 6237 // These instructions have a non-void type, so account for the phi nodes 6238 // that we will create. This cost is likely to be zero. The phi node 6239 // cost, if any, should be scaled by the block probability because it 6240 // models a copy at the end of each predicated block. 6241 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6242 6243 // The cost of the non-predicated instruction. 6244 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6245 6246 // The cost of insertelement and extractelement instructions needed for 6247 // scalarization. 6248 Cost += getScalarizationOverhead(I, VF); 6249 6250 // Scale the cost by the probability of executing the predicated blocks. 6251 // This assumes the predicated block for each vector lane is equally 6252 // likely. 6253 return Cost / getReciprocalPredBlockProb(); 6254 } 6255 LLVM_FALLTHROUGH; 6256 case Instruction::Add: 6257 case Instruction::FAdd: 6258 case Instruction::Sub: 6259 case Instruction::FSub: 6260 case Instruction::Mul: 6261 case Instruction::FMul: 6262 case Instruction::FDiv: 6263 case Instruction::FRem: 6264 case Instruction::Shl: 6265 case Instruction::LShr: 6266 case Instruction::AShr: 6267 case Instruction::And: 6268 case Instruction::Or: 6269 case Instruction::Xor: { 6270 // Since we will replace the stride by 1 the multiplication should go away. 6271 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6272 return 0; 6273 // Certain instructions can be cheaper to vectorize if they have a constant 6274 // second vector operand. One example of this are shifts on x86. 6275 Value *Op2 = I->getOperand(1); 6276 TargetTransformInfo::OperandValueProperties Op2VP; 6277 TargetTransformInfo::OperandValueKind Op2VK = 6278 TTI.getOperandInfo(Op2, Op2VP); 6279 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6280 Op2VK = TargetTransformInfo::OK_UniformValue; 6281 6282 SmallVector<const Value *, 4> Operands(I->operand_values()); 6283 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6284 return N * TTI.getArithmeticInstrCost( 6285 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6286 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6287 } 6288 case Instruction::FNeg: { 6289 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6290 return N * TTI.getArithmeticInstrCost( 6291 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6292 TargetTransformInfo::OK_AnyValue, 6293 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6294 I->getOperand(0), I); 6295 } 6296 case Instruction::Select: { 6297 SelectInst *SI = cast<SelectInst>(I); 6298 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6299 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6300 Type *CondTy = SI->getCondition()->getType(); 6301 if (!ScalarCond) 6302 CondTy = VectorType::get(CondTy, VF); 6303 6304 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6305 } 6306 case Instruction::ICmp: 6307 case Instruction::FCmp: { 6308 Type *ValTy = I->getOperand(0)->getType(); 6309 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6310 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6311 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6312 VectorTy = ToVectorTy(ValTy, VF); 6313 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6314 } 6315 case Instruction::Store: 6316 case Instruction::Load: { 6317 unsigned Width = VF; 6318 if (Width > 1) { 6319 InstWidening Decision = getWideningDecision(I, Width); 6320 assert(Decision != CM_Unknown && 6321 "CM decision should be taken at this point"); 6322 if (Decision == CM_Scalarize) 6323 Width = 1; 6324 } 6325 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6326 return getMemoryInstructionCost(I, VF); 6327 } 6328 case Instruction::ZExt: 6329 case Instruction::SExt: 6330 case Instruction::FPToUI: 6331 case Instruction::FPToSI: 6332 case Instruction::FPExt: 6333 case Instruction::PtrToInt: 6334 case Instruction::IntToPtr: 6335 case Instruction::SIToFP: 6336 case Instruction::UIToFP: 6337 case Instruction::Trunc: 6338 case Instruction::FPTrunc: 6339 case Instruction::BitCast: { 6340 // We optimize the truncation of induction variables having constant 6341 // integer steps. The cost of these truncations is the same as the scalar 6342 // operation. 6343 if (isOptimizableIVTruncate(I, VF)) { 6344 auto *Trunc = cast<TruncInst>(I); 6345 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6346 Trunc->getSrcTy(), Trunc); 6347 } 6348 6349 Type *SrcScalarTy = I->getOperand(0)->getType(); 6350 Type *SrcVecTy = 6351 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6352 if (canTruncateToMinimalBitwidth(I, VF)) { 6353 // This cast is going to be shrunk. This may remove the cast or it might 6354 // turn it into slightly different cast. For example, if MinBW == 16, 6355 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6356 // 6357 // Calculate the modified src and dest types. 6358 Type *MinVecTy = VectorTy; 6359 if (I->getOpcode() == Instruction::Trunc) { 6360 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6361 VectorTy = 6362 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6363 } else if (I->getOpcode() == Instruction::ZExt || 6364 I->getOpcode() == Instruction::SExt) { 6365 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6366 VectorTy = 6367 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6368 } 6369 } 6370 6371 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6372 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6373 } 6374 case Instruction::Call: { 6375 bool NeedToScalarize; 6376 CallInst *CI = cast<CallInst>(I); 6377 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6378 if (getVectorIntrinsicIDForCall(CI, TLI)) 6379 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6380 return CallCost; 6381 } 6382 default: 6383 // The cost of executing VF copies of the scalar instruction. This opcode 6384 // is unknown. Assume that it is the same as 'mul'. 6385 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6386 getScalarizationOverhead(I, VF); 6387 } // end of switch. 6388 } 6389 6390 char LoopVectorize::ID = 0; 6391 6392 static const char lv_name[] = "Loop Vectorization"; 6393 6394 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6395 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6396 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6397 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6398 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6399 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6400 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6401 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6402 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6403 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6404 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6405 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6406 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6407 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6408 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6409 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6410 6411 namespace llvm { 6412 6413 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6414 6415 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6416 bool VectorizeOnlyWhenForced) { 6417 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6418 } 6419 6420 } // end namespace llvm 6421 6422 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6423 // Check if the pointer operand of a load or store instruction is 6424 // consecutive. 6425 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6426 return Legal->isConsecutivePtr(Ptr); 6427 return false; 6428 } 6429 6430 void LoopVectorizationCostModel::collectValuesToIgnore() { 6431 // Ignore ephemeral values. 6432 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6433 6434 // Ignore type-promoting instructions we identified during reduction 6435 // detection. 6436 for (auto &Reduction : Legal->getReductionVars()) { 6437 RecurrenceDescriptor &RedDes = Reduction.second; 6438 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6439 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6440 } 6441 // Ignore type-casting instructions we identified during induction 6442 // detection. 6443 for (auto &Induction : Legal->getInductionVars()) { 6444 InductionDescriptor &IndDes = Induction.second; 6445 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6446 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6447 } 6448 } 6449 6450 // TODO: we could return a pair of values that specify the max VF and 6451 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6452 // `buildVPlans(VF, VF)`. 
We cannot do it because VPlan at the moment
6453 // doesn't have a cost model that can choose which plan to execute if
6454 // more than one is generated.
6455 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6456 LoopVectorizationCostModel &CM) {
6457 unsigned WidestType;
6458 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6459 return WidestVectorRegBits / WidestType;
6460 }
6461
6462 VectorizationFactor
6463 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6464 unsigned VF = UserVF;
6465 // Outer loop handling: outer loops may require CFG and instruction level
6466 // transformations before even evaluating whether vectorization is profitable.
6467 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6468 // the vectorization pipeline.
6469 if (!OrigLoop->empty()) {
6470 // If the user doesn't provide a vectorization factor, determine a
6471 // reasonable one.
6472 if (!UserVF) {
6473 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6474 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6475
6476 // Make sure we have a VF > 1 for stress testing.
6477 if (VPlanBuildStressTest && VF < 2) {
6478 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6479 << "overriding computed VF.\n");
6480 VF = 4;
6481 }
6482 }
6483 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6484 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6485 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6486 << " to build VPlans.\n");
6487 buildVPlans(VF, VF);
6488
6489 // For VPlan build stress testing, we bail out after VPlan construction.
6490 if (VPlanBuildStressTest)
6491 return VectorizationFactor::Disabled();
6492
6493 return {VF, 0};
6494 }
6495
6496 LLVM_DEBUG(
6497 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6498 "VPlan-native path.\n");
6499 return VectorizationFactor::Disabled();
6500 }
6501
6502 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6503 assert(OrigLoop->empty() && "Inner loop expected.");
6504 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6505 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6506 return None;
6507
6508 // Invalidate interleave groups if all blocks of the loop will be predicated.
6509 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6510 !useMaskedInterleavedAccesses(*TTI)) {
6511 LLVM_DEBUG(
6512 dbgs()
6513 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6514 "which requires masked-interleaved support.\n");
6515 CM.InterleaveInfo.reset();
6516 }
6517
6518 if (UserVF) {
6519 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6520 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6521 // Collect the instructions (and their associated costs) that will be more
6522 // profitable to scalarize.
6523 CM.selectUserVectorizationFactor(UserVF);
6524 buildVPlansWithVPRecipes(UserVF, UserVF);
6525 LLVM_DEBUG(printPlans(dbgs()));
6526 return {{UserVF, 0}};
6527 }
6528
6529 unsigned MaxVF = MaybeMaxVF.getValue();
6530 assert(MaxVF != 0 && "MaxVF is zero.");
6531
6532 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6533 // Collect Uniform and Scalar instructions after vectorization with VF.
6534 CM.collectUniformsAndScalars(VF);
6535
6536 // Collect the instructions (and their associated costs) that will be more
6537 // profitable to scalarize.
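// (For illustration: with MaxVF = 8 this loop prepares the per-VF analyses
// for VF = 1, 2, 4 and 8; the VPlans built afterwards cover that whole range
// and selectVectorizationFactor then picks the most profitable VF.)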
6538 if (VF > 1)
6539 CM.collectInstsToScalarize(VF);
6540 }
6541
6542 buildVPlansWithVPRecipes(1, MaxVF);
6543 LLVM_DEBUG(printPlans(dbgs()));
6544 if (MaxVF == 1)
6545 return VectorizationFactor::Disabled();
6546
6547 // Select the optimal vectorization factor.
6548 return CM.selectVectorizationFactor(MaxVF);
6549 }
6550
6551 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6552 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6553 << '\n');
6554 BestVF = VF;
6555 BestUF = UF;
6556
6557 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6558 return !Plan->hasVF(VF);
6559 });
6560 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6561 }
6562
6563 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6564 DominatorTree *DT) {
6565 // Perform the actual loop transformation.
6566
6567 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6568 VPCallbackILV CallbackILV(ILV);
6569
6570 VPTransformState State{BestVF, BestUF, LI,
6571 DT, ILV.Builder, ILV.VectorLoopValueMap,
6572 &ILV, CallbackILV};
6573 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6574 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6575
6576 //===------------------------------------------------===//
6577 //
6578 // Notice: any optimization or new instruction that goes
6579 // into the code below should also be implemented in
6580 // the cost model.
6581 //
6582 //===------------------------------------------------===//
6583
6584 // 2. Copy and widen instructions from the old loop into the new loop.
6585 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6586 VPlans.front()->execute(&State);
6587
6588 // 3. Fix the vectorized code: take care of header phis, live-outs,
6589 // predication, updating analyses.
6590 ILV.fixVectorizedLoop();
6591 }
6592
6593 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6594 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6595 BasicBlock *Latch = OrigLoop->getLoopLatch();
6596
6597 // We create new control-flow for the vectorized loop, so the original
6598 // condition will be dead after vectorization if it's only used by the
6599 // branch.
6600 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6601 if (Cmp && Cmp->hasOneUse())
6602 DeadInstructions.insert(Cmp);
6603
6604 // We create new "steps" for induction variable updates to which the original
6605 // induction variables map. An original update instruction will be dead if
6606 // all its users except the induction variable are dead.
6607 for (auto &Induction : Legal->getInductionVars()) {
6608 PHINode *Ind = Induction.first;
6609 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6610 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6611 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6612 DeadInstructions.end();
6613 }))
6614 DeadInstructions.insert(IndUpdate);
6615
6616 // We also record as "Dead" the type-casting instructions we identified
6617 // during induction analysis. We don't need any handling for them in the
6618 // vectorized loop because we have proven that, under a proper runtime
6619 // test guarding the vectorized loop, the value of the phi, and the casted
6620 // value of the phi, are the same. The last instruction in this casting chain
6621 // will get its scalar/vector/widened def from the scalar/vector/widened def
6622 // of the respective phi node.
Any other casts in the induction def-use chain 6623 // have no other uses outside the phi update chain, and will be ignored. 6624 InductionDescriptor &IndDes = Induction.second; 6625 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6626 DeadInstructions.insert(Casts.begin(), Casts.end()); 6627 } 6628 } 6629 6630 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6631 6632 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6633 6634 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6635 Instruction::BinaryOps BinOp) { 6636 // When unrolling and the VF is 1, we only need to add a simple scalar. 6637 Type *Ty = Val->getType(); 6638 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6639 6640 if (Ty->isFloatingPointTy()) { 6641 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6642 6643 // Floating point operations had to be 'fast' to enable the unrolling. 6644 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6645 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6646 } 6647 Constant *C = ConstantInt::get(Ty, StartIdx); 6648 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6649 } 6650 6651 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6652 SmallVector<Metadata *, 4> MDs; 6653 // Reserve first location for self reference to the LoopID metadata node. 6654 MDs.push_back(nullptr); 6655 bool IsUnrollMetadata = false; 6656 MDNode *LoopID = L->getLoopID(); 6657 if (LoopID) { 6658 // First find existing loop unrolling disable metadata. 6659 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6660 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6661 if (MD) { 6662 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6663 IsUnrollMetadata = 6664 S && S->getString().startswith("llvm.loop.unroll.disable"); 6665 } 6666 MDs.push_back(LoopID->getOperand(i)); 6667 } 6668 } 6669 6670 if (!IsUnrollMetadata) { 6671 // Add runtime unroll disable metadata. 6672 LLVMContext &Context = L->getHeader()->getContext(); 6673 SmallVector<Metadata *, 1> DisableOperands; 6674 DisableOperands.push_back( 6675 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6676 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6677 MDs.push_back(DisableNode); 6678 MDNode *NewLoopID = MDNode::get(Context, MDs); 6679 // Set operand 0 to refer to the loop id itself. 6680 NewLoopID->replaceOperandWith(0, NewLoopID); 6681 L->setLoopID(NewLoopID); 6682 } 6683 } 6684 6685 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6686 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6687 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6688 bool PredicateAtRangeStart = Predicate(Range.Start); 6689 6690 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6691 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6692 Range.End = TmpVF; 6693 break; 6694 } 6695 6696 return PredicateAtRangeStart; 6697 } 6698 6699 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6700 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6701 /// of VF's starting at a given VF and extending it as much as possible. Each 6702 /// vectorization decision can potentially shorten this sub-range during 6703 /// buildVPlan(). 
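/// For illustration (the exact split depends on where the widening decisions
/// change): with MinVF = 1 and MaxVF = 8, the loop below might build one
/// VPlan for {1}, another for {2, 4} and a third for {8}, each buildVPlan()
/// call clamping Range.End at the first VF whose decisions differ from those
/// at Range.Start.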
6704 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6705 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6706 VFRange SubRange = {VF, MaxVF + 1}; 6707 VPlans.push_back(buildVPlan(SubRange)); 6708 VF = SubRange.End; 6709 } 6710 } 6711 6712 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6713 VPlanPtr &Plan) { 6714 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6715 6716 // Look for cached value. 6717 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6718 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6719 if (ECEntryIt != EdgeMaskCache.end()) 6720 return ECEntryIt->second; 6721 6722 VPValue *SrcMask = createBlockInMask(Src, Plan); 6723 6724 // The terminator has to be a branch inst! 6725 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6726 assert(BI && "Unexpected terminator found"); 6727 6728 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6729 return EdgeMaskCache[Edge] = SrcMask; 6730 6731 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6732 assert(EdgeMask && "No Edge Mask found for condition"); 6733 6734 if (BI->getSuccessor(0) != Dst) 6735 EdgeMask = Builder.createNot(EdgeMask); 6736 6737 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6738 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6739 6740 return EdgeMaskCache[Edge] = EdgeMask; 6741 } 6742 6743 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6744 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6745 6746 // Look for cached value. 6747 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6748 if (BCEntryIt != BlockMaskCache.end()) 6749 return BCEntryIt->second; 6750 6751 // All-one mask is modelled as no-mask following the convention for masked 6752 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6753 VPValue *BlockMask = nullptr; 6754 6755 if (OrigLoop->getHeader() == BB) { 6756 if (!CM.blockNeedsPredication(BB)) 6757 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6758 6759 // Introduce the early-exit compare IV <= BTC to form header block mask. 6760 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6761 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6762 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6763 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6764 return BlockMaskCache[BB] = BlockMask; 6765 } 6766 6767 // This is the block mask. We OR all incoming edges. 6768 for (auto *Predecessor : predecessors(BB)) { 6769 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6770 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6771 return BlockMaskCache[BB] = EdgeMask; 6772 6773 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6774 BlockMask = EdgeMask; 6775 continue; 6776 } 6777 6778 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6779 } 6780 6781 return BlockMaskCache[BB] = BlockMask; 6782 } 6783 6784 VPWidenMemoryInstructionRecipe * 6785 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6786 VPlanPtr &Plan) { 6787 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6788 return nullptr; 6789 6790 auto willWiden = [&](unsigned VF) -> bool { 6791 if (VF == 1) 6792 return false; 6793 LoopVectorizationCostModel::InstWidening Decision = 6794 CM.getWideningDecision(I, VF); 6795 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6796 "CM decision should be taken at this point."); 6797 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6798 return true; 6799 if (CM.isScalarAfterVectorization(I, VF) || 6800 CM.isProfitableToScalarize(I, VF)) 6801 return false; 6802 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6803 }; 6804 6805 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6806 return nullptr; 6807 6808 VPValue *Mask = nullptr; 6809 if (Legal->isMaskRequired(I)) 6810 Mask = createBlockInMask(I->getParent(), Plan); 6811 6812 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6813 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6814 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6815 6816 StoreInst *Store = cast<StoreInst>(I); 6817 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6818 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6819 } 6820 6821 VPWidenIntOrFpInductionRecipe * 6822 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6823 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6824 // Check if this is an integer or fp induction. If so, build the recipe that 6825 // produces its scalar and vector values. 6826 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6827 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6828 II.getKind() == InductionDescriptor::IK_FpInduction) 6829 return new VPWidenIntOrFpInductionRecipe(Phi); 6830 6831 return nullptr; 6832 } 6833 6834 // Optimize the special case where the source is a constant integer 6835 // induction variable. Notice that we can only optimize the 'trunc' case 6836 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6837 // (c) other casts depend on pointer size. 6838 6839 // Determine whether \p K is a truncation based on an induction variable that 6840 // can be optimized. 6841 auto isOptimizableIVTruncate = 6842 [&](Instruction *K) -> std::function<bool(unsigned)> { 6843 return 6844 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6845 }; 6846 6847 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6848 isOptimizableIVTruncate(I), Range)) 6849 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6850 cast<TruncInst>(I)); 6851 return nullptr; 6852 } 6853 6854 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6855 PHINode *Phi = dyn_cast<PHINode>(I); 6856 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6857 return nullptr; 6858 6859 // We know that all PHIs in non-header blocks are converted into selects, so 6860 // we don't have to worry about the insertion order and we can just use the 6861 // builder. At this point we generate the predication tree. 
There may be 6862 // duplications since this is a simple recursive scan, but future 6863 // optimizations will clean it up. 6864 6865 SmallVector<VPValue *, 2> Masks; 6866 unsigned NumIncoming = Phi->getNumIncomingValues(); 6867 for (unsigned In = 0; In < NumIncoming; In++) { 6868 VPValue *EdgeMask = 6869 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6870 assert((EdgeMask || NumIncoming == 1) && 6871 "Multiple predecessors with one having a full mask"); 6872 if (EdgeMask) 6873 Masks.push_back(EdgeMask); 6874 } 6875 return new VPBlendRecipe(Phi, Masks); 6876 } 6877 6878 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6879 VFRange &Range) { 6880 6881 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6882 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6883 6884 if (IsPredicated) 6885 return false; 6886 6887 auto IsVectorizableOpcode = [](unsigned Opcode) { 6888 switch (Opcode) { 6889 case Instruction::Add: 6890 case Instruction::And: 6891 case Instruction::AShr: 6892 case Instruction::BitCast: 6893 case Instruction::Br: 6894 case Instruction::Call: 6895 case Instruction::FAdd: 6896 case Instruction::FCmp: 6897 case Instruction::FDiv: 6898 case Instruction::FMul: 6899 case Instruction::FNeg: 6900 case Instruction::FPExt: 6901 case Instruction::FPToSI: 6902 case Instruction::FPToUI: 6903 case Instruction::FPTrunc: 6904 case Instruction::FRem: 6905 case Instruction::FSub: 6906 case Instruction::ICmp: 6907 case Instruction::IntToPtr: 6908 case Instruction::Load: 6909 case Instruction::LShr: 6910 case Instruction::Mul: 6911 case Instruction::Or: 6912 case Instruction::PHI: 6913 case Instruction::PtrToInt: 6914 case Instruction::SDiv: 6915 case Instruction::Select: 6916 case Instruction::SExt: 6917 case Instruction::Shl: 6918 case Instruction::SIToFP: 6919 case Instruction::SRem: 6920 case Instruction::Store: 6921 case Instruction::Sub: 6922 case Instruction::Trunc: 6923 case Instruction::UDiv: 6924 case Instruction::UIToFP: 6925 case Instruction::URem: 6926 case Instruction::Xor: 6927 case Instruction::ZExt: 6928 return true; 6929 } 6930 return false; 6931 }; 6932 6933 if (!IsVectorizableOpcode(I->getOpcode())) 6934 return false; 6935 6936 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6937 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6938 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6939 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6940 return false; 6941 } 6942 6943 auto willWiden = [&](unsigned VF) -> bool { 6944 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6945 CM.isProfitableToScalarize(I, VF))) 6946 return false; 6947 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6948 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6949 // The following case may be scalarized depending on the VF. 6950 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6951 // version of the instruction. 6952 // Is it beneficial to perform intrinsic call compared to lib call? 
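// For example (with made-up costs): if the vectorized library call would
// cost 20 and a matching vector intrinsic costs 12, the intrinsic is
// preferred and the call is widened; if no vector variant exists and
// NeedToScalarize is set, willWiden returns false and the call is replicated
// per lane instead.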
6953 bool NeedToScalarize; 6954 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6955 bool UseVectorIntrinsic = 6956 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6957 return UseVectorIntrinsic || !NeedToScalarize; 6958 } 6959 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6960 assert(CM.getWideningDecision(I, VF) == 6961 LoopVectorizationCostModel::CM_Scalarize && 6962 "Memory widening decisions should have been taken care by now"); 6963 return false; 6964 } 6965 return true; 6966 }; 6967 6968 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6969 return false; 6970 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6971 // to avoid having to split recipes later. 6972 bool IsSingleton = Ingredient2Recipe.count(I); 6973 6974 // Success: widen this instruction. 6975 6976 // Use the default widening recipe. We optimize the common case where 6977 // consecutive instructions can be represented by a single recipe. 6978 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6979 LastExtensibleRecipe->appendInstruction(I)) 6980 return true; 6981 6982 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6983 if (!IsSingleton) 6984 LastExtensibleRecipe = WidenRecipe; 6985 setRecipe(I, WidenRecipe); 6986 VPBB->appendRecipe(WidenRecipe); 6987 return true; 6988 } 6989 6990 VPBasicBlock *VPRecipeBuilder::handleReplication( 6991 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6992 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6993 VPlanPtr &Plan) { 6994 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6995 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6996 Range); 6997 6998 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6999 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7000 7001 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7002 setRecipe(I, Recipe); 7003 7004 // Find if I uses a predicated instruction. If so, it will use its scalar 7005 // value. Avoid hoisting the insert-element which packs the scalar value into 7006 // a vector value, as that happens iff all users use the vector value. 7007 for (auto &Op : I->operands()) 7008 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7009 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7010 PredInst2Recipe[PredInst]->setAlsoPack(false); 7011 7012 // Finalize the recipe for Instr, first if it is not predicated. 7013 if (!IsPredicated) { 7014 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7015 VPBB->appendRecipe(Recipe); 7016 return VPBB; 7017 } 7018 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7019 assert(VPBB->getSuccessors().empty() && 7020 "VPBB has successors when handling predicated replication."); 7021 // Record predicated instructions for above packing optimizations. 7022 PredInst2Recipe[I] = Recipe; 7023 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7024 VPBlockUtils::insertBlockAfter(Region, VPBB); 7025 auto *RegSucc = new VPBasicBlock(); 7026 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7027 return RegSucc; 7028 } 7029 7030 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7031 VPRecipeBase *PredRecipe, 7032 VPlanPtr &Plan) { 7033 // Instructions marked for predication are replicated and placed under an 7034 // if-then construct to prevent side-effects. 7035 7036 // Generate recipes to compute the block mask for this region. 
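  // Illustrative sketch (names follow the RegionName scheme used below): for a
  // predicated sdiv the resulting single-entry single-exit region looks like
  //
  //   pred.sdiv.entry           (BRANCH-ON-MASK)
  //        |          \
  //        |     pred.sdiv.if   (replicated, predicated sdiv)
  //        |          /
  //   pred.sdiv.continue        (PHI of the predicated value)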
7037 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7038 7039 // Build the triangular if-then region. 7040 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7041 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7042 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7043 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7044 auto *PHIRecipe = 7045 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7046 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7047 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7048 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7049 7050 // Note: first set Entry as region entry and then connect successors starting 7051 // from it in order, to propagate the "parent" of each VPBasicBlock. 7052 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7053 VPBlockUtils::connectBlocks(Pred, Exit); 7054 7055 return Region; 7056 } 7057 7058 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7059 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7060 VPRecipeBase *Recipe = nullptr; 7061 7062 // First, check for specific widening recipes that deal with memory 7063 // operations, inductions and Phi nodes. 7064 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7065 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7066 (Recipe = tryToBlend(Instr, Plan)) || 7067 (isa<PHINode>(Instr) && 7068 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7069 setRecipe(Instr, Recipe); 7070 VPBB->appendRecipe(Recipe); 7071 return true; 7072 } 7073 7074 // Handle GEP widening. 7075 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7076 auto Scalarize = [&](unsigned VF) { 7077 return CM.isScalarWithPredication(Instr, VF) || 7078 CM.isScalarAfterVectorization(Instr, VF) || 7079 CM.isProfitableToScalarize(Instr, VF); 7080 }; 7081 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7082 return false; 7083 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7084 setRecipe(Instr, Recipe); 7085 VPBB->appendRecipe(Recipe); 7086 return true; 7087 } 7088 7089 // Check if Instr is to be widened by a general VPWidenRecipe, after 7090 // having first checked for specific widening recipes. 7091 if (tryToWiden(Instr, VPBB, Range)) 7092 return true; 7093 7094 return false; 7095 } 7096 7097 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7098 unsigned MaxVF) { 7099 assert(OrigLoop->empty() && "Inner loop expected."); 7100 7101 // Collect conditions feeding internal conditional branches; they need to be 7102 // represented in VPlan for it to model masking. 7103 SmallPtrSet<Value *, 1> NeedDef; 7104 7105 auto *Latch = OrigLoop->getLoopLatch(); 7106 for (BasicBlock *BB : OrigLoop->blocks()) { 7107 if (BB == Latch) 7108 continue; 7109 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7110 if (Branch && Branch->isConditional()) 7111 NeedDef.insert(Branch->getCondition()); 7112 } 7113 7114 // If the tail is to be folded by masking, the primary induction variable 7115 // needs to be represented in VPlan for it to model early-exit masking. 7116 // Also, both the Phi and the live-out instruction of each reduction are 7117 // required in order to introduce a select between them in VPlan. 
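  // (Illustrative sketch: for an integer add reduction, the select introduced
  //  later is roughly select(%header.mask, %red.liveout, %red.phi), so that
  //  masked-off lanes keep the value coming from the reduction phi.)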
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
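  // (For example, two loads of the form A[2*i] and A[2*i+1] typically form one
  //  interleave group with factor 2, so the final VPlan carries a single
  //  VPInterleaveRecipe for the group instead of two widened-load recipes.)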
7185 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7186 auto applyIG = [IG, this](unsigned VF) -> bool { 7187 return (VF >= 2 && // Query is illegal for VF == 1 7188 CM.getWideningDecision(IG->getInsertPos(), VF) == 7189 LoopVectorizationCostModel::CM_Interleave); 7190 }; 7191 if (!getDecisionAndClampRange(applyIG, Range)) 7192 continue; 7193 InterleaveGroups.insert(IG); 7194 for (unsigned i = 0; i < IG->getFactor(); i++) 7195 if (Instruction *Member = IG->getMember(i)) 7196 RecipeBuilder.recordRecipeOf(Member); 7197 }; 7198 7199 // --------------------------------------------------------------------------- 7200 // Build initial VPlan: Scan the body of the loop in a topological order to 7201 // visit each basic block after having visited its predecessor basic blocks. 7202 // --------------------------------------------------------------------------- 7203 7204 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7205 auto Plan = std::make_unique<VPlan>(); 7206 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7207 Plan->setEntry(VPBB); 7208 7209 // Represent values that will have defs inside VPlan. 7210 for (Value *V : NeedDef) 7211 Plan->addVPValue(V); 7212 7213 // Scan the body of the loop in a topological order to visit each basic block 7214 // after having visited its predecessor basic blocks. 7215 LoopBlocksDFS DFS(OrigLoop); 7216 DFS.perform(LI); 7217 7218 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7219 // Relevant instructions from basic block BB will be grouped into VPRecipe 7220 // ingredients and fill a new VPBasicBlock. 7221 unsigned VPBBsForBB = 0; 7222 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7223 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7224 VPBB = FirstVPBBForBB; 7225 Builder.setInsertPoint(VPBB); 7226 7227 // Introduce each ingredient into VPlan. 7228 for (Instruction &I : BB->instructionsWithoutDebug()) { 7229 Instruction *Instr = &I; 7230 7231 // First filter out irrelevant instructions, to ensure no recipes are 7232 // built for them. 7233 if (isa<BranchInst>(Instr) || 7234 DeadInstructions.find(Instr) != DeadInstructions.end()) 7235 continue; 7236 7237 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7238 continue; 7239 7240 // Otherwise, if all widening options failed, Instruction is to be 7241 // replicated. This may create a successor for VPBB. 7242 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7243 Instr, Range, VPBB, PredInst2Recipe, Plan); 7244 if (NextVPBB != VPBB) { 7245 VPBB = NextVPBB; 7246 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7247 : ""); 7248 } 7249 } 7250 } 7251 7252 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7253 // may also be empty, such as the last one VPBB, reflecting original 7254 // basic-blocks with no recipes. 7255 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7256 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7257 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7258 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7259 delete PreEntry; 7260 7261 // --------------------------------------------------------------------------- 7262 // Transform initial VPlan: Apply previously taken decisions, in order, to 7263 // bring the VPlan to its final state. 7264 // --------------------------------------------------------------------------- 7265 7266 // Apply Sink-After legal constraints. 
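  // Each (Sink, Target) entry moves the recipe created for Sink to directly
  // after the recipe created for Target. These constraints typically stem from
  // first-order recurrences, where a user of the recurrence phi must be sunk
  // past the instruction that produces the recurring value.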
7267 for (auto &Entry : SinkAfter) { 7268 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7269 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7270 Sink->moveAfter(Target); 7271 } 7272 7273 // Interleave memory: for each Interleave Group we marked earlier as relevant 7274 // for this VPlan, replace the Recipes widening its memory instructions with a 7275 // single VPInterleaveRecipe at its insertion point. 7276 for (auto IG : InterleaveGroups) { 7277 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7278 RecipeBuilder.getRecipe(IG->getInsertPos())); 7279 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7280 ->insertBefore(Recipe); 7281 7282 for (unsigned i = 0; i < IG->getFactor(); ++i) 7283 if (Instruction *Member = IG->getMember(i)) { 7284 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7285 } 7286 } 7287 7288 // Finally, if tail is folded by masking, introduce selects between the phi 7289 // and the live-out instruction of each reduction, at the end of the latch. 7290 if (CM.foldTailByMasking()) { 7291 Builder.setInsertPoint(VPBB); 7292 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7293 for (auto &Reduction : Legal->getReductionVars()) { 7294 VPValue *Phi = Plan->getVPValue(Reduction.first); 7295 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7296 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7297 } 7298 } 7299 7300 std::string PlanName; 7301 raw_string_ostream RSO(PlanName); 7302 unsigned VF = Range.Start; 7303 Plan->addVF(VF); 7304 RSO << "Initial VPlan for VF={" << VF; 7305 for (VF *= 2; VF < Range.End; VF *= 2) { 7306 Plan->addVF(VF); 7307 RSO << "," << VF; 7308 } 7309 RSO << "},UF>=1"; 7310 RSO.flush(); 7311 Plan->setName(PlanName); 7312 7313 return Plan; 7314 } 7315 7316 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7317 // Outer loop handling: They may require CFG and instruction level 7318 // transformations before even evaluating whether vectorization is profitable. 7319 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7320 // the vectorization pipeline. 7321 assert(!OrigLoop->empty()); 7322 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7323 7324 // Create new empty VPlan 7325 auto Plan = std::make_unique<VPlan>(); 7326 7327 // Build hierarchical CFG 7328 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7329 HCFGBuilder.buildHierarchicalCFG(); 7330 7331 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7332 Plan->addVF(VF); 7333 7334 if (EnableVPlanPredication) { 7335 VPlanPredicator VPP(*Plan); 7336 VPP.predicate(); 7337 7338 // Avoid running transformation to recipes until masked code generation in 7339 // VPlan-native path is in place. 
7340 return Plan; 7341 } 7342 7343 SmallPtrSet<Instruction *, 1> DeadInstructions; 7344 VPlanTransforms::VPInstructionsToVPRecipes( 7345 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7346 return Plan; 7347 } 7348 7349 Value* LoopVectorizationPlanner::VPCallbackILV:: 7350 getOrCreateVectorValues(Value *V, unsigned Part) { 7351 return ILV.getOrCreateVectorValue(V, Part); 7352 } 7353 7354 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7355 Value *V, const VPIteration &Instance) { 7356 return ILV.getOrCreateScalarValue(V, Instance); 7357 } 7358 7359 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7360 VPSlotTracker &SlotTracker) const { 7361 O << " +\n" 7362 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7363 IG->getInsertPos()->printAsOperand(O, false); 7364 O << ", "; 7365 getAddr()->printAsOperand(O, SlotTracker); 7366 VPValue *Mask = getMask(); 7367 if (Mask) { 7368 O << ", "; 7369 Mask->printAsOperand(O, SlotTracker); 7370 } 7371 O << "\\l\""; 7372 for (unsigned i = 0; i < IG->getFactor(); ++i) 7373 if (Instruction *I = IG->getMember(i)) 7374 O << " +\n" 7375 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7376 } 7377 7378 void VPWidenRecipe::execute(VPTransformState &State) { 7379 for (auto &Instr : make_range(Begin, End)) 7380 State.ILV->widenInstruction(Instr); 7381 } 7382 7383 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7384 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7385 IsIndexLoopInvariant); 7386 } 7387 7388 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7389 assert(!State.Instance && "Int or FP induction being replicated."); 7390 State.ILV->widenIntOrFpInduction(IV, Trunc); 7391 } 7392 7393 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7394 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7395 } 7396 7397 void VPBlendRecipe::execute(VPTransformState &State) { 7398 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7399 // We know that all PHIs in non-header blocks are converted into 7400 // selects, so we don't have to worry about the insertion order and we 7401 // can just use the builder. 7402 // At this point we generate the predication tree. There may be 7403 // duplications since this is a simple recursive scan, but future 7404 // optimizations will clean it up. 7405 7406 unsigned NumIncoming = Phi->getNumIncomingValues(); 7407 7408 assert((User || NumIncoming == 1) && 7409 "Multiple predecessors with predecessors having a full mask"); 7410 // Generate a sequence of selects of the form: 7411 // SELECT(Mask3, In3, 7412 // SELECT(Mask2, In2, 7413 // ( ...))) 7414 InnerLoopVectorizer::VectorParts Entry(State.UF); 7415 for (unsigned In = 0; In < NumIncoming; ++In) { 7416 for (unsigned Part = 0; Part < State.UF; ++Part) { 7417 // We might have single edge PHIs (blocks) - use an identity 7418 // 'select' for the first PHI operand. 7419 Value *In0 = 7420 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7421 if (In == 0) 7422 Entry[Part] = In0; // Initialize with the first incoming value. 7423 else { 7424 // Select between the current value and the previous incoming edge 7425 // based on the incoming mask. 
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                      getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
7486 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7487 assert(isa<UnreachableInst>(CurrentTerminator) && 7488 "Expected to replace unreachable terminator with conditional branch."); 7489 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7490 CondBr->setSuccessor(0, nullptr); 7491 ReplaceInstWithInst(CurrentTerminator, CondBr); 7492 } 7493 7494 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7495 assert(State.Instance && "Predicated instruction PHI works per instance."); 7496 Instruction *ScalarPredInst = cast<Instruction>( 7497 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7498 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7499 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7500 assert(PredicatingBB && "Predicated block has no single predecessor."); 7501 7502 // By current pack/unpack logic we need to generate only a single phi node: if 7503 // a vector value for the predicated instruction exists at this point it means 7504 // the instruction has vector users only, and a phi for the vector value is 7505 // needed. In this case the recipe of the predicated instruction is marked to 7506 // also do that packing, thereby "hoisting" the insert-element sequence. 7507 // Otherwise, a phi node for the scalar value is needed. 7508 unsigned Part = State.Instance->Part; 7509 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7510 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7511 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7512 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7513 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7514 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7515 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7516 } else { 7517 Type *PredInstType = PredInst->getType(); 7518 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7519 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7520 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7521 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7522 } 7523 } 7524 7525 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7526 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7527 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7528 getMask()); 7529 } 7530 7531 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7532 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7533 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7534 // for predication. 7535 static ScalarEpilogueLowering getScalarEpilogueLowering( 7536 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7537 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7538 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7539 LoopVectorizationLegality &LVL) { 7540 bool OptSize = 7541 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7542 PGSOQueryType::IRPass); 7543 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7544 // don't look at hints or options, and don't request a scalar epilogue. 
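  // (Usage note: the predication preference consulted in 3) and 4) below can be
  //  set via the PreferPredicateOverEpilog command-line option or, from source,
  //  with the "#pragma clang loop vectorize_predicate(enable)" loop hint.)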
7545 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) 7546 return CM_ScalarEpilogueNotAllowedOptSize; 7547 7548 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7549 !PreferPredicateOverEpilog; 7550 7551 // 2) Next, if disabling predication is requested on the command line, honour 7552 // this and request a scalar epilogue. Also do this if we don't have a 7553 // primary induction variable, which is required for predication. 7554 if (PredicateOptDisabled || !LVL.getPrimaryInduction()) 7555 return CM_ScalarEpilogueAllowed; 7556 7557 // 3) and 4) look if enabling predication is requested on the command line, 7558 // with a loop hint, or if the TTI hook indicates this is profitable, request 7559 // predication . 7560 if (PreferPredicateOverEpilog || 7561 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7562 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7563 LVL.getLAI()) && 7564 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7565 return CM_ScalarEpilogueNotNeededUsePredicate; 7566 7567 return CM_ScalarEpilogueAllowed; 7568 } 7569 7570 // Process the loop in the VPlan-native vectorization path. This path builds 7571 // VPlan upfront in the vectorization pipeline, which allows to apply 7572 // VPlan-to-VPlan transformations from the very beginning without modifying the 7573 // input LLVM IR. 7574 static bool processLoopInVPlanNativePath( 7575 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7576 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7577 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7578 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7579 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7580 7581 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7582 Function *F = L->getHeader()->getParent(); 7583 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7584 7585 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7586 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7587 7588 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7589 &Hints, IAI); 7590 // Use the planner for outer loop vectorization. 7591 // TODO: CM is not used at this point inside the planner. Turn CM into an 7592 // optional argument if we don't need it in the future. 7593 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); 7594 7595 // Get user vectorization factor. 7596 const unsigned UserVF = Hints.getWidth(); 7597 7598 // Plan how to best vectorize, return the best VF and its cost. 7599 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7600 7601 // If we are stress testing VPlan builds, do not attempt to generate vector 7602 // code. Masked vector code generation support will follow soon. 7603 // Also, do not attempt to vectorize if no vector code will be produced. 7604 if (VPlanBuildStressTest || EnableVPlanPredication || 7605 VectorizationFactor::Disabled() == VF) 7606 return false; 7607 7608 LVP.setBestPlan(VF.Width, 1); 7609 7610 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7611 &CM); 7612 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7613 << L->getHeader()->getParent()->getName() << "\"\n"); 7614 LVP.executePlan(LB, DT); 7615 7616 // Mark the loop as already vectorized to avoid vectorizing again. 
7617 Hints.setAlreadyVectorized(); 7618 7619 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7620 return true; 7621 } 7622 7623 bool LoopVectorizePass::processLoop(Loop *L) { 7624 assert((EnableVPlanNativePath || L->empty()) && 7625 "VPlan-native path is not enabled. Only process inner loops."); 7626 7627 #ifndef NDEBUG 7628 const std::string DebugLocStr = getDebugLocString(L); 7629 #endif /* NDEBUG */ 7630 7631 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7632 << L->getHeader()->getParent()->getName() << "\" from " 7633 << DebugLocStr << "\n"); 7634 7635 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7636 7637 LLVM_DEBUG( 7638 dbgs() << "LV: Loop hints:" 7639 << " force=" 7640 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7641 ? "disabled" 7642 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7643 ? "enabled" 7644 : "?")) 7645 << " width=" << Hints.getWidth() 7646 << " unroll=" << Hints.getInterleave() << "\n"); 7647 7648 // Function containing loop 7649 Function *F = L->getHeader()->getParent(); 7650 7651 // Looking at the diagnostic output is the only way to determine if a loop 7652 // was vectorized (other than looking at the IR or machine code), so it 7653 // is important to generate an optimization remark for each loop. Most of 7654 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7655 // generated as OptimizationRemark and OptimizationRemarkMissed are 7656 // less verbose reporting vectorized loops and unvectorized loops that may 7657 // benefit from vectorization, respectively. 7658 7659 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7660 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7661 return false; 7662 } 7663 7664 PredicatedScalarEvolution PSE(*SE, *L); 7665 7666 // Check if it is legal to vectorize the loop. 7667 LoopVectorizationRequirements Requirements(*ORE); 7668 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7669 &Requirements, &Hints, DB, AC); 7670 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7671 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7672 Hints.emitRemarkWithHints(); 7673 return false; 7674 } 7675 7676 // Check the function attributes and profiles to find out if this function 7677 // should be optimized for size. 7678 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7679 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7680 7681 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7682 // here. They may require CFG and instruction level transformations before 7683 // even evaluating whether vectorization is profitable. Since we cannot modify 7684 // the incoming IR, we need to build VPlan upfront in the vectorization 7685 // pipeline. 7686 if (!L->empty()) 7687 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7688 ORE, BFI, PSI, Hints); 7689 7690 assert(L->empty() && "Inner loop expected."); 7691 7692 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7693 // count by optimizing for size, to minimize overheads. 7694 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7695 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7696 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7697 << "This loop is worth vectorizing only if no scalar " 7698 << "iteration overheads are incurred."); 7699 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7700 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7701 else { 7702 LLVM_DEBUG(dbgs() << "\n"); 7703 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7704 } 7705 } 7706 7707 // Check the function attributes to see if implicit floats are allowed. 7708 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7709 // an integer loop and the vector instructions selected are purely integer 7710 // vector instructions? 7711 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7712 reportVectorizationFailure( 7713 "Can't vectorize when the NoImplicitFloat attribute is used", 7714 "loop not vectorized due to NoImplicitFloat attribute", 7715 "NoImplicitFloat", ORE, L); 7716 Hints.emitRemarkWithHints(); 7717 return false; 7718 } 7719 7720 // Check if the target supports potentially unsafe FP vectorization. 7721 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7722 // for the target we're vectorizing for, to make sure none of the 7723 // additional fp-math flags can help. 7724 if (Hints.isPotentiallyUnsafe() && 7725 TTI->isFPVectorizationPotentiallyUnsafe()) { 7726 reportVectorizationFailure( 7727 "Potentially unsafe FP op prevents vectorization", 7728 "loop not vectorized due to unsafe FP support.", 7729 "UnsafeFP", ORE, L); 7730 Hints.emitRemarkWithHints(); 7731 return false; 7732 } 7733 7734 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7735 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7736 7737 // If an override option has been passed in for interleaved accesses, use it. 7738 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7739 UseInterleaved = EnableInterleavedMemAccesses; 7740 7741 // Analyze interleaved memory accesses. 7742 if (UseInterleaved) { 7743 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7744 } 7745 7746 // Use the cost model. 7747 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7748 F, &Hints, IAI); 7749 CM.collectValuesToIgnore(); 7750 7751 // Use the planner for vectorization. 7752 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7753 7754 // Get user vectorization factor. 7755 unsigned UserVF = Hints.getWidth(); 7756 7757 // Plan how to best vectorize, return the best VF and its cost. 7758 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7759 7760 VectorizationFactor VF = VectorizationFactor::Disabled(); 7761 unsigned IC = 1; 7762 unsigned UserIC = Hints.getInterleave(); 7763 7764 if (MaybeVF) { 7765 VF = *MaybeVF; 7766 // Select the interleave count. 7767 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7768 } 7769 7770 // Identify the diagnostic messages that should be produced. 
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not beneficial to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *beneficial* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
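    // (setAlreadyVectorized records this via "llvm.loop.isvectorized" loop
    //  metadata, which subsequent runs of the vectorizer check and respect.)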
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7999 : nullptr; 8000 8001 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8002 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8003 [&](Loop &L) -> const LoopAccessInfo & { 8004 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8005 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8006 }; 8007 const ModuleAnalysisManager &MAM = 8008 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8009 ProfileSummaryInfo *PSI = 8010 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8011 bool Changed = 8012 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8013 if (!Changed) 8014 return PreservedAnalyses::all(); 8015 PreservedAnalyses PA; 8016 8017 // We currently do not preserve loopinfo/dominator analyses with outer loop 8018 // vectorization. Until this is addressed, mark these analyses as preserved 8019 // only for non-VPlan-native path. 8020 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8021 if (!EnableVPlanNativePath) { 8022 PA.preserve<LoopAnalysis>(); 8023 PA.preserve<DominatorTreeAnalysis>(); 8024 } 8025 PA.preserve<BasicAA>(); 8026 PA.preserve<GlobalsAA>(); 8027 return PA; 8028 } 8029