1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SetVector.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/MemorySSA.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/VectorUtils.h" 98 #include "llvm/IR/Attributes.h" 99 #include "llvm/IR/BasicBlock.h" 100 #include "llvm/IR/CFG.h" 101 #include "llvm/IR/Constant.h" 102 #include "llvm/IR/Constants.h" 103 #include "llvm/IR/DataLayout.h" 104 #include "llvm/IR/DebugInfoMetadata.h" 105 #include "llvm/IR/DebugLoc.h" 106 #include "llvm/IR/DerivedTypes.h" 107 #include "llvm/IR/DiagnosticInfo.h" 108 #include "llvm/IR/Dominators.h" 109 #include "llvm/IR/Function.h" 110 #include "llvm/IR/IRBuilder.h" 111 #include "llvm/IR/InstrTypes.h" 112 #include "llvm/IR/Instruction.h" 113 #include "llvm/IR/Instructions.h" 114 #include "llvm/IR/IntrinsicInst.h" 115 #include "llvm/IR/Intrinsics.h" 116 #include "llvm/IR/LLVMContext.h" 117 #include "llvm/IR/Metadata.h" 118 #include "llvm/IR/Module.h" 119 #include "llvm/IR/Operator.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 
#include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <cstdlib> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 /// @{ 161 /// Metadata attribute names 162 static const char *const LLVMLoopVectorizeFollowupAll = 163 "llvm.loop.vectorize.followup_all"; 164 static const char *const LLVMLoopVectorizeFollowupVectorized = 165 "llvm.loop.vectorize.followup_vectorized"; 166 static const char *const LLVMLoopVectorizeFollowupEpilogue = 167 "llvm.loop.vectorize.followup_epilogue"; 168 /// @} 169 170 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 172 173 /// Loops with a known constant trip count below this number are vectorized only 174 /// if no scalar iteration overheads are incurred. 175 static cl::opt<unsigned> TinyTripCountVectorThreshold( 176 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 177 cl::desc("Loops with a constant trip count that is smaller than this " 178 "value are vectorized only if no scalar iteration overheads " 179 "are incurred.")); 180 181 // Indicates that an epilogue is undesired, predication is preferred. 182 // This means that the vectorizer will try to fold the loop-tail (epilogue) 183 // into the loop and predicate the loop body accordingly. 184 static cl::opt<bool> PreferPredicateOverEpilog( 185 "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, 186 cl::desc("Indicate that an epilogue is undesired, predication should be " 187 "used instead.")); 188 189 static cl::opt<bool> MaximizeBandwidth( 190 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 191 cl::desc("Maximize bandwidth when selecting vectorization factor which " 192 "will be determined by the smallest type in loop.")); 193 194 static cl::opt<bool> EnableInterleavedMemAccesses( 195 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 196 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 197 198 /// An interleave-group may need masking if it resides in a block that needs 199 /// predication, or in order to mask away gaps. 
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 201 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 202 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 203 204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 205 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 206 cl::desc("We don't interleave loops with a estimated constant trip count " 207 "below this number")); 208 209 static cl::opt<unsigned> ForceTargetNumScalarRegs( 210 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 211 cl::desc("A flag that overrides the target's number of scalar registers.")); 212 213 static cl::opt<unsigned> ForceTargetNumVectorRegs( 214 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 215 cl::desc("A flag that overrides the target's number of vector registers.")); 216 217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 218 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 219 cl::desc("A flag that overrides the target's max interleave factor for " 220 "scalar loops.")); 221 222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 223 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 224 cl::desc("A flag that overrides the target's max interleave factor for " 225 "vectorized loops.")); 226 227 static cl::opt<unsigned> ForceTargetInstructionCost( 228 "force-target-instruction-cost", cl::init(0), cl::Hidden, 229 cl::desc("A flag that overrides the target's expected cost for " 230 "an instruction to a single constant value. Mostly " 231 "useful for getting consistent testing.")); 232 233 static cl::opt<unsigned> SmallLoopCost( 234 "small-loop-cost", cl::init(20), cl::Hidden, 235 cl::desc( 236 "The cost of a loop that is considered 'small' by the interleaver.")); 237 238 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 239 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 240 cl::desc("Enable the use of the block frequency analysis to access PGO " 241 "heuristics minimizing code growth in cold regions and being more " 242 "aggressive in hot regions.")); 243 244 // Runtime interleave loops for load/store throughput. 245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 246 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 247 cl::desc( 248 "Enable runtime interleaving until load/store ports are saturated")); 249 250 /// The number of stores in a loop that are allowed to need predication. 
251 static cl::opt<unsigned> NumberOfStoresToPredicate( 252 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 253 cl::desc("Max number of stores to be predicated behind an if.")); 254 255 static cl::opt<bool> EnableIndVarRegisterHeur( 256 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 257 cl::desc("Count the induction variable only once when interleaving")); 258 259 static cl::opt<bool> EnableCondStoresVectorization( 260 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 261 cl::desc("Enable if predication of stores during vectorization.")); 262 263 static cl::opt<unsigned> MaxNestedScalarReductionIC( 264 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 265 cl::desc("The maximum interleave count to use when interleaving a scalar " 266 "reduction in a nested loop.")); 267 268 cl::opt<bool> EnableVPlanNativePath( 269 "enable-vplan-native-path", cl::init(false), cl::Hidden, 270 cl::desc("Enable VPlan-native vectorization path with " 271 "support for outer loop vectorization.")); 272 273 // FIXME: Remove this switch once we have divergence analysis. Currently we 274 // assume divergent non-backedge branches when this switch is true. 275 cl::opt<bool> EnableVPlanPredication( 276 "enable-vplan-predication", cl::init(false), cl::Hidden, 277 cl::desc("Enable VPlan-native vectorization path predicator with " 278 "support for outer loop vectorization.")); 279 280 // This flag enables the stress testing of the VPlan H-CFG construction in the 281 // VPlan-native vectorization path. It must be used in conjuction with 282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 283 // verification of the H-CFGs built. 284 static cl::opt<bool> VPlanBuildStressTest( 285 "vplan-build-stress-test", cl::init(false), cl::Hidden, 286 cl::desc( 287 "Build VPlan for every supported loop nest in the function and bail " 288 "out right after the build (stress test the VPlan H-CFG construction " 289 "in the VPlan-native vectorization path).")); 290 291 cl::opt<bool> llvm::EnableLoopInterleaving( 292 "interleave-loops", cl::init(true), cl::Hidden, 293 cl::desc("Enable loop interleaving in Loop vectorization passes")); 294 cl::opt<bool> llvm::EnableLoopVectorization( 295 "vectorize-loops", cl::init(true), cl::Hidden, 296 cl::desc("Run the Loop vectorization passes")); 297 298 /// A helper function that returns the type of loaded or stored value. 299 static Type *getMemInstValueType(Value *I) { 300 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 301 "Expected Load or Store instruction"); 302 if (auto *LI = dyn_cast<LoadInst>(I)) 303 return LI->getType(); 304 return cast<StoreInst>(I)->getValueOperand()->getType(); 305 } 306 307 /// A helper function that returns true if the given type is irregular. The 308 /// type is irregular if its allocated size doesn't equal the store size of an 309 /// element of the corresponding vector type at the given vectorization factor. 310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { 311 // Determine if an array of VF elements of type Ty is "bitcast compatible" 312 // with a <VF x Ty> vector. 313 if (VF > 1) { 314 auto *VectorTy = FixedVectorType::get(Ty, VF); 315 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); 316 } 317 318 // If the vectorization factor is one, we just check if an array of type Ty 319 // requires padding between elements. 
320 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 321 } 322 323 /// A helper function that returns the reciprocal of the block probability of 324 /// predicated blocks. If we return X, we are assuming the predicated block 325 /// will execute once for every X iterations of the loop header. 326 /// 327 /// TODO: We should use actual block probability here, if available. Currently, 328 /// we always assume predicated blocks have a 50% chance of executing. 329 static unsigned getReciprocalPredBlockProb() { return 2; } 330 331 /// A helper function that adds a 'fast' flag to floating-point operations. 332 static Value *addFastMathFlag(Value *V) { 333 if (isa<FPMathOperator>(V)) 334 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); 335 return V; 336 } 337 338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { 339 if (isa<FPMathOperator>(V)) 340 cast<Instruction>(V)->setFastMathFlags(FMF); 341 return V; 342 } 343 344 /// A helper function that returns an integer or floating-point constant with 345 /// value C. 346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 347 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 348 : ConstantFP::get(Ty, C); 349 } 350 351 /// Returns "best known" trip count for the specified loop \p L as defined by 352 /// the following procedure: 353 /// 1) Returns exact trip count if it is known. 354 /// 2) Returns expected trip count according to profile data if any. 355 /// 3) Returns upper bound estimate if it is known. 356 /// 4) Returns None if all of the above failed. 357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 358 // Check if exact trip count is known. 359 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 360 return ExpectedTC; 361 362 // Check if there is an expected trip count available from profile data. 363 if (LoopVectorizeWithBlockFrequency) 364 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 365 return EstimatedTC; 366 367 // Check if upper bound estimate is known. 368 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 369 return ExpectedTC; 370 371 return None; 372 } 373 374 namespace llvm { 375 376 /// InnerLoopVectorizer vectorizes loops which contain only one basic 377 /// block to a specified vectorization factor (VF). 378 /// This class performs the widening of scalars into vectors, or multiple 379 /// scalars. This class also implements the following features: 380 /// * It inserts an epilogue loop for handling loops that don't have iteration 381 /// counts that are known to be a multiple of the vectorization factor. 382 /// * It handles the code generation for reduction variables. 383 /// * Scalarization (implementation using scalars) of un-vectorizable 384 /// instructions. 385 /// InnerLoopVectorizer does not perform any vectorization-legality 386 /// checks, and relies on the caller to check for the different legality 387 /// aspects. The InnerLoopVectorizer relies on the 388 /// LoopVectorizationLegality class to provide information about the induction 389 /// and reduction variables that were found to a given vectorization factor. 
390 class InnerLoopVectorizer { 391 public: 392 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 393 LoopInfo *LI, DominatorTree *DT, 394 const TargetLibraryInfo *TLI, 395 const TargetTransformInfo *TTI, AssumptionCache *AC, 396 OptimizationRemarkEmitter *ORE, unsigned VecWidth, 397 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 398 LoopVectorizationCostModel *CM) 399 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 400 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 401 Builder(PSE.getSE()->getContext()), 402 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} 403 virtual ~InnerLoopVectorizer() = default; 404 405 /// Create a new empty loop. Unlink the old loop and connect the new one. 406 /// Return the pre-header block of the new loop. 407 BasicBlock *createVectorizedLoopSkeleton(); 408 409 /// Widen a single instruction within the innermost loop. 410 void widenInstruction(Instruction &I, VPUser &Operands, 411 VPTransformState &State); 412 413 /// Widen a single call instruction within the innermost loop. 414 void widenCallInstruction(CallInst &I, VPUser &ArgOperands, 415 VPTransformState &State); 416 417 /// Widen a single select instruction within the innermost loop. 418 void widenSelectInstruction(SelectInst &I, VPUser &Operands, 419 bool InvariantCond, VPTransformState &State); 420 421 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 422 void fixVectorizedLoop(); 423 424 // Return true if any runtime check is added. 425 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 426 427 /// A type for vectorized values in the new loop. Each value from the 428 /// original loop, when vectorized, is represented by UF vector values in the 429 /// new unrolled loop, where UF is the unroll factor. 430 using VectorParts = SmallVector<Value *, 2>; 431 432 /// Vectorize a single GetElementPtrInst based on information gathered and 433 /// decisions taken during planning. 434 void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, 435 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); 436 437 /// Vectorize a single PHINode in a block. This method handles the induction 438 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 439 /// arbitrary length vectors. 440 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); 441 442 /// A helper function to scalarize a single Instruction in the innermost loop. 443 /// Generates a sequence of scalar instances for each lane between \p MinLane 444 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 445 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 446 /// Instr's operands. 447 void scalarizeInstruction(Instruction *Instr, VPUser &Operands, 448 const VPIteration &Instance, bool IfPredicateInstr, 449 VPTransformState &State); 450 451 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 452 /// is provided, the integer induction variable will first be truncated to 453 /// the corresponding type. 454 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); 455 456 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a 457 /// vector or scalar value on-demand if one is not yet available. When 458 /// vectorizing a loop, we visit the definition of an instruction before its 459 /// uses. 
When visiting the definition, we either vectorize or scalarize the 460 /// instruction, creating an entry for it in the corresponding map. (In some 461 /// cases, such as induction variables, we will create both vector and scalar 462 /// entries.) Then, as we encounter uses of the definition, we derive values 463 /// for each scalar or vector use unless such a value is already available. 464 /// For example, if we scalarize a definition and one of its uses is vector, 465 /// we build the required vector on-demand with an insertelement sequence 466 /// when visiting the use. Otherwise, if the use is scalar, we can use the 467 /// existing scalar definition. 468 /// 469 /// Return a value in the new loop corresponding to \p V from the original 470 /// loop at unroll index \p Part. If the value has already been vectorized, 471 /// the corresponding vector entry in VectorLoopValueMap is returned. If, 472 /// however, the value has a scalar entry in VectorLoopValueMap, we construct 473 /// a new vector value on-demand by inserting the scalar values into a vector 474 /// with an insertelement sequence. If the value has been neither vectorized 475 /// nor scalarized, it must be loop invariant, so we simply broadcast the 476 /// value into a vector. 477 Value *getOrCreateVectorValue(Value *V, unsigned Part); 478 479 /// Return a value in the new loop corresponding to \p V from the original 480 /// loop at unroll and vector indices \p Instance. If the value has been 481 /// vectorized but not scalarized, the necessary extractelement instruction 482 /// will be generated. 483 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); 484 485 /// Construct the vector value of a scalarized value \p V one lane at a time. 486 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); 487 488 /// Try to vectorize interleaved access group \p Group with the base address 489 /// given in \p Addr, optionally masking the vector operations if \p 490 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 491 /// values in the vectorized loop. 492 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 493 VPTransformState &State, VPValue *Addr, 494 VPValue *BlockInMask = nullptr); 495 496 /// Vectorize Load and Store instructions with the base address given in \p 497 /// Addr, optionally masking the vector operations if \p BlockInMask is 498 /// non-null. Use \p State to translate given VPValues to IR values in the 499 /// vectorized loop. 500 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 501 VPValue *Addr, VPValue *StoredValue, 502 VPValue *BlockInMask); 503 504 /// Set the debug location in the builder using the debug location in 505 /// the instruction. 506 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 507 508 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 509 void fixNonInductionPHIs(void); 510 511 protected: 512 friend class LoopVectorizationPlanner; 513 514 /// A small list of PHINodes. 515 using PhiVector = SmallVector<PHINode *, 4>; 516 517 /// A type for scalarized values in the new loop. Each value from the 518 /// original loop, when scalarized, is represented by UF x VF scalar values 519 /// in the new unrolled loop, where UF is the unroll factor and VF is the 520 /// vectorization factor. 521 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 522 523 /// Set up the values of the IVs correctly when exiting the vector loop. 
524 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 525 Value *CountRoundDown, Value *EndValue, 526 BasicBlock *MiddleBlock); 527 528 /// Create a new induction variable inside L. 529 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 530 Value *Step, Instruction *DL); 531 532 /// Handle all cross-iteration phis in the header. 533 void fixCrossIterationPHIs(); 534 535 /// Fix a first-order recurrence. This is the second phase of vectorizing 536 /// this phi node. 537 void fixFirstOrderRecurrence(PHINode *Phi); 538 539 /// Fix a reduction cross-iteration phi. This is the second phase of 540 /// vectorizing this phi node. 541 void fixReduction(PHINode *Phi); 542 543 /// Clear NSW/NUW flags from reduction instructions if necessary. 544 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); 545 546 /// The Loop exit block may have single value PHI nodes with some 547 /// incoming value. While vectorizing we only handled real values 548 /// that were defined inside the loop and we should have one value for 549 /// each predecessor of its parent basic block. See PR14725. 550 void fixLCSSAPHIs(); 551 552 /// Iteratively sink the scalarized operands of a predicated instruction into 553 /// the block that was created for it. 554 void sinkScalarOperands(Instruction *PredInst); 555 556 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 557 /// represented as. 558 void truncateToMinimalBitwidths(); 559 560 /// Create a broadcast instruction. This method generates a broadcast 561 /// instruction (shuffle) for loop invariant values and for the induction 562 /// value. If this is the induction variable then we extend it to N, N+1, ... 563 /// this is needed because each iteration in the loop corresponds to a SIMD 564 /// element. 565 virtual Value *getBroadcastInstrs(Value *V); 566 567 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 568 /// to each vector element of Val. The sequence starts at StartIndex. 569 /// \p Opcode is relevant for FP induction variable. 570 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 571 Instruction::BinaryOps Opcode = 572 Instruction::BinaryOpsEnd); 573 574 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 575 /// variable on which to base the steps, \p Step is the size of the step, and 576 /// \p EntryVal is the value from the original loop that maps to the steps. 577 /// Note that \p EntryVal doesn't have to be an induction variable - it 578 /// can also be a truncate instruction. 579 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 580 const InductionDescriptor &ID); 581 582 /// Create a vector induction phi node based on an existing scalar one. \p 583 /// EntryVal is the value from the original loop that maps to the vector phi 584 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 585 /// truncate instruction, instead of widening the original IV, we widen a 586 /// version of the IV truncated to \p EntryVal's type. 587 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 588 Value *Step, Instruction *EntryVal); 589 590 /// Returns true if an instruction \p I should be scalarized instead of 591 /// vectorized for the chosen vectorization factor. 592 bool shouldScalarizeInstruction(Instruction *I) const; 593 594 /// Returns true if we should generate a scalar version of \p IV. 
595 bool needsScalarInduction(Instruction *IV) const; 596 597 /// If there is a cast involved in the induction variable \p ID, which should 598 /// be ignored in the vectorized loop body, this function records the 599 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 600 /// cast. We had already proved that the casted Phi is equal to the uncasted 601 /// Phi in the vectorized loop (under a runtime guard), and therefore 602 /// there is no need to vectorize the cast - the same value can be used in the 603 /// vector loop for both the Phi and the cast. 604 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 605 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 606 /// 607 /// \p EntryVal is the value from the original loop that maps to the vector 608 /// phi node and is used to distinguish what is the IV currently being 609 /// processed - original one (if \p EntryVal is a phi corresponding to the 610 /// original IV) or the "newly-created" one based on the proof mentioned above 611 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 612 /// latter case \p EntryVal is a TruncInst and we must not record anything for 613 /// that IV, but it's error-prone to expect callers of this routine to care 614 /// about that, hence this explicit parameter. 615 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, 616 const Instruction *EntryVal, 617 Value *VectorLoopValue, 618 unsigned Part, 619 unsigned Lane = UINT_MAX); 620 621 /// Generate a shuffle sequence that will reverse the vector Vec. 622 virtual Value *reverseVector(Value *Vec); 623 624 /// Returns (and creates if needed) the original loop trip count. 625 Value *getOrCreateTripCount(Loop *NewLoop); 626 627 /// Returns (and creates if needed) the trip count of the widened loop. 628 Value *getOrCreateVectorTripCount(Loop *NewLoop); 629 630 /// Returns a bitcasted value to the requested vector type. 631 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 632 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 633 const DataLayout &DL); 634 635 /// Emit a bypass check to see if the vector trip count is zero, including if 636 /// it overflows. 637 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 638 639 /// Emit a bypass check to see if all of the SCEV assumptions we've 640 /// had to make are correct. 641 void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 642 643 /// Emit bypass checks to check any memory assumptions we may have made. 644 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 645 646 /// Compute the transformed value of Index at offset StartValue using step 647 /// StepValue. 648 /// For integer induction, returns StartValue + Index * StepValue. 649 /// For pointer induction, returns StartValue[Index * StepValue]. 650 /// FIXME: The newly created binary instructions should contain nsw/nuw 651 /// flags, which can be found from the original scalar operations. 652 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 653 const DataLayout &DL, 654 const InductionDescriptor &ID) const; 655 656 /// Add additional metadata to \p To that was not present on \p Orig. 657 /// 658 /// Currently this is used to add the noalias annotations based on the 659 /// inserted memchecks. Use this for instructions that are *cloned* into the 660 /// vector loop. 
661 void addNewMetadata(Instruction *To, const Instruction *Orig); 662 663 /// Add metadata from one instruction to another. 664 /// 665 /// This includes both the original MDs from \p From and additional ones (\see 666 /// addNewMetadata). Use this for *newly created* instructions in the vector 667 /// loop. 668 void addMetadata(Instruction *To, Instruction *From); 669 670 /// Similar to the previous function but it adds the metadata to a 671 /// vector of instructions. 672 void addMetadata(ArrayRef<Value *> To, Instruction *From); 673 674 /// The original loop. 675 Loop *OrigLoop; 676 677 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 678 /// dynamic knowledge to simplify SCEV expressions and converts them to a 679 /// more usable form. 680 PredicatedScalarEvolution &PSE; 681 682 /// Loop Info. 683 LoopInfo *LI; 684 685 /// Dominator Tree. 686 DominatorTree *DT; 687 688 /// Alias Analysis. 689 AliasAnalysis *AA; 690 691 /// Target Library Info. 692 const TargetLibraryInfo *TLI; 693 694 /// Target Transform Info. 695 const TargetTransformInfo *TTI; 696 697 /// Assumption Cache. 698 AssumptionCache *AC; 699 700 /// Interface to emit optimization remarks. 701 OptimizationRemarkEmitter *ORE; 702 703 /// LoopVersioning. It's only set up (non-null) if memchecks were 704 /// used. 705 /// 706 /// This is currently only used to add no-alias metadata based on the 707 /// memchecks. The actually versioning is performed manually. 708 std::unique_ptr<LoopVersioning> LVer; 709 710 /// The vectorization SIMD factor to use. Each vector will have this many 711 /// vector elements. 712 unsigned VF; 713 714 /// The vectorization unroll factor to use. Each scalar is vectorized to this 715 /// many different vector instructions. 716 unsigned UF; 717 718 /// The builder that we use 719 IRBuilder<> Builder; 720 721 // --- Vectorization state --- 722 723 /// The vector-loop preheader. 724 BasicBlock *LoopVectorPreHeader; 725 726 /// The scalar-loop preheader. 727 BasicBlock *LoopScalarPreHeader; 728 729 /// Middle Block between the vector and the scalar. 730 BasicBlock *LoopMiddleBlock; 731 732 /// The ExitBlock of the scalar loop. 733 BasicBlock *LoopExitBlock; 734 735 /// The vector loop body. 736 BasicBlock *LoopVectorBody; 737 738 /// The scalar loop body. 739 BasicBlock *LoopScalarBody; 740 741 /// A list of all bypass blocks. The first block is the entry of the loop. 742 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 743 744 /// The new Induction variable which was added to the new block. 745 PHINode *Induction = nullptr; 746 747 /// The induction variable of the old basic block. 748 PHINode *OldInduction = nullptr; 749 750 /// Maps values from the original loop to their corresponding values in the 751 /// vectorized loop. A key value can map to either vector values, scalar 752 /// values or both kinds of values, depending on whether the key was 753 /// vectorized and scalarized. 754 VectorizerValueMap VectorLoopValueMap; 755 756 /// Store instructions that were predicated. 757 SmallVector<Instruction *, 4> PredicatedInstructions; 758 759 /// Trip count of the original loop. 760 Value *TripCount = nullptr; 761 762 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 763 Value *VectorTripCount = nullptr; 764 765 /// The legality analysis. 766 LoopVectorizationLegality *Legal; 767 768 /// The profitablity analysis. 769 LoopVectorizationCostModel *Cost; 770 771 // Record whether runtime checks are added. 
772 bool AddedSafetyChecks = false; 773 774 // Holds the end values for each induction variable. We save the end values 775 // so we can later fix-up the external users of the induction variables. 776 DenseMap<PHINode *, Value *> IVEndValues; 777 778 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 779 // fixed up at the end of vector code generation. 780 SmallVector<PHINode *, 8> OrigPHIsToFix; 781 }; 782 783 class InnerLoopUnroller : public InnerLoopVectorizer { 784 public: 785 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 786 LoopInfo *LI, DominatorTree *DT, 787 const TargetLibraryInfo *TLI, 788 const TargetTransformInfo *TTI, AssumptionCache *AC, 789 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 790 LoopVectorizationLegality *LVL, 791 LoopVectorizationCostModel *CM) 792 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, 793 UnrollFactor, LVL, CM) {} 794 795 private: 796 Value *getBroadcastInstrs(Value *V) override; 797 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 798 Instruction::BinaryOps Opcode = 799 Instruction::BinaryOpsEnd) override; 800 Value *reverseVector(Value *Vec) override; 801 }; 802 803 } // end namespace llvm 804 805 /// Look for a meaningful debug location on the instruction or it's 806 /// operands. 807 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 808 if (!I) 809 return I; 810 811 DebugLoc Empty; 812 if (I->getDebugLoc() != Empty) 813 return I; 814 815 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 816 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 817 if (OpInst->getDebugLoc() != Empty) 818 return OpInst; 819 } 820 821 return I; 822 } 823 824 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 825 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 826 const DILocation *DIL = Inst->getDebugLoc(); 827 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 828 !isa<DbgInfoIntrinsic>(Inst)) { 829 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); 830 if (NewDIL) 831 B.SetCurrentDebugLocation(NewDIL.getValue()); 832 else 833 LLVM_DEBUG(dbgs() 834 << "Failed to create new discriminator: " 835 << DIL->getFilename() << " Line: " << DIL->getLine()); 836 } 837 else 838 B.SetCurrentDebugLocation(DIL); 839 } else 840 B.SetCurrentDebugLocation(DebugLoc()); 841 } 842 843 /// Write a record \p DebugMsg about vectorization failure to the debug 844 /// output stream. If \p I is passed, it is an instruction that prevents 845 /// vectorization. 846 #ifndef NDEBUG 847 static void debugVectorizationFailure(const StringRef DebugMsg, 848 Instruction *I) { 849 dbgs() << "LV: Not vectorizing: " << DebugMsg; 850 if (I != nullptr) 851 dbgs() << " " << *I; 852 else 853 dbgs() << '.'; 854 dbgs() << '\n'; 855 } 856 #endif 857 858 /// Create an analysis remark that explains why vectorization failed 859 /// 860 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 861 /// RemarkName is the identifier for the remark. If \p I is passed it is an 862 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 863 /// the location of the remark. \return the remark object that can be 864 /// streamed to. 
865 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 866 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 867 Value *CodeRegion = TheLoop->getHeader(); 868 DebugLoc DL = TheLoop->getStartLoc(); 869 870 if (I) { 871 CodeRegion = I->getParent(); 872 // If there is no debug location attached to the instruction, revert back to 873 // using the loop's. 874 if (I->getDebugLoc()) 875 DL = I->getDebugLoc(); 876 } 877 878 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 879 R << "loop not vectorized: "; 880 return R; 881 } 882 883 namespace llvm { 884 885 void reportVectorizationFailure(const StringRef DebugMsg, 886 const StringRef OREMsg, const StringRef ORETag, 887 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 888 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 889 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 890 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 891 ORETag, TheLoop, I) << OREMsg); 892 } 893 894 } // end namespace llvm 895 896 #ifndef NDEBUG 897 /// \return string containing a file name and a line # for the given loop. 898 static std::string getDebugLocString(const Loop *L) { 899 std::string Result; 900 if (L) { 901 raw_string_ostream OS(Result); 902 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 903 LoopDbgLoc.print(OS); 904 else 905 // Just print the module name. 906 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 907 OS.flush(); 908 } 909 return Result; 910 } 911 #endif 912 913 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 914 const Instruction *Orig) { 915 // If the loop was versioned with memchecks, add the corresponding no-alias 916 // metadata. 917 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 918 LVer->annotateInstWithNoAlias(To, Orig); 919 } 920 921 void InnerLoopVectorizer::addMetadata(Instruction *To, 922 Instruction *From) { 923 propagateMetadata(To, From); 924 addNewMetadata(To, From); 925 } 926 927 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 928 Instruction *From) { 929 for (Value *V : To) { 930 if (Instruction *I = dyn_cast<Instruction>(V)) 931 addMetadata(I, From); 932 } 933 } 934 935 namespace llvm { 936 937 // Loop vectorization cost-model hints how the scalar epilogue loop should be 938 // lowered. 939 enum ScalarEpilogueLowering { 940 941 // The default: allowing scalar epilogues. 942 CM_ScalarEpilogueAllowed, 943 944 // Vectorization with OptForSize: don't allow epilogues. 945 CM_ScalarEpilogueNotAllowedOptSize, 946 947 // A special case of vectorisation with OptForSize: loops with a very small 948 // trip count are considered for vectorization under OptForSize, thereby 949 // making sure the cost of their loop body is dominant, free of runtime 950 // guards and scalar iteration overheads. 951 CM_ScalarEpilogueNotAllowedLowTripLoop, 952 953 // Loop hint predicate indicating an epilogue is undesired. 954 CM_ScalarEpilogueNotNeededUsePredicate 955 }; 956 957 /// LoopVectorizationCostModel - estimates the expected speedups due to 958 /// vectorization. 959 /// In many cases vectorization is not profitable. This can happen because of 960 /// a number of reasons. In this class we mainly attempt to predict the 961 /// expected speedup/slowdowns due to the supported instruction set. We use the 962 /// TargetTransformInfo to query the different backends for the cost of 963 /// different operations. 
964 class LoopVectorizationCostModel { 965 public: 966 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 967 PredicatedScalarEvolution &PSE, LoopInfo *LI, 968 LoopVectorizationLegality *Legal, 969 const TargetTransformInfo &TTI, 970 const TargetLibraryInfo *TLI, DemandedBits *DB, 971 AssumptionCache *AC, 972 OptimizationRemarkEmitter *ORE, const Function *F, 973 const LoopVectorizeHints *Hints, 974 InterleavedAccessInfo &IAI) 975 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 976 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 977 Hints(Hints), InterleaveInfo(IAI) {} 978 979 /// \return An upper bound for the vectorization factor, or None if 980 /// vectorization and interleaving should be avoided up front. 981 Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC); 982 983 /// \return True if runtime checks are required for vectorization, and false 984 /// otherwise. 985 bool runtimeChecksRequired(); 986 987 /// \return The most profitable vectorization factor and the cost of that VF. 988 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 989 /// then this vectorization factor will be selected if vectorization is 990 /// possible. 991 VectorizationFactor selectVectorizationFactor(unsigned MaxVF); 992 993 /// Setup cost-based decisions for user vectorization factor. 994 void selectUserVectorizationFactor(unsigned UserVF) { 995 collectUniformsAndScalars(UserVF); 996 collectInstsToScalarize(UserVF); 997 } 998 999 /// \return The size (in bits) of the smallest and widest types in the code 1000 /// that needs to be vectorized. We ignore values that remain scalar such as 1001 /// 64 bit loop indices. 1002 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1003 1004 /// \return The desired interleave count. 1005 /// If interleave count has been specified by metadata it will be returned. 1006 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1007 /// are the selected vectorization factor and the cost of the selected VF. 1008 unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); 1009 1010 /// Memory access instruction may be vectorized in more than one way. 1011 /// Form of instruction after vectorization depends on cost. 1012 /// This function takes cost-based decisions for Load/Store instructions 1013 /// and collects them in a map. This decisions map is used for building 1014 /// the lists of loop-uniform and loop-scalar instructions. 1015 /// The calculated cost is saved with widening decision in order to 1016 /// avoid redundant calculations. 1017 void setCostBasedWideningDecision(unsigned VF); 1018 1019 /// A struct that represents some properties of the register usage 1020 /// of a loop. 1021 struct RegisterUsage { 1022 /// Holds the number of loop invariant values that are used in the loop. 1023 /// The key is ClassID of target-provided register class. 1024 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1025 /// Holds the maximum number of concurrent live intervals in the loop. 1026 /// The key is ClassID of target-provided register class. 1027 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1028 }; 1029 1030 /// \return Returns information about the register usages of the loop for the 1031 /// given vectorization factors. 1032 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); 1033 1034 /// Collect values we want to ignore in the cost model. 
1035 void collectValuesToIgnore(); 1036 1037 /// \returns The smallest bitwidth each instruction can be represented with. 1038 /// The vector equivalents of these instructions should be truncated to this 1039 /// type. 1040 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1041 return MinBWs; 1042 } 1043 1044 /// \returns True if it is more profitable to scalarize instruction \p I for 1045 /// vectorization factor \p VF. 1046 bool isProfitableToScalarize(Instruction *I, unsigned VF) const { 1047 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); 1048 1049 // Cost model is not run in the VPlan-native path - return conservative 1050 // result until this changes. 1051 if (EnableVPlanNativePath) 1052 return false; 1053 1054 auto Scalars = InstsToScalarize.find(VF); 1055 assert(Scalars != InstsToScalarize.end() && 1056 "VF not yet analyzed for scalarization profitability"); 1057 return Scalars->second.find(I) != Scalars->second.end(); 1058 } 1059 1060 /// Returns true if \p I is known to be uniform after vectorization. 1061 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { 1062 if (VF == 1) 1063 return true; 1064 1065 // Cost model is not run in the VPlan-native path - return conservative 1066 // result until this changes. 1067 if (EnableVPlanNativePath) 1068 return false; 1069 1070 auto UniformsPerVF = Uniforms.find(VF); 1071 assert(UniformsPerVF != Uniforms.end() && 1072 "VF not yet analyzed for uniformity"); 1073 return UniformsPerVF->second.count(I); 1074 } 1075 1076 /// Returns true if \p I is known to be scalar after vectorization. 1077 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { 1078 if (VF == 1) 1079 return true; 1080 1081 // Cost model is not run in the VPlan-native path - return conservative 1082 // result until this changes. 1083 if (EnableVPlanNativePath) 1084 return false; 1085 1086 auto ScalarsPerVF = Scalars.find(VF); 1087 assert(ScalarsPerVF != Scalars.end() && 1088 "Scalar values are not calculated for VF"); 1089 return ScalarsPerVF->second.count(I); 1090 } 1091 1092 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1093 /// for vectorization factor \p VF. 1094 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { 1095 return VF > 1 && MinBWs.find(I) != MinBWs.end() && 1096 !isProfitableToScalarize(I, VF) && 1097 !isScalarAfterVectorization(I, VF); 1098 } 1099 1100 /// Decision that was taken during cost calculation for memory instruction. 1101 enum InstWidening { 1102 CM_Unknown, 1103 CM_Widen, // For consecutive accesses with stride +1. 1104 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1105 CM_Interleave, 1106 CM_GatherScatter, 1107 CM_Scalarize 1108 }; 1109 1110 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1111 /// instruction \p I and vector width \p VF. 1112 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, 1113 unsigned Cost) { 1114 assert(VF >= 2 && "Expected VF >=2"); 1115 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1116 } 1117 1118 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1119 /// interleaving group \p Grp and vector width \p VF. 1120 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, 1121 InstWidening W, unsigned Cost) { 1122 assert(VF >= 2 && "Expected VF >=2"); 1123 /// Broadcast this decicion to all instructions inside the group. 
1124 /// But the cost will be assigned to one instruction only. 1125 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1126 if (auto *I = Grp->getMember(i)) { 1127 if (Grp->getInsertPos() == I) 1128 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1129 else 1130 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1131 } 1132 } 1133 } 1134 1135 /// Return the cost model decision for the given instruction \p I and vector 1136 /// width \p VF. Return CM_Unknown if this instruction did not pass 1137 /// through the cost modeling. 1138 InstWidening getWideningDecision(Instruction *I, unsigned VF) { 1139 assert(VF >= 2 && "Expected VF >=2"); 1140 1141 // Cost model is not run in the VPlan-native path - return conservative 1142 // result until this changes. 1143 if (EnableVPlanNativePath) 1144 return CM_GatherScatter; 1145 1146 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); 1147 auto Itr = WideningDecisions.find(InstOnVF); 1148 if (Itr == WideningDecisions.end()) 1149 return CM_Unknown; 1150 return Itr->second.first; 1151 } 1152 1153 /// Return the vectorization cost for the given instruction \p I and vector 1154 /// width \p VF. 1155 unsigned getWideningCost(Instruction *I, unsigned VF) { 1156 assert(VF >= 2 && "Expected VF >=2"); 1157 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); 1158 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1159 "The cost is not calculated"); 1160 return WideningDecisions[InstOnVF].second; 1161 } 1162 1163 /// Return True if instruction \p I is an optimizable truncate whose operand 1164 /// is an induction variable. Such a truncate will be removed by adding a new 1165 /// induction variable with the destination type. 1166 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { 1167 // If the instruction is not a truncate, return false. 1168 auto *Trunc = dyn_cast<TruncInst>(I); 1169 if (!Trunc) 1170 return false; 1171 1172 // Get the source and destination types of the truncate. 1173 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1174 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1175 1176 // If the truncate is free for the given types, return false. Replacing a 1177 // free truncate with an induction variable would add an induction variable 1178 // update instruction to each iteration of the loop. We exclude from this 1179 // check the primary induction variable since it will need an update 1180 // instruction regardless. 1181 Value *Op = Trunc->getOperand(0); 1182 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1183 return false; 1184 1185 // If the truncated value is not an induction variable, return false. 1186 return Legal->isInductionPhi(Op); 1187 } 1188 1189 /// Collects the instructions to scalarize for each predicated instruction in 1190 /// the loop. 1191 void collectInstsToScalarize(unsigned VF); 1192 1193 /// Collect Uniform and Scalar values for the given \p VF. 1194 /// The sets depend on CM decision for Load/Store instructions 1195 /// that may be vectorized as interleave, gather-scatter or scalarized. 1196 void collectUniformsAndScalars(unsigned VF) { 1197 // Do the analysis once. 
1198 if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) 1199 return; 1200 setCostBasedWideningDecision(VF); 1201 collectLoopUniforms(VF); 1202 collectLoopScalars(VF); 1203 } 1204 1205 /// Returns true if the target machine supports masked store operation 1206 /// for the given \p DataType and kind of access to \p Ptr. 1207 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1208 return Legal->isConsecutivePtr(Ptr) && 1209 TTI.isLegalMaskedStore(DataType, Alignment); 1210 } 1211 1212 /// Returns true if the target machine supports masked load operation 1213 /// for the given \p DataType and kind of access to \p Ptr. 1214 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { 1215 return Legal->isConsecutivePtr(Ptr) && 1216 TTI.isLegalMaskedLoad(DataType, Alignment); 1217 } 1218 1219 /// Returns true if the target machine supports masked scatter operation 1220 /// for the given \p DataType. 1221 bool isLegalMaskedScatter(Type *DataType, Align Alignment) { 1222 return TTI.isLegalMaskedScatter(DataType, Alignment); 1223 } 1224 1225 /// Returns true if the target machine supports masked gather operation 1226 /// for the given \p DataType. 1227 bool isLegalMaskedGather(Type *DataType, Align Alignment) { 1228 return TTI.isLegalMaskedGather(DataType, Alignment); 1229 } 1230 1231 /// Returns true if the target machine can represent \p V as a masked gather 1232 /// or scatter operation. 1233 bool isLegalGatherOrScatter(Value *V) { 1234 bool LI = isa<LoadInst>(V); 1235 bool SI = isa<StoreInst>(V); 1236 if (!LI && !SI) 1237 return false; 1238 auto *Ty = getMemInstValueType(V); 1239 Align Align = getLoadStoreAlignment(V); 1240 return (LI && isLegalMaskedGather(Ty, Align)) || 1241 (SI && isLegalMaskedScatter(Ty, Align)); 1242 } 1243 1244 /// Returns true if \p I is an instruction that will be scalarized with 1245 /// predication. Such instructions include conditional stores and 1246 /// instructions that may divide by zero. 1247 /// If a non-zero VF has been calculated, we check if I will be scalarized 1248 /// predication for that VF. 1249 bool isScalarWithPredication(Instruction *I, unsigned VF = 1); 1250 1251 // Returns true if \p I is an instruction that will be predicated either 1252 // through scalar predication or masked load/store or masked gather/scatter. 1253 // Superset of instructions that return true for isScalarWithPredication. 1254 bool isPredicatedInst(Instruction *I) { 1255 if (!blockNeedsPredication(I->getParent())) 1256 return false; 1257 // Loads and stores that need some form of masked operation are predicated 1258 // instructions. 1259 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1260 return Legal->isMaskRequired(I); 1261 return isScalarWithPredication(I); 1262 } 1263 1264 /// Returns true if \p I is a memory instruction with consecutive memory 1265 /// access that can be widened. 1266 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); 1267 1268 /// Returns true if \p I is a memory instruction in an interleaved-group 1269 /// of memory accesses that can be vectorized with wide vector loads/stores 1270 /// and shuffles. 1271 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); 1272 1273 /// Check if \p Instr belongs to any interleaved access group. 1274 bool isAccessInterleaved(Instruction *Instr) { 1275 return InterleaveInfo.isInterleaved(Instr); 1276 } 1277 1278 /// Get the interleaved access group that \p Instr belongs to. 
1279 const InterleaveGroup<Instruction> * 1280 getInterleavedAccessGroup(Instruction *Instr) { 1281 return InterleaveInfo.getInterleaveGroup(Instr); 1282 } 1283 1284 /// Returns true if an interleaved group requires a scalar iteration 1285 /// to handle accesses with gaps, and there is nothing preventing us from 1286 /// creating a scalar epilogue. 1287 bool requiresScalarEpilogue() const { 1288 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); 1289 } 1290 1291 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1292 /// loop hint annotation. 1293 bool isScalarEpilogueAllowed() const { 1294 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1295 } 1296 1297 /// Returns true if all loop blocks should be masked to fold tail loop. 1298 bool foldTailByMasking() const { return FoldTailByMasking; } 1299 1300 bool blockNeedsPredication(BasicBlock *BB) { 1301 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1302 } 1303 1304 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1305 /// with factor VF. Return the cost of the instruction, including 1306 /// scalarization overhead if it's needed. 1307 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); 1308 1309 /// Estimate cost of a call instruction CI if it were vectorized with factor 1310 /// VF. Return the cost of the instruction, including scalarization overhead 1311 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1312 /// scalarized - 1313 /// i.e. either vector version isn't available, or is too expensive. 1314 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); 1315 1316 /// Invalidates decisions already taken by the cost model. 1317 void invalidateCostModelingDecisions() { 1318 WideningDecisions.clear(); 1319 Uniforms.clear(); 1320 Scalars.clear(); 1321 } 1322 1323 private: 1324 unsigned NumPredStores = 0; 1325 1326 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1327 /// than zero. One is returned if vectorization should best be avoided due 1328 /// to cost. 1329 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1330 1331 /// The vectorization cost is a combination of the cost itself and a boolean 1332 /// indicating whether any of the contributing operations will actually 1333 /// operate on 1334 /// vector values after type legalization in the backend. If this latter value 1335 /// is 1336 /// false, then all operations will be scalarized (i.e. no vectorization has 1337 /// actually taken place). 1338 using VectorizationCostTy = std::pair<unsigned, bool>; 1339 1340 /// Returns the expected execution cost. The unit of the cost does 1341 /// not matter because we use the 'cost' units to compare different 1342 /// vector widths. The cost that is returned is *not* normalized by 1343 /// the factor width. 1344 VectorizationCostTy expectedCost(unsigned VF); 1345 1346 /// Returns the execution time cost of an instruction for a given vector 1347 /// width. Vector width of one means scalar. 1348 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); 1349 1350 /// The cost-computation logic from getInstructionCost which provides 1351 /// the vector type as an output parameter. 1352 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); 1353 1354 /// Calculate vectorization cost of memory instruction \p I. 
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for a scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for an interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for a Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
1425 /// The data is collected per VF. 1426 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1427 1428 /// Holds the instructions (address computations) that are forced to be 1429 /// scalarized. 1430 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1431 1432 /// Returns the expected difference in cost from scalarizing the expression 1433 /// feeding a predicated instruction \p PredInst. The instructions to 1434 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1435 /// non-negative return value implies the expression will be scalarized. 1436 /// Currently, only single-use chains are considered for scalarization. 1437 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1438 unsigned VF); 1439 1440 /// Collect the instructions that are uniform after vectorization. An 1441 /// instruction is uniform if we represent it with a single scalar value in 1442 /// the vectorized loop corresponding to each vector iteration. Examples of 1443 /// uniform instructions include pointer operands of consecutive or 1444 /// interleaved memory accesses. Note that although uniformity implies an 1445 /// instruction will be scalar, the reverse is not true. In general, a 1446 /// scalarized instruction will be represented by VF scalar values in the 1447 /// vectorized loop, each corresponding to an iteration of the original 1448 /// scalar loop. 1449 void collectLoopUniforms(unsigned VF); 1450 1451 /// Collect the instructions that are scalar after vectorization. An 1452 /// instruction is scalar if it is known to be uniform or will be scalarized 1453 /// during vectorization. Non-uniform scalarized instructions will be 1454 /// represented by VF values in the vectorized loop, each corresponding to an 1455 /// iteration of the original scalar loop. 1456 void collectLoopScalars(unsigned VF); 1457 1458 /// Keeps cost model vectorization decision and cost for instructions. 1459 /// Right now it is used for memory instructions only. 1460 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1461 std::pair<InstWidening, unsigned>>; 1462 1463 DecisionList WideningDecisions; 1464 1465 /// Returns true if \p V is expected to be vectorized and it needs to be 1466 /// extracted. 1467 bool needsExtract(Value *V, unsigned VF) const { 1468 Instruction *I = dyn_cast<Instruction>(V); 1469 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1470 return false; 1471 1472 // Assume we can vectorize V (and hence we need extraction) if the 1473 // scalars are not computed yet. This can happen, because it is called 1474 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1475 // the scalars are collected. That should be a safe assumption in most 1476 // cases, because we check if the operands have vectorizable types 1477 // beforehand in LoopVectorizationLegality. 1478 return Scalars.find(VF) == Scalars.end() || 1479 !isScalarAfterVectorization(I, VF); 1480 }; 1481 1482 /// Returns a range containing only operands needing to be extracted. 1483 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1484 unsigned VF) { 1485 return SmallVector<Value *, 4>(make_filter_range( 1486 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1487 } 1488 1489 public: 1490 /// The loop that we evaluate. 1491 Loop *TheLoop; 1492 1493 /// Predicated scalar evolution analysis. 1494 PredicatedScalarEvolution &PSE; 1495 1496 /// Loop Info analysis. 
1497 LoopInfo *LI; 1498 1499 /// Vectorization legality. 1500 LoopVectorizationLegality *Legal; 1501 1502 /// Vector target information. 1503 const TargetTransformInfo &TTI; 1504 1505 /// Target Library Info. 1506 const TargetLibraryInfo *TLI; 1507 1508 /// Demanded bits analysis. 1509 DemandedBits *DB; 1510 1511 /// Assumption cache. 1512 AssumptionCache *AC; 1513 1514 /// Interface to emit optimization remarks. 1515 OptimizationRemarkEmitter *ORE; 1516 1517 const Function *TheFunction; 1518 1519 /// Loop Vectorize Hint. 1520 const LoopVectorizeHints *Hints; 1521 1522 /// The interleave access information contains groups of interleaved accesses 1523 /// with the same stride and close to each other. 1524 InterleavedAccessInfo &InterleaveInfo; 1525 1526 /// Values to ignore in the cost model. 1527 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1528 1529 /// Values to ignore in the cost model when VF > 1. 1530 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1531 }; 1532 1533 } // end namespace llvm 1534 1535 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1536 // vectorization. The loop needs to be annotated with #pragma omp simd 1537 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1538 // vector length information is not provided, vectorization is not considered 1539 // explicit. Interleave hints are not allowed either. These limitations will be 1540 // relaxed in the future. 1541 // Please, note that we are currently forced to abuse the pragma 'clang 1542 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1543 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1544 // provides *explicit vectorization hints* (LV can bypass legal checks and 1545 // assume that vectorization is legal). However, both hints are implemented 1546 // using the same metadata (llvm.loop.vectorize, processed by 1547 // LoopVectorizeHints). This will be fixed in the future when the native IR 1548 // representation for pragma 'omp simd' is introduced. 1549 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1550 OptimizationRemarkEmitter *ORE) { 1551 assert(!OuterLp->empty() && "This is not an outer loop"); 1552 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1553 1554 // Only outer loops with an explicit vectorization hint are supported. 1555 // Unannotated outer loops are ignored. 1556 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1557 return false; 1558 1559 Function *Fn = OuterLp->getHeader()->getParent(); 1560 if (!Hints.allowVectorization(Fn, OuterLp, 1561 true /*VectorizeOnlyWhenForced*/)) { 1562 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1563 return false; 1564 } 1565 1566 if (Hints.getInterleave() > 1) { 1567 // TODO: Interleave support is future work. 1568 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1569 "outer loops.\n"); 1570 Hints.emitRemarkWithHints(); 1571 return false; 1572 } 1573 1574 return true; 1575 } 1576 1577 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1578 OptimizationRemarkEmitter *ORE, 1579 SmallVectorImpl<Loop *> &V) { 1580 // Collect inner loops and outer loops without irreducible control flow. For 1581 // now, only collect outer loops that have explicit vectorization hints. If we 1582 // are stress testing the VPlan H-CFG construction, we collect the outermost 1583 // loop of every loop nest. 
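  // Illustrative sketch (hypothetical loop nest, not taken from this file):
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  //   for (i = 0; i < N; ++i)     // explicitly annotated outer loop
  //     for (j = 0; j < M; ++j)
  //       A[i][j] += B[i][j];
  // With EnableVPlanNativePath the annotated outer loop is collected below;
  // otherwise the recursion at the end of this function descends into the
  // nest and collects the innermost loop for the inner-loop vectorizer.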
1584 if (L.empty() || VPlanBuildStressTest || 1585 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1586 LoopBlocksRPO RPOT(&L); 1587 RPOT.perform(LI); 1588 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1589 V.push_back(&L); 1590 // TODO: Collect inner loops inside marked outer loops in case 1591 // vectorization fails for the outer loop. Do not invoke 1592 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1593 // already known to be reducible. We can use an inherited attribute for 1594 // that. 1595 return; 1596 } 1597 } 1598 for (Loop *InnerL : L) 1599 collectSupportedLoops(*InnerL, LI, ORE, V); 1600 } 1601 1602 namespace { 1603 1604 /// The LoopVectorize Pass. 1605 struct LoopVectorize : public FunctionPass { 1606 /// Pass identification, replacement for typeid 1607 static char ID; 1608 1609 LoopVectorizePass Impl; 1610 1611 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1612 bool VectorizeOnlyWhenForced = false) 1613 : FunctionPass(ID), 1614 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1615 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1616 } 1617 1618 bool runOnFunction(Function &F) override { 1619 if (skipFunction(F)) 1620 return false; 1621 1622 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1623 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1624 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1625 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1626 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1627 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1628 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1629 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1630 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1631 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1632 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1633 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1634 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1635 1636 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1637 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1638 1639 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1640 GetLAA, *ORE, PSI).MadeAnyChange; 1641 } 1642 1643 void getAnalysisUsage(AnalysisUsage &AU) const override { 1644 AU.addRequired<AssumptionCacheTracker>(); 1645 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1646 AU.addRequired<DominatorTreeWrapperPass>(); 1647 AU.addRequired<LoopInfoWrapperPass>(); 1648 AU.addRequired<ScalarEvolutionWrapperPass>(); 1649 AU.addRequired<TargetTransformInfoWrapperPass>(); 1650 AU.addRequired<AAResultsWrapperPass>(); 1651 AU.addRequired<LoopAccessLegacyAnalysis>(); 1652 AU.addRequired<DemandedBitsWrapperPass>(); 1653 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1654 AU.addRequired<InjectTLIMappingsLegacy>(); 1655 1656 // We currently do not preserve loopinfo/dominator analyses with outer loop 1657 // vectorization. Until this is addressed, mark these analyses as preserved 1658 // only for non-VPlan-native path. 1659 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1660 if (!EnableVPlanNativePath) { 1661 AU.addPreserved<LoopInfoWrapperPass>(); 1662 AU.addPreserved<DominatorTreeWrapperPass>(); 1663 } 1664 1665 AU.addPreserved<BasicAAWrapperPass>(); 1666 AU.addPreserved<GlobalsAAWrapperPass>(); 1667 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1668 } 1669 }; 1670 1671 } // end anonymous namespace 1672 1673 //===----------------------------------------------------------------------===// 1674 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1675 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1676 //===----------------------------------------------------------------------===// 1677 1678 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1679 // We need to place the broadcast of invariant variables outside the loop, 1680 // but only if it's proven safe to do so. Else, broadcast will be inside 1681 // vector loop body. 1682 Instruction *Instr = dyn_cast<Instruction>(V); 1683 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1684 (!Instr || 1685 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1686 // Place the code for broadcasting invariant variables in the new preheader. 1687 IRBuilder<>::InsertPointGuard Guard(Builder); 1688 if (SafeToHoist) 1689 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1690 1691 // Broadcast the scalar into all locations in the vector. 1692 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1693 1694 return Shuf; 1695 } 1696 1697 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1698 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1699 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1700 "Expected either an induction phi-node or a truncate of it!"); 1701 Value *Start = II.getStartValue(); 1702 1703 // Construct the initial value of the vector IV in the vector loop preheader 1704 auto CurrIP = Builder.saveIP(); 1705 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1706 if (isa<TruncInst>(EntryVal)) { 1707 assert(Start->getType()->isIntegerTy() && 1708 "Truncation requires an integer type"); 1709 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1710 Step = Builder.CreateTrunc(Step, TruncType); 1711 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1712 } 1713 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1714 Value *SteppedStart = 1715 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1716 1717 // We create vector phi nodes for both integer and floating-point induction 1718 // variables. Here, we determine the kind of arithmetic we will perform. 1719 Instruction::BinaryOps AddOp; 1720 Instruction::BinaryOps MulOp; 1721 if (Step->getType()->isIntegerTy()) { 1722 AddOp = Instruction::Add; 1723 MulOp = Instruction::Mul; 1724 } else { 1725 AddOp = II.getInductionOpcode(); 1726 MulOp = Instruction::FMul; 1727 } 1728 1729 // Multiply the vectorization factor by the step using integer or 1730 // floating-point arithmetic as appropriate. 1731 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1732 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1733 1734 // Create a vector splat to use in the induction update. 1735 // 1736 // FIXME: If the step is non-constant, we create the vector splat with 1737 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1738 // handle a constant vector splat. 1739 Value *SplatVF = 1740 isa<Constant>(Mul) 1741 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1742 : Builder.CreateVectorSplat(VF, Mul); 1743 Builder.restoreIP(CurrIP); 1744 1745 // We may need to add the step a number of times, depending on the unroll 1746 // factor. The last of those goes into the PHI. 1747 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1748 &*LoopVectorBody->getFirstInsertionPt()); 1749 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1750 Instruction *LastInduction = VecInd; 1751 for (unsigned Part = 0; Part < UF; ++Part) { 1752 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1753 1754 if (isa<TruncInst>(EntryVal)) 1755 addMetadata(LastInduction, EntryVal); 1756 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1757 1758 LastInduction = cast<Instruction>(addFastMathFlag( 1759 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1760 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1761 } 1762 1763 // Move the last step to the end of the latch block. This ensures consistent 1764 // placement of all induction updates. 1765 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1766 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1767 auto *ICmp = cast<Instruction>(Br->getCondition()); 1768 LastInduction->moveBefore(ICmp); 1769 LastInduction->setName("vec.ind.next"); 1770 1771 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1772 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1773 } 1774 1775 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1776 return Cost->isScalarAfterVectorization(I, VF) || 1777 Cost->isProfitableToScalarize(I, VF); 1778 } 1779 1780 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1781 if (shouldScalarizeInstruction(IV)) 1782 return true; 1783 auto isScalarInst = [&](User *U) -> bool { 1784 auto *I = cast<Instruction>(U); 1785 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1786 }; 1787 return llvm::any_of(IV->users(), isScalarInst); 1788 } 1789 1790 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1791 const InductionDescriptor &ID, const Instruction *EntryVal, 1792 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1793 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1794 "Expected either an induction phi-node or a truncate of it!"); 1795 1796 // This induction variable is not the phi from the original loop but the 1797 // newly-created IV based on the proof that casted Phi is equal to the 1798 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1799 // re-uses the same InductionDescriptor that original IV uses but we don't 1800 // have to do any recording in this case - that is done when original IV is 1801 // processed. 1802 if (isa<TruncInst>(EntryVal)) 1803 return; 1804 1805 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1806 if (Casts.empty()) 1807 return; 1808 // Only the first Cast instruction in the Casts vector is of interest. 1809 // The rest of the Casts (if exist) have no uses outside the 1810 // induction update chain itself. 
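  // Illustrative example (hypothetical IR, simplified): if the descriptor's
  // cast chain contains a single cast of the induction phi, e.g.
  //   %iv.cast = sext i32 %iv to i64
  // then only that first cast needs the vector (or per-lane scalar) mapping
  // recorded below; any further casts have no uses outside the induction
  // update chain.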
1811 Instruction *CastInst = *Casts.begin(); 1812 if (Lane < UINT_MAX) 1813 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1814 else 1815 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1816 } 1817 1818 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1819 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1820 "Primary induction variable must have an integer type"); 1821 1822 auto II = Legal->getInductionVars().find(IV); 1823 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1824 1825 auto ID = II->second; 1826 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1827 1828 // The value from the original loop to which we are mapping the new induction 1829 // variable. 1830 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1831 1832 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1833 1834 // Generate code for the induction step. Note that induction steps are 1835 // required to be loop-invariant 1836 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1837 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1838 "Induction step should be loop invariant"); 1839 if (PSE.getSE()->isSCEVable(IV->getType())) { 1840 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1841 return Exp.expandCodeFor(Step, Step->getType(), 1842 LoopVectorPreHeader->getTerminator()); 1843 } 1844 return cast<SCEVUnknown>(Step)->getValue(); 1845 }; 1846 1847 // The scalar value to broadcast. This is derived from the canonical 1848 // induction variable. If a truncation type is given, truncate the canonical 1849 // induction variable and step. Otherwise, derive these values from the 1850 // induction descriptor. 1851 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1852 Value *ScalarIV = Induction; 1853 if (IV != OldInduction) { 1854 ScalarIV = IV->getType()->isIntegerTy() 1855 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1856 : Builder.CreateCast(Instruction::SIToFP, Induction, 1857 IV->getType()); 1858 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1859 ScalarIV->setName("offset.idx"); 1860 } 1861 if (Trunc) { 1862 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1863 assert(Step->getType()->isIntegerTy() && 1864 "Truncation requires an integer step"); 1865 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1866 Step = Builder.CreateTrunc(Step, TruncType); 1867 } 1868 return ScalarIV; 1869 }; 1870 1871 // Create the vector values from the scalar IV, in the absence of creating a 1872 // vector IV. 1873 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1874 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1875 for (unsigned Part = 0; Part < UF; ++Part) { 1876 Value *EntryPart = 1877 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1878 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1879 if (Trunc) 1880 addMetadata(EntryPart, Trunc); 1881 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1882 } 1883 }; 1884 1885 // Now do the actual transformations, and start with creating the step value. 1886 Value *Step = CreateStepValue(ID.getStep()); 1887 if (VF <= 1) { 1888 Value *ScalarIV = CreateScalarIV(Step); 1889 CreateSplatIV(ScalarIV, Step); 1890 return; 1891 } 1892 1893 // Determine if we want a scalar version of the induction variable. 
This is 1894 // true if the induction variable itself is not widened, or if it has at 1895 // least one user in the loop that is not widened. 1896 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1897 if (!NeedsScalarIV) { 1898 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1899 return; 1900 } 1901 1902 // Try to create a new independent vector induction variable. If we can't 1903 // create the phi node, we will splat the scalar induction variable in each 1904 // loop iteration. 1905 if (!shouldScalarizeInstruction(EntryVal)) { 1906 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1907 Value *ScalarIV = CreateScalarIV(Step); 1908 // Create scalar steps that can be used by instructions we will later 1909 // scalarize. Note that the addition of the scalar steps will not increase 1910 // the number of instructions in the loop in the common case prior to 1911 // InstCombine. We will be trading one vector extract for each scalar step. 1912 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1913 return; 1914 } 1915 1916 // All IV users are scalar instructions, so only emit a scalar IV, not a 1917 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1918 // predicate used by the masked loads/stores. 1919 Value *ScalarIV = CreateScalarIV(Step); 1920 if (!Cost->isScalarEpilogueAllowed()) 1921 CreateSplatIV(ScalarIV, Step); 1922 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1923 } 1924 1925 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1926 Instruction::BinaryOps BinOp) { 1927 // Create and check the types. 1928 auto *ValVTy = cast<VectorType>(Val->getType()); 1929 int VLen = ValVTy->getNumElements(); 1930 1931 Type *STy = Val->getType()->getScalarType(); 1932 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1933 "Induction Step must be an integer or FP"); 1934 assert(Step->getType() == STy && "Step has wrong type"); 1935 1936 SmallVector<Constant *, 8> Indices; 1937 1938 if (STy->isIntegerTy()) { 1939 // Create a vector of consecutive numbers from zero to VF. 1940 for (int i = 0; i < VLen; ++i) 1941 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1942 1943 // Add the consecutive indices to the vector value. 1944 Constant *Cv = ConstantVector::get(Indices); 1945 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1946 Step = Builder.CreateVectorSplat(VLen, Step); 1947 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1948 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1949 // which can be found from the original scalar operations. 1950 Step = Builder.CreateMul(Cv, Step); 1951 return Builder.CreateAdd(Val, Step, "induction"); 1952 } 1953 1954 // Floating point induction. 1955 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1956 "Binary Opcode should be specified for FP induction"); 1957 // Create a vector of consecutive numbers from zero to VF. 1958 for (int i = 0; i < VLen; ++i) 1959 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1960 1961 // Add the consecutive indices to the vector value. 1962 Constant *Cv = ConstantVector::get(Indices); 1963 1964 Step = Builder.CreateVectorSplat(VLen, Step); 1965 1966 // Floating point operations had to be 'fast' to enable the induction. 
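  // Worked example with illustrative values: for VF = 4, StartIdx = 0,
  // Step = 2.0 and BinOp = fadd, Cv is <0.0, 1.0, 2.0, 3.0>, the splatted
  // step is <2.0, 2.0, 2.0, 2.0>, and the code below computes
  //   induction = Val fadd (Cv fmul Step) = Val + <0.0, 2.0, 4.0, 6.0>
  // with all operations marked 'fast'.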
1967 FastMathFlags Flags; 1968 Flags.setFast(); 1969 1970 Value *MulOp = Builder.CreateFMul(Cv, Step); 1971 if (isa<Instruction>(MulOp)) 1972 // Have to check, MulOp may be a constant 1973 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1974 1975 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1976 if (isa<Instruction>(BOp)) 1977 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1978 return BOp; 1979 } 1980 1981 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1982 Instruction *EntryVal, 1983 const InductionDescriptor &ID) { 1984 // We shouldn't have to build scalar steps if we aren't vectorizing. 1985 assert(VF > 1 && "VF should be greater than one"); 1986 1987 // Get the value type and ensure it and the step have the same integer type. 1988 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1989 assert(ScalarIVTy == Step->getType() && 1990 "Val and Step should have the same type"); 1991 1992 // We build scalar steps for both integer and floating-point induction 1993 // variables. Here, we determine the kind of arithmetic we will perform. 1994 Instruction::BinaryOps AddOp; 1995 Instruction::BinaryOps MulOp; 1996 if (ScalarIVTy->isIntegerTy()) { 1997 AddOp = Instruction::Add; 1998 MulOp = Instruction::Mul; 1999 } else { 2000 AddOp = ID.getInductionOpcode(); 2001 MulOp = Instruction::FMul; 2002 } 2003 2004 // Determine the number of scalars we need to generate for each unroll 2005 // iteration. If EntryVal is uniform, we only need to generate the first 2006 // lane. Otherwise, we generate all VF values. 2007 unsigned Lanes = 2008 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2009 : VF; 2010 // Compute the scalar steps and save the results in VectorLoopValueMap. 2011 for (unsigned Part = 0; Part < UF; ++Part) { 2012 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2013 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2014 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2015 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2016 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2017 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2018 } 2019 } 2020 } 2021 2022 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2023 assert(V != Induction && "The new induction variable should not be used."); 2024 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2025 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2026 2027 // If we have a stride that is replaced by one, do it here. Defer this for 2028 // the VPlan-native path until we start running Legal checks in that path. 2029 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2030 V = ConstantInt::get(V->getType(), 1); 2031 2032 // If we have a vector mapped to this value, return it. 2033 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2034 return VectorLoopValueMap.getVectorValue(V, Part); 2035 2036 // If the value has not been vectorized, check if it has been scalarized 2037 // instead. If it has been scalarized, and we actually need the value in 2038 // vector form, we will construct the vector values on demand. 2039 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2040 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2041 2042 // If we've scalarized a value, that value should be an instruction. 
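    // Sketch with assumed names: if V was scalarized with VF = 4, the scalar
    // map holds one value per lane for this unroll part, e.g. %v.0 .. %v.3.
    // The code below either broadcasts lane 0 (uniform case) or rebuilds a
    // vector from the lanes with insertelement instructions.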
2043 auto *I = cast<Instruction>(V); 2044 2045 // If we aren't vectorizing, we can just copy the scalar map values over to 2046 // the vector map. 2047 if (VF == 1) { 2048 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2049 return ScalarValue; 2050 } 2051 2052 // Get the last scalar instruction we generated for V and Part. If the value 2053 // is known to be uniform after vectorization, this corresponds to lane zero 2054 // of the Part unroll iteration. Otherwise, the last instruction is the one 2055 // we created for the last vector lane of the Part unroll iteration. 2056 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2057 auto *LastInst = cast<Instruction>( 2058 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2059 2060 // Set the insert point after the last scalarized instruction. This ensures 2061 // the insertelement sequence will directly follow the scalar definitions. 2062 auto OldIP = Builder.saveIP(); 2063 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2064 Builder.SetInsertPoint(&*NewIP); 2065 2066 // However, if we are vectorizing, we need to construct the vector values. 2067 // If the value is known to be uniform after vectorization, we can just 2068 // broadcast the scalar value corresponding to lane zero for each unroll 2069 // iteration. Otherwise, we construct the vector values using insertelement 2070 // instructions. Since the resulting vectors are stored in 2071 // VectorLoopValueMap, we will only generate the insertelements once. 2072 Value *VectorValue = nullptr; 2073 if (Cost->isUniformAfterVectorization(I, VF)) { 2074 VectorValue = getBroadcastInstrs(ScalarValue); 2075 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2076 } else { 2077 // Initialize packing with insertelements to start from undef. 2078 Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); 2079 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2080 for (unsigned Lane = 0; Lane < VF; ++Lane) 2081 packScalarIntoVectorValue(V, {Part, Lane}); 2082 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2083 } 2084 Builder.restoreIP(OldIP); 2085 return VectorValue; 2086 } 2087 2088 // If this scalar is unknown, assume that it is a constant or that it is 2089 // loop invariant. Broadcast V and save the value for future uses. 2090 Value *B = getBroadcastInstrs(V); 2091 VectorLoopValueMap.setVectorValue(V, Part, B); 2092 return B; 2093 } 2094 2095 Value * 2096 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2097 const VPIteration &Instance) { 2098 // If the value is not an instruction contained in the loop, it should 2099 // already be scalar. 2100 if (OrigLoop->isLoopInvariant(V)) 2101 return V; 2102 2103 assert(Instance.Lane > 0 2104 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2105 : true && "Uniform values only have lane zero"); 2106 2107 // If the value from the original loop has not been vectorized, it is 2108 // represented by UF x VF scalar values in the new loop. Return the requested 2109 // scalar value. 2110 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2111 return VectorLoopValueMap.getScalarValue(V, Instance); 2112 2113 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2114 // for the given unroll part. If this entry is not a vector type (i.e., the 2115 // vectorization factor is one), there is no need to generate an 2116 // extractelement instruction. 
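  // Sketch with assumed values: for VF = 4 and Instance = {Part = 0, Lane = 2}
  // a widened value %wide is narrowed back to a scalar as
  //   %scalar = extractelement <4 x i32> %wide, i32 2
  // whereas for VF = 1 the mapped value is already scalar and returned as is.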
2117 auto *U = getOrCreateVectorValue(V, Instance.Part); 2118 if (!U->getType()->isVectorTy()) { 2119 assert(VF == 1 && "Value not scalarized has non-vector type"); 2120 return U; 2121 } 2122 2123 // Otherwise, the value from the original loop has been vectorized and is 2124 // represented by UF vector values. Extract and return the requested scalar 2125 // value from the appropriate vector lane. 2126 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2127 } 2128 2129 void InnerLoopVectorizer::packScalarIntoVectorValue( 2130 Value *V, const VPIteration &Instance) { 2131 assert(V != Induction && "The new induction variable should not be used."); 2132 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2133 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2134 2135 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2136 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2137 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2138 Builder.getInt32(Instance.Lane)); 2139 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2140 } 2141 2142 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2143 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2144 SmallVector<int, 8> ShuffleMask; 2145 for (unsigned i = 0; i < VF; ++i) 2146 ShuffleMask.push_back(VF - i - 1); 2147 2148 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2149 ShuffleMask, "reverse"); 2150 } 2151 2152 // Return whether we allow using masked interleave-groups (for dealing with 2153 // strided loads/stores that reside in predicated blocks, or for dealing 2154 // with gaps). 2155 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2156 // If an override option has been passed in for interleaved accesses, use it. 2157 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2158 return EnableMaskedInterleavedMemAccesses; 2159 2160 return TTI.enableMaskedInterleavedAccessVectorization(); 2161 } 2162 2163 // Try to vectorize the interleave group that \p Instr belongs to. 2164 // 2165 // E.g. Translate following interleaved load group (factor = 3): 2166 // for (i = 0; i < N; i+=3) { 2167 // R = Pic[i]; // Member of index 0 2168 // G = Pic[i+1]; // Member of index 1 2169 // B = Pic[i+2]; // Member of index 2 2170 // ... // do something to R, G, B 2171 // } 2172 // To: 2173 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2174 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2175 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2176 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2177 // 2178 // Or translate following interleaved store group (factor = 3): 2179 // for (i = 0; i < N; i+=3) { 2180 // ... 
do something to R, G, B 2181 // Pic[i] = R; // Member of index 0 2182 // Pic[i+1] = G; // Member of index 1 2183 // Pic[i+2] = B; // Member of index 2 2184 // } 2185 // To: 2186 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2187 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2188 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2189 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2190 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2191 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2192 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2193 VPValue *Addr, VPValue *BlockInMask) { 2194 Instruction *Instr = Group->getInsertPos(); 2195 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2196 2197 // Prepare for the vector type of the interleaved load/store. 2198 Type *ScalarTy = getMemInstValueType(Instr); 2199 unsigned InterleaveFactor = Group->getFactor(); 2200 auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); 2201 2202 // Prepare for the new pointers. 2203 SmallVector<Value *, 2> AddrParts; 2204 unsigned Index = Group->getIndex(Instr); 2205 2206 // TODO: extend the masked interleaved-group support to reversed access. 2207 assert((!BlockInMask || !Group->isReverse()) && 2208 "Reversed masked interleave-group not supported."); 2209 2210 // If the group is reverse, adjust the index to refer to the last vector lane 2211 // instead of the first. We adjust the index from the first vector lane, 2212 // rather than directly getting the pointer for lane VF - 1, because the 2213 // pointer operand of the interleaved access is supposed to be uniform. For 2214 // uniform instructions, we're only required to generate a value for the 2215 // first vector lane in each unroll iteration. 2216 if (Group->isReverse()) 2217 Index += (VF - 1) * Group->getFactor(); 2218 2219 for (unsigned Part = 0; Part < UF; Part++) { 2220 Value *AddrPart = State.get(Addr, {Part, 0}); 2221 setDebugLocFromInst(Builder, AddrPart); 2222 2223 // Notice current instruction could be any index. Need to adjust the address 2224 // to the member of index 0. 2225 // 2226 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2227 // b = A[i]; // Member of index 0 2228 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2229 // 2230 // E.g. A[i+1] = a; // Member of index 1 2231 // A[i] = b; // Member of index 0 2232 // A[i+2] = c; // Member of index 2 (Current instruction) 2233 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2234 2235 bool InBounds = false; 2236 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2237 InBounds = gep->isInBounds(); 2238 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2239 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2240 2241 // Cast to the vector pointer type. 2242 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2243 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2244 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2245 } 2246 2247 setDebugLocFromInst(Builder, Instr); 2248 Value *UndefVec = UndefValue::get(VecTy); 2249 2250 Value *MaskForGaps = nullptr; 2251 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2252 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2253 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2254 } 2255 2256 // Vectorize the interleaved load group. 
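  // Illustrative example (hypothetical group): for a load group with factor 3,
  // VF = 4 and a gap at member 2, the wide load below reads <12 x i32> and
  // MaskForGaps is
  //   <1, 1, 0,  1, 1, 0,  1, 1, 0,  1, 1, 0>
  // so lanes that would touch the missing member are never accessed.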
2257 if (isa<LoadInst>(Instr)) { 2258 // For each unroll part, create a wide load for the group. 2259 SmallVector<Value *, 2> NewLoads; 2260 for (unsigned Part = 0; Part < UF; Part++) { 2261 Instruction *NewLoad; 2262 if (BlockInMask || MaskForGaps) { 2263 assert(useMaskedInterleavedAccesses(*TTI) && 2264 "masked interleaved groups are not allowed."); 2265 Value *GroupMask = MaskForGaps; 2266 if (BlockInMask) { 2267 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2268 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2269 Value *ShuffledMask = Builder.CreateShuffleVector( 2270 BlockInMaskPart, Undefs, 2271 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2272 GroupMask = MaskForGaps 2273 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2274 MaskForGaps) 2275 : ShuffledMask; 2276 } 2277 NewLoad = 2278 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2279 GroupMask, UndefVec, "wide.masked.vec"); 2280 } 2281 else 2282 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2283 Group->getAlign(), "wide.vec"); 2284 Group->addMetadata(NewLoad); 2285 NewLoads.push_back(NewLoad); 2286 } 2287 2288 // For each member in the group, shuffle out the appropriate data from the 2289 // wide loads. 2290 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2291 Instruction *Member = Group->getMember(I); 2292 2293 // Skip the gaps in the group. 2294 if (!Member) 2295 continue; 2296 2297 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2298 for (unsigned Part = 0; Part < UF; Part++) { 2299 Value *StridedVec = Builder.CreateShuffleVector( 2300 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2301 2302 // If this member has different type, cast the result type. 2303 if (Member->getType() != ScalarTy) { 2304 VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); 2305 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2306 } 2307 2308 if (Group->isReverse()) 2309 StridedVec = reverseVector(StridedVec); 2310 2311 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2312 } 2313 } 2314 return; 2315 } 2316 2317 // The sub vector type for current instruction. 2318 auto *SubVT = FixedVectorType::get(ScalarTy, VF); 2319 2320 // Vectorize the interleaved store group. 2321 for (unsigned Part = 0; Part < UF; Part++) { 2322 // Collect the stored vector from each member. 2323 SmallVector<Value *, 4> StoredVecs; 2324 for (unsigned i = 0; i < InterleaveFactor; i++) { 2325 // Interleaved store group doesn't allow a gap, so each index has a member 2326 Instruction *Member = Group->getMember(i); 2327 assert(Member && "Fail to get a member from an interleaved store group"); 2328 2329 Value *StoredVec = getOrCreateVectorValue( 2330 cast<StoreInst>(Member)->getValueOperand(), Part); 2331 if (Group->isReverse()) 2332 StoredVec = reverseVector(StoredVec); 2333 2334 // If this member has different type, cast it to a unified type. 2335 2336 if (StoredVec->getType() != SubVT) 2337 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2338 2339 StoredVecs.push_back(StoredVec); 2340 } 2341 2342 // Concatenate all vectors into a wide vector. 2343 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2344 2345 // Interleave the elements in the wide vector. 
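    // Worked example with illustrative values: for VF = 4 and factor 2, the
    // concatenated vector <a0, a1, a2, a3, b0, b1, b2, b3> is shuffled with
    // the interleave mask <0, 4, 1, 5, 2, 6, 3, 7>, producing
    //   <a0, b0, a1, b1, a2, b2, a3, b3>
    // which is the memory layout the scalar loop would have written.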
2346 Value *IVec = Builder.CreateShuffleVector( 2347 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2348 "interleaved.vec"); 2349 2350 Instruction *NewStoreInstr; 2351 if (BlockInMask) { 2352 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2353 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2354 Value *ShuffledMask = Builder.CreateShuffleVector( 2355 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2356 "interleaved.mask"); 2357 NewStoreInstr = Builder.CreateMaskedStore( 2358 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2359 } 2360 else 2361 NewStoreInstr = 2362 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2363 2364 Group->addMetadata(NewStoreInstr); 2365 } 2366 } 2367 2368 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2369 VPTransformState &State, 2370 VPValue *Addr, 2371 VPValue *StoredValue, 2372 VPValue *BlockInMask) { 2373 // Attempt to issue a wide load. 2374 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2375 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2376 2377 assert((LI || SI) && "Invalid Load/Store instruction"); 2378 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2379 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2380 2381 LoopVectorizationCostModel::InstWidening Decision = 2382 Cost->getWideningDecision(Instr, VF); 2383 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2384 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2385 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2386 "CM decision is not to widen the memory instruction"); 2387 2388 Type *ScalarDataTy = getMemInstValueType(Instr); 2389 auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); 2390 const Align Alignment = getLoadStoreAlignment(Instr); 2391 2392 // Determine if the pointer operand of the access is either consecutive or 2393 // reverse consecutive. 2394 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2395 bool ConsecutiveStride = 2396 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2397 bool CreateGatherScatter = 2398 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2399 2400 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2401 // gather/scatter. Otherwise Decision should have been to Scalarize. 2402 assert((ConsecutiveStride || CreateGatherScatter) && 2403 "The instruction should be scalarized"); 2404 (void)ConsecutiveStride; 2405 2406 VectorParts BlockInMaskParts(UF); 2407 bool isMaskRequired = BlockInMask; 2408 if (isMaskRequired) 2409 for (unsigned Part = 0; Part < UF; ++Part) 2410 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2411 2412 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2413 // Calculate the pointer for the specific unroll-part. 2414 GetElementPtrInst *PartPtr = nullptr; 2415 2416 bool InBounds = false; 2417 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2418 InBounds = gep->isInBounds(); 2419 2420 if (Reverse) { 2421 // If the address is consecutive but reversed, then the 2422 // wide store needs to start at the last vector element. 
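      // Worked example with illustrative values: for VF = 4 and Part = 1 the
      // two GEPs below offset Ptr by -4 and then by -3, i.e. to element -7.
      // The wide access then covers elements -7..-4, and after the reverse
      // shuffle lane 0 sees element -4 and lane 3 sees element -7, matching
      // scalar iterations 4..7 of the stride -1 access.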
2423 PartPtr = cast<GetElementPtrInst>( 2424 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2425 PartPtr->setIsInBounds(InBounds); 2426 PartPtr = cast<GetElementPtrInst>( 2427 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2428 PartPtr->setIsInBounds(InBounds); 2429 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2430 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2431 } else { 2432 PartPtr = cast<GetElementPtrInst>( 2433 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2434 PartPtr->setIsInBounds(InBounds); 2435 } 2436 2437 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2438 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2439 }; 2440 2441 // Handle Stores: 2442 if (SI) { 2443 setDebugLocFromInst(Builder, SI); 2444 2445 for (unsigned Part = 0; Part < UF; ++Part) { 2446 Instruction *NewSI = nullptr; 2447 Value *StoredVal = State.get(StoredValue, Part); 2448 if (CreateGatherScatter) { 2449 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2450 Value *VectorGep = State.get(Addr, Part); 2451 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2452 MaskPart); 2453 } else { 2454 if (Reverse) { 2455 // If we store to reverse consecutive memory locations, then we need 2456 // to reverse the order of elements in the stored value. 2457 StoredVal = reverseVector(StoredVal); 2458 // We don't want to update the value in the map as it might be used in 2459 // another expression. So don't call resetVectorValue(StoredVal). 2460 } 2461 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2462 if (isMaskRequired) 2463 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2464 BlockInMaskParts[Part]); 2465 else 2466 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2467 } 2468 addMetadata(NewSI, SI); 2469 } 2470 return; 2471 } 2472 2473 // Handle loads. 2474 assert(LI && "Must have a load instruction"); 2475 setDebugLocFromInst(Builder, LI); 2476 for (unsigned Part = 0; Part < UF; ++Part) { 2477 Value *NewLI; 2478 if (CreateGatherScatter) { 2479 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2480 Value *VectorGep = State.get(Addr, Part); 2481 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2482 nullptr, "wide.masked.gather"); 2483 addMetadata(NewLI, LI); 2484 } else { 2485 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2486 if (isMaskRequired) 2487 NewLI = Builder.CreateMaskedLoad( 2488 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2489 "wide.masked.load"); 2490 else 2491 NewLI = 2492 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2493 2494 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2495 addMetadata(NewLI, LI); 2496 if (Reverse) 2497 NewLI = reverseVector(NewLI); 2498 } 2499 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2500 } 2501 } 2502 2503 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2504 const VPIteration &Instance, 2505 bool IfPredicateInstr, 2506 VPTransformState &State) { 2507 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2508 2509 setDebugLocFromInst(Builder, Instr); 2510 2511 // Does this instruction return a value ? 
2512 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2513 2514 Instruction *Cloned = Instr->clone(); 2515 if (!IsVoidRetTy) 2516 Cloned->setName(Instr->getName() + ".cloned"); 2517 2518 // Replace the operands of the cloned instructions with their scalar 2519 // equivalents in the new loop. 2520 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2521 auto *NewOp = State.get(User.getOperand(op), Instance); 2522 Cloned->setOperand(op, NewOp); 2523 } 2524 addNewMetadata(Cloned, Instr); 2525 2526 // Place the cloned scalar in the new loop. 2527 Builder.Insert(Cloned); 2528 2529 // Add the cloned scalar to the scalar map entry. 2530 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2531 2532 // If we just cloned a new assumption, add it the assumption cache. 2533 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2534 if (II->getIntrinsicID() == Intrinsic::assume) 2535 AC->registerAssumption(II); 2536 2537 // End if-block. 2538 if (IfPredicateInstr) 2539 PredicatedInstructions.push_back(Cloned); 2540 } 2541 2542 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2543 Value *End, Value *Step, 2544 Instruction *DL) { 2545 BasicBlock *Header = L->getHeader(); 2546 BasicBlock *Latch = L->getLoopLatch(); 2547 // As we're just creating this loop, it's possible no latch exists 2548 // yet. If so, use the header as this will be a single block loop. 2549 if (!Latch) 2550 Latch = Header; 2551 2552 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2553 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2554 setDebugLocFromInst(Builder, OldInst); 2555 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2556 2557 Builder.SetInsertPoint(Latch->getTerminator()); 2558 setDebugLocFromInst(Builder, OldInst); 2559 2560 // Create i+1 and fill the PHINode. 2561 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2562 Induction->addIncoming(Start, L->getLoopPreheader()); 2563 Induction->addIncoming(Next, Latch); 2564 // Create the compare. 2565 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2566 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2567 2568 // Now we have two terminators. Remove the old one from the block. 2569 Latch->getTerminator()->eraseFromParent(); 2570 2571 return Induction; 2572 } 2573 2574 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2575 if (TripCount) 2576 return TripCount; 2577 2578 assert(L && "Create Trip Count for null loop."); 2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2580 // Find the loop boundaries. 2581 ScalarEvolution *SE = PSE.getSE(); 2582 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2583 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2584 "Invalid loop count"); 2585 2586 Type *IdxTy = Legal->getWidestInductionType(); 2587 assert(IdxTy && "No type for induction"); 2588 2589 // The exit count might have the type of i64 while the phi is i32. This can 2590 // happen if we have an induction variable that is sign extended before the 2591 // compare. The only way that we get a backedge taken count is that the 2592 // induction variable was signed and as such will not overflow. In such a case 2593 // truncation is legal. 
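  // Sketch with assumed types and values: if the backedge-taken count is the
  // i64 value 99 while the widest induction type IdxTy is i32, it is
  // truncated to i32 below and the trip count becomes 99 + 1 = 100.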
2594 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2595 IdxTy->getPrimitiveSizeInBits()) 2596 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2597 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2598 2599 // Get the total trip count from the count by adding 1. 2600 const SCEV *ExitCount = SE->getAddExpr( 2601 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2602 2603 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2604 2605 // Expand the trip count and place the new instructions in the preheader. 2606 // Notice that the pre-header does not change, only the loop body. 2607 SCEVExpander Exp(*SE, DL, "induction"); 2608 2609 // Count holds the overall loop count (N). 2610 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2611 L->getLoopPreheader()->getTerminator()); 2612 2613 if (TripCount->getType()->isPointerTy()) 2614 TripCount = 2615 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2616 L->getLoopPreheader()->getTerminator()); 2617 2618 return TripCount; 2619 } 2620 2621 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2622 if (VectorTripCount) 2623 return VectorTripCount; 2624 2625 Value *TC = getOrCreateTripCount(L); 2626 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2627 2628 Type *Ty = TC->getType(); 2629 Constant *Step = ConstantInt::get(Ty, VF * UF); 2630 2631 // If the tail is to be folded by masking, round the number of iterations N 2632 // up to a multiple of Step instead of rounding down. This is done by first 2633 // adding Step-1 and then rounding down. Note that it's ok if this addition 2634 // overflows: the vector induction variable will eventually wrap to zero given 2635 // that it starts at zero and its Step is a power of two; the loop will then 2636 // exit, with the last early-exit vector comparison also producing all-true. 2637 if (Cost->foldTailByMasking()) { 2638 assert(isPowerOf2_32(VF * UF) && 2639 "VF*UF must be a power of 2 when folding tail by masking"); 2640 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2641 } 2642 2643 // Now we need to generate the expression for the part of the loop that the 2644 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2645 // iterations are not required for correctness, or N - Step, otherwise. Step 2646 // is equal to the vectorization factor (number of SIMD elements) times the 2647 // unroll factor (number of SIMD instructions). 2648 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2649 2650 // If there is a non-reversed interleaved group that may speculatively access 2651 // memory out-of-bounds, we need to ensure that there will be at least one 2652 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2653 // the trip count, we set the remainder to be equal to the step. If the step 2654 // does not evenly divide the trip count, no adjustment is necessary since 2655 // there will already be scalar iterations. Note that the minimum iterations 2656 // check ensures that N >= Step. 
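  // Worked example with illustrative numbers: for VF * UF = 8 and N = 19,
  // n.mod.vf = 3 and the vector trip count is 19 - 3 = 16. If a scalar
  // epilogue is required and N = 16, the zero remainder is bumped to 8 below,
  // so the vector loop covers 8 iterations and the epilogue the remaining 8.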
2657 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2658 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2659 R = Builder.CreateSelect(IsZero, Step, R); 2660 } 2661 2662 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2663 2664 return VectorTripCount; 2665 } 2666 2667 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2668 const DataLayout &DL) { 2669 // Verify that V is a vector type with same number of elements as DstVTy. 2670 unsigned VF = DstVTy->getNumElements(); 2671 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2672 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2673 Type *SrcElemTy = SrcVecTy->getElementType(); 2674 Type *DstElemTy = DstVTy->getElementType(); 2675 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2676 "Vector elements must have same size"); 2677 2678 // Do a direct cast if element types are castable. 2679 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2680 return Builder.CreateBitOrPointerCast(V, DstVTy); 2681 } 2682 // V cannot be directly casted to desired vector type. 2683 // May happen when V is a floating point vector but DstVTy is a vector of 2684 // pointers or vice-versa. Handle this using a two-step bitcast using an 2685 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2686 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2687 "Only one type should be a pointer type"); 2688 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2689 "Only one type should be a floating point type"); 2690 Type *IntTy = 2691 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2692 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2693 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2694 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2695 } 2696 2697 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2698 BasicBlock *Bypass) { 2699 Value *Count = getOrCreateTripCount(L); 2700 // Reuse existing vector loop preheader for TC checks. 2701 // Note that new preheader block is generated for vector loop. 2702 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2703 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2704 2705 // Generate code to check if the loop's trip count is less than VF * UF, or 2706 // equal to it in case a scalar epilogue is required; this implies that the 2707 // vector trip count is zero. This check also covers the case where adding one 2708 // to the backedge-taken count overflowed leading to an incorrect trip count 2709 // of zero. In this case we will also jump to the scalar loop. 2710 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2711 : ICmpInst::ICMP_ULT; 2712 2713 // If tail is to be folded, vector loop takes care of all iterations. 2714 Value *CheckMinIters = Builder.getFalse(); 2715 if (!Cost->foldTailByMasking()) 2716 CheckMinIters = Builder.CreateICmp( 2717 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2718 "min.iters.check"); 2719 2720 // Create new preheader for vector loop. 2721 LoopVectorPreHeader = 2722 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2723 "vector.ph"); 2724 2725 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2726 DT->getNode(Bypass)->getIDom()) && 2727 "TC check is expected to dominate Bypass"); 2728 2729 // Update dominator for Bypass & LoopExit. 
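  // CFG sketch with illustrative block names: after the terminator created
  // below, the old preheader ends in
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // so both the bypass destination and the loop exit become immediately
  // dominated by this trip-count check block.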
2730 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2731 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2732 2733 ReplaceInstWithInst( 2734 TCCheckBlock->getTerminator(), 2735 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2736 LoopBypassBlocks.push_back(TCCheckBlock); 2737 } 2738 2739 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2740 // Reuse existing vector loop preheader for SCEV checks. 2741 // Note that new preheader block is generated for vector loop. 2742 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2743 2744 // Generate the code to check that the SCEV assumptions that we made. 2745 // We want the new basic block to start at the first instruction in a 2746 // sequence of instructions that form a check. 2747 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2748 "scev.check"); 2749 Value *SCEVCheck = Exp.expandCodeForPredicate( 2750 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2751 2752 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2753 if (C->isZero()) 2754 return; 2755 2756 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2757 "Cannot SCEV check stride or overflow when optimizing for size"); 2758 2759 SCEVCheckBlock->setName("vector.scevcheck"); 2760 // Create new preheader for vector loop. 2761 LoopVectorPreHeader = 2762 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2763 nullptr, "vector.ph"); 2764 2765 // Update dominator only if this is first RT check. 2766 if (LoopBypassBlocks.empty()) { 2767 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2768 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2769 } 2770 2771 ReplaceInstWithInst( 2772 SCEVCheckBlock->getTerminator(), 2773 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2774 LoopBypassBlocks.push_back(SCEVCheckBlock); 2775 AddedSafetyChecks = true; 2776 } 2777 2778 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2779 // VPlan-native path does not do any analysis for runtime checks currently. 2780 if (EnableVPlanNativePath) 2781 return; 2782 2783 // Reuse existing vector loop preheader for runtime memory checks. 2784 // Note that new preheader block is generated for vector loop. 2785 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2786 2787 // Generate the code that checks in runtime if arrays overlap. We put the 2788 // checks into a separate block to make the more common case of few elements 2789 // faster. 
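  // Conceptually (a simplified sketch, not the exact IR produced by
  // addRuntimeChecks), for two accessed ranges [AStart, AEnd) and
  // [BStart, BEnd) the emitted predicate has the form
  //   conflict = (AStart < BEnd) & (BStart < AEnd)
  // and the results for all pairs are OR'd together; if any pair may
  // conflict we branch to the scalar loop (Bypass).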
2790 auto *LAI = Legal->getLAI(); 2791 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2792 if (!RtPtrChecking.Need) 2793 return; 2794 Instruction *FirstCheckInst; 2795 Instruction *MemRuntimeCheck; 2796 std::tie(FirstCheckInst, MemRuntimeCheck) = 2797 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2798 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2799 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2800 "claimed checks are required"); 2801 2802 if (MemCheckBlock->getParent()->hasOptSize()) { 2803 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2804 "Cannot emit memory checks when optimizing for size, unless forced " 2805 "to vectorize."); 2806 ORE->emit([&]() { 2807 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2808 L->getStartLoc(), L->getHeader()) 2809 << "Code-size may be reduced by not forcing " 2810 "vectorization, or by source-code modifications " 2811 "eliminating the need for runtime checks " 2812 "(e.g., adding 'restrict')."; 2813 }); 2814 } 2815 2816 MemCheckBlock->setName("vector.memcheck"); 2817 // Create new preheader for vector loop. 2818 LoopVectorPreHeader = 2819 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2820 "vector.ph"); 2821 2822 // Update dominator only if this is first RT check. 2823 if (LoopBypassBlocks.empty()) { 2824 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2825 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2826 } 2827 2828 ReplaceInstWithInst( 2829 MemCheckBlock->getTerminator(), 2830 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2831 LoopBypassBlocks.push_back(MemCheckBlock); 2832 AddedSafetyChecks = true; 2833 2834 // We currently don't use LoopVersioning for the actual loop cloning but we 2835 // still use it to add the noalias metadata. 2836 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2837 PSE.getSE()); 2838 LVer->prepareNoAliasMetadata(); 2839 } 2840 2841 Value *InnerLoopVectorizer::emitTransformedIndex( 2842 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2843 const InductionDescriptor &ID) const { 2844 2845 SCEVExpander Exp(*SE, DL, "induction"); 2846 auto Step = ID.getStep(); 2847 auto StartValue = ID.getStartValue(); 2848 assert(Index->getType() == Step->getType() && 2849 "Index type does not match StepValue type"); 2850 2851 // Note: the IR at this point is broken. We cannot use SE to create any new 2852 // SCEV and then expand it, hoping that SCEV's simplification will give us 2853 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2854 // lead to various SCEV crashes. So all we can do is to use builder and rely 2855 // on InstCombine for future simplifications. Here we handle some trivial 2856 // cases only. 
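  // In shorthand, the transformed index built below is, depending on the
  // induction kind:
  //   integer: StartValue + Index * Step
  //   pointer: getelementptr StartValue, Index * Step
  //   float:   StartValue fadd/fsub Index * Step
  // The helper lambdas only fold the trivial x+0, 0+x, x*1 and 1*x cases;
  // anything further is left to InstCombine.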
2857 auto CreateAdd = [&B](Value *X, Value *Y) { 2858 assert(X->getType() == Y->getType() && "Types don't match!"); 2859 if (auto *CX = dyn_cast<ConstantInt>(X)) 2860 if (CX->isZero()) 2861 return Y; 2862 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2863 if (CY->isZero()) 2864 return X; 2865 return B.CreateAdd(X, Y); 2866 }; 2867 2868 auto CreateMul = [&B](Value *X, Value *Y) { 2869 assert(X->getType() == Y->getType() && "Types don't match!"); 2870 if (auto *CX = dyn_cast<ConstantInt>(X)) 2871 if (CX->isOne()) 2872 return Y; 2873 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2874 if (CY->isOne()) 2875 return X; 2876 return B.CreateMul(X, Y); 2877 }; 2878 2879 switch (ID.getKind()) { 2880 case InductionDescriptor::IK_IntInduction: { 2881 assert(Index->getType() == StartValue->getType() && 2882 "Index type does not match StartValue type"); 2883 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2884 return B.CreateSub(StartValue, Index); 2885 auto *Offset = CreateMul( 2886 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2887 return CreateAdd(StartValue, Offset); 2888 } 2889 case InductionDescriptor::IK_PtrInduction: { 2890 assert(isa<SCEVConstant>(Step) && 2891 "Expected constant step for pointer induction"); 2892 return B.CreateGEP( 2893 StartValue->getType()->getPointerElementType(), StartValue, 2894 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2895 &*B.GetInsertPoint()))); 2896 } 2897 case InductionDescriptor::IK_FpInduction: { 2898 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2899 auto InductionBinOp = ID.getInductionBinOp(); 2900 assert(InductionBinOp && 2901 (InductionBinOp->getOpcode() == Instruction::FAdd || 2902 InductionBinOp->getOpcode() == Instruction::FSub) && 2903 "Original bin op should be defined for FP induction"); 2904 2905 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2906 2907 // Floating point operations had to be 'fast' to enable the induction. 2908 FastMathFlags Flags; 2909 Flags.setFast(); 2910 2911 Value *MulExp = B.CreateFMul(StepValue, Index); 2912 if (isa<Instruction>(MulExp)) 2913 // We have to check, the MulExp may be a constant. 2914 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2915 2916 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2917 "induction"); 2918 if (isa<Instruction>(BOp)) 2919 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2920 2921 return BOp; 2922 } 2923 case InductionDescriptor::IK_NoInduction: 2924 return nullptr; 2925 } 2926 llvm_unreachable("invalid enum"); 2927 } 2928 2929 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2930 /* 2931 In this function we generate a new loop. The new loop will contain 2932 the vectorized instructions while the old loop will continue to run the 2933 scalar remainder. 2934 2935 [ ] <-- loop iteration number check. 2936 / | 2937 / v 2938 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2939 | / | 2940 | / v 2941 || [ ] <-- vector pre header. 2942 |/ | 2943 | v 2944 | [ ] \ 2945 | [ ]_| <-- vector loop. 2946 | | 2947 | v 2948 | -[ ] <--- middle-block. 2949 | / | 2950 | / v 2951 -|- >[ ] <--- new preheader. 2952 | | 2953 | v 2954 | [ ] \ 2955 | [ ]_| <-- old scalar loop to handle remainder. 2956 \ | 2957 \ v 2958 >[ ] <-- exit block. 2959 ... 2960 */ 2961 2962 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2963 2964 // Some loops have a single integer induction variable, while other loops 2965 // don't. 
One example is C++ iterators that often have multiple pointer
2966   // induction variables. In the code below we also support a case where we
2967   // don't have a single induction variable.
2968   //
2969   // We try hard to obtain an induction variable from the original loop.
2970   // However, if we don't find one that:
2971   //   - is an integer
2972   //   - counts from zero, stepping by one
2973   //   - is the size of the widest induction variable type
2974   // then we create a new one.
2975   OldInduction = Legal->getPrimaryInduction();
2976   Type *IdxTy = Legal->getWidestInductionType();
2977
2978   // Split the single block loop into the two loop structure described above.
2979   LoopScalarBody = OrigLoop->getHeader();
2980   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2981   LoopExitBlock = OrigLoop->getExitBlock();
2982   assert(LoopExitBlock && "Must have an exit block");
2983   assert(LoopVectorPreHeader && "Invalid loop structure");
2984
2985   LoopMiddleBlock =
2986       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2987                  LI, nullptr, "middle.block");
2988   LoopScalarPreHeader =
2989       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2990                  nullptr, "scalar.ph");
2991   // We intentionally don't let SplitBlock update LoopInfo, since
2992   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
2993   // LoopVectorBody is explicitly added to the correct place a few lines later.
2994   LoopVectorBody =
2995       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2996                  nullptr, nullptr, "vector.body");
2997
2998   // Update dominator for loop exit.
2999   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3000
3001   // Create and register the new vector loop.
3002   Loop *Lp = LI->AllocateLoop();
3003   Loop *ParentLoop = OrigLoop->getParentLoop();
3004
3005   // Insert the new loop into the loop nest and register the new basic blocks
3006   // before calling any utilities such as SCEV that require valid LoopInfo.
3007   if (ParentLoop) {
3008     ParentLoop->addChildLoop(Lp);
3009   } else {
3010     LI->addTopLevelLoop(Lp);
3011   }
3012   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3013
3014   // Find the loop boundaries.
3015   Value *Count = getOrCreateTripCount(Lp);
3016
3017   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3018
3019   // Now, compare the new count to zero. If it is zero, skip the vector loop
3020   // and jump to the scalar loop. This check also covers the case where the
3021   // backedge-taken count is uint##_max: adding one to it will overflow leading
3022   // to an incorrect trip count of zero. In this (rare) case we will also jump
3023   // to the scalar loop.
3024   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3025
3026   // Generate the code to check any assumptions that we've made for SCEV
3027   // expressions.
3028   emitSCEVChecks(Lp, LoopScalarPreHeader);
3029
3030   // Generate the code that checks at runtime whether the arrays overlap. We
3031   // put the checks into a separate block to make the more common case of few
3032   // elements faster.
3033   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3034
3035   // Generate the induction variable.
3036   // The loop step is equal to the vectorization factor (num of SIMD elements)
3037   // times the unroll factor (num of SIMD instructions).
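  // For example (hypothetical factors), with VF = 4 and UF = 2 the induction
  // variable created below counts 0, 8, 16, ... up to the vector trip count,
  // i.e. it advances by VF * UF scalar iterations per vector-loop iteration.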
3038 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3039 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3040 Induction = 3041 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3042 getDebugLocFromInstOrOperands(OldInduction)); 3043 3044 // We are going to resume the execution of the scalar loop. 3045 // Go over all of the induction variables that we found and fix the 3046 // PHIs that are left in the scalar version of the loop. 3047 // The starting values of PHI nodes depend on the counter of the last 3048 // iteration in the vectorized loop. 3049 // If we come from a bypass edge then we need to start from the original 3050 // start value. 3051 3052 // This variable saves the new starting index for the scalar loop. It is used 3053 // to test if there are any tail iterations left once the vector loop has 3054 // completed. 3055 for (auto &InductionEntry : Legal->getInductionVars()) { 3056 PHINode *OrigPhi = InductionEntry.first; 3057 InductionDescriptor II = InductionEntry.second; 3058 3059 // Create phi nodes to merge from the backedge-taken check block. 3060 PHINode *BCResumeVal = 3061 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3062 LoopScalarPreHeader->getTerminator()); 3063 // Copy original phi DL over to the new one. 3064 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3065 Value *&EndValue = IVEndValues[OrigPhi]; 3066 if (OrigPhi == OldInduction) { 3067 // We know what the end value is. 3068 EndValue = CountRoundDown; 3069 } else { 3070 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3071 Type *StepType = II.getStep()->getType(); 3072 Instruction::CastOps CastOp = 3073 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3074 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3075 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3076 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3077 EndValue->setName("ind.end"); 3078 } 3079 3080 // The new PHI merges the original incoming value, in case of a bypass, 3081 // or the value at the end of the vectorized loop. 3082 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3083 3084 // Fix the scalar body counter (PHI node). 3085 // The old induction's phi node in the scalar body needs the truncated 3086 // value. 3087 for (BasicBlock *BB : LoopBypassBlocks) 3088 BCResumeVal->addIncoming(II.getStartValue(), BB); 3089 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3090 } 3091 3092 // We need the OrigLoop (scalar loop part) latch terminator to help 3093 // produce correct debug info for the middle block BB instructions. 3094 // The legality check stage guarantees that the loop will have a single 3095 // latch. 3096 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3097 "Scalar loop latch terminator isn't a branch"); 3098 BranchInst *ScalarLatchBr = 3099 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3100 3101 // Add a check in the middle block to see if we have completed 3102 // all of the iterations in the first vector loop. 3103 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3104 // If tail is to be folded, we know we don't need to run the remainder. 
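  // A small worked example (hypothetical values): with N = 10 and VF * UF = 8,
  // CountRoundDown is 8, so CmpN is (10 == 8), i.e. false, and the middle
  // block branches to the scalar preheader to run the remaining 2 iterations;
  // with N = 16, CmpN is true and we branch directly to the exit block.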
3105 Value *CmpN = Builder.getTrue(); 3106 if (!Cost->foldTailByMasking()) { 3107 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3108 CountRoundDown, "cmp.n", 3109 LoopMiddleBlock->getTerminator()); 3110 3111 // Here we use the same DebugLoc as the scalar loop latch branch instead 3112 // of the corresponding compare because they may have ended up with 3113 // different line numbers and we want to avoid awkward line stepping while 3114 // debugging. Eg. if the compare has got a line number inside the loop. 3115 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3116 } 3117 3118 BranchInst *BrInst = 3119 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3120 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3121 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3122 3123 // Get ready to start creating new instructions into the vectorized body. 3124 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3125 "Inconsistent vector loop preheader"); 3126 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3127 3128 Optional<MDNode *> VectorizedLoopID = 3129 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3130 LLVMLoopVectorizeFollowupVectorized}); 3131 if (VectorizedLoopID.hasValue()) { 3132 Lp->setLoopID(VectorizedLoopID.getValue()); 3133 3134 // Do not setAlreadyVectorized if loop attributes have been defined 3135 // explicitly. 3136 return LoopVectorPreHeader; 3137 } 3138 3139 // Keep all loop hints from the original loop on the vector loop (we'll 3140 // replace the vectorizer-specific hints below). 3141 if (MDNode *LID = OrigLoop->getLoopID()) 3142 Lp->setLoopID(LID); 3143 3144 LoopVectorizeHints Hints(Lp, true, *ORE); 3145 Hints.setAlreadyVectorized(); 3146 3147 #ifdef EXPENSIVE_CHECKS 3148 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3149 LI->verify(*DT); 3150 #endif 3151 3152 return LoopVectorPreHeader; 3153 } 3154 3155 // Fix up external users of the induction variable. At this point, we are 3156 // in LCSSA form, with all external PHIs that use the IV having one input value, 3157 // coming from the remainder loop. We need those PHIs to also have a correct 3158 // value for the IV when arriving directly from the middle block. 3159 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3160 const InductionDescriptor &II, 3161 Value *CountRoundDown, Value *EndValue, 3162 BasicBlock *MiddleBlock) { 3163 // There are two kinds of external IV usages - those that use the value 3164 // computed in the last iteration (the PHI) and those that use the penultimate 3165 // value (the value that feeds into the phi from the loop latch). 3166 // We allow both, but they, obviously, have different values. 3167 3168 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3169 3170 DenseMap<Value *, Value *> MissingVals; 3171 3172 // An external user of the last iteration's value should see the value that 3173 // the remainder loop uses to initialize its own IV. 3174 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3175 for (User *U : PostInc->users()) { 3176 Instruction *UI = cast<Instruction>(U); 3177 if (!OrigLoop->contains(UI)) { 3178 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3179 MissingVals[UI] = EndValue; 3180 } 3181 } 3182 3183 // An external user of the penultimate value need to see EndValue - Step. 3184 // The simplest way to get this is to recompute it from the constituent SCEVs, 3185 // that is Start + (Step * (CRD - 1)). 
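  // For instance (an illustrative case): for a canonical IV with Start = 0,
  // Step = 1 and a vector trip count CRD = 16, the escaping value
  // materialized below is 0 + 1 * (16 - 1) = 15.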
3186 for (User *U : OrigPhi->users()) { 3187 auto *UI = cast<Instruction>(U); 3188 if (!OrigLoop->contains(UI)) { 3189 const DataLayout &DL = 3190 OrigLoop->getHeader()->getModule()->getDataLayout(); 3191 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3192 3193 IRBuilder<> B(MiddleBlock->getTerminator()); 3194 Value *CountMinusOne = B.CreateSub( 3195 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3196 Value *CMO = 3197 !II.getStep()->getType()->isIntegerTy() 3198 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3199 II.getStep()->getType()) 3200 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3201 CMO->setName("cast.cmo"); 3202 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3203 Escape->setName("ind.escape"); 3204 MissingVals[UI] = Escape; 3205 } 3206 } 3207 3208 for (auto &I : MissingVals) { 3209 PHINode *PHI = cast<PHINode>(I.first); 3210 // One corner case we have to handle is two IVs "chasing" each-other, 3211 // that is %IV2 = phi [...], [ %IV1, %latch ] 3212 // In this case, if IV1 has an external use, we need to avoid adding both 3213 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3214 // don't already have an incoming value for the middle block. 3215 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3216 PHI->addIncoming(I.second, MiddleBlock); 3217 } 3218 } 3219 3220 namespace { 3221 3222 struct CSEDenseMapInfo { 3223 static bool canHandle(const Instruction *I) { 3224 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3225 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3226 } 3227 3228 static inline Instruction *getEmptyKey() { 3229 return DenseMapInfo<Instruction *>::getEmptyKey(); 3230 } 3231 3232 static inline Instruction *getTombstoneKey() { 3233 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3234 } 3235 3236 static unsigned getHashValue(const Instruction *I) { 3237 assert(canHandle(I) && "Unknown instruction!"); 3238 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3239 I->value_op_end())); 3240 } 3241 3242 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3243 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3244 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3245 return LHS == RHS; 3246 return LHS->isIdenticalTo(RHS); 3247 } 3248 }; 3249 3250 } // end anonymous namespace 3251 3252 ///Perform cse of induction variable instructions. 3253 static void cse(BasicBlock *BB) { 3254 // Perform simple cse. 3255 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3256 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3257 Instruction *In = &*I++; 3258 3259 if (!CSEDenseMapInfo::canHandle(In)) 3260 continue; 3261 3262 // Check if we can replace this instruction with any of the 3263 // visited instructions. 3264 if (Instruction *V = CSEMap.lookup(In)) { 3265 In->replaceAllUsesWith(V); 3266 In->eraseFromParent(); 3267 continue; 3268 } 3269 3270 CSEMap[In] = In; 3271 } 3272 } 3273 3274 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3275 unsigned VF, 3276 bool &NeedToScalarize) { 3277 Function *F = CI->getCalledFunction(); 3278 Type *ScalarRetTy = CI->getType(); 3279 SmallVector<Type *, 4> Tys, ScalarTys; 3280 for (auto &ArgOp : CI->arg_operands()) 3281 ScalarTys.push_back(ArgOp->getType()); 3282 3283 // Estimate cost of scalarized vector call. 
The source operands are assumed 3284 // to be vectors, so we need to extract individual elements from there, 3285 // execute VF scalar calls, and then gather the result into the vector return 3286 // value. 3287 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3288 TTI::TCK_RecipThroughput); 3289 if (VF == 1) 3290 return ScalarCallCost; 3291 3292 // Compute corresponding vector type for return value and arguments. 3293 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3294 for (Type *ScalarTy : ScalarTys) 3295 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3296 3297 // Compute costs of unpacking argument values for the scalar calls and 3298 // packing the return values to a vector. 3299 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3300 3301 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3302 3303 // If we can't emit a vector call for this function, then the currently found 3304 // cost is the cost we need to return. 3305 NeedToScalarize = true; 3306 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3307 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3308 3309 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3310 return Cost; 3311 3312 // If the corresponding vector cost is cheaper, return its cost. 3313 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3314 TTI::TCK_RecipThroughput); 3315 if (VectorCallCost < Cost) { 3316 NeedToScalarize = false; 3317 return VectorCallCost; 3318 } 3319 return Cost; 3320 } 3321 3322 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3323 unsigned VF) { 3324 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3325 assert(ID && "Expected intrinsic call!"); 3326 3327 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3328 return TTI.getIntrinsicInstrCost(CostAttrs, 3329 TargetTransformInfo::TCK_RecipThroughput); 3330 } 3331 3332 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3333 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3334 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3335 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3336 } 3337 3338 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3339 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3340 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3341 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3342 } 3343 3344 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3345 // For every instruction `I` in MinBWs, truncate the operands, create a 3346 // truncated version of `I` and reextend its result. InstCombine runs 3347 // later and will remove any ext/trunc pairs. 3348 SmallPtrSet<Value *, 4> Erased; 3349 for (const auto &KV : Cost->getMinimalBitwidths()) { 3350 // If the value wasn't vectorized, we must maintain the original scalar 3351 // type. The absence of the value from VectorLoopValueMap indicates that it 3352 // wasn't vectorized. 
3353 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3354 continue; 3355 for (unsigned Part = 0; Part < UF; ++Part) { 3356 Value *I = getOrCreateVectorValue(KV.first, Part); 3357 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3358 continue; 3359 Type *OriginalTy = I->getType(); 3360 Type *ScalarTruncatedTy = 3361 IntegerType::get(OriginalTy->getContext(), KV.second); 3362 auto *TruncatedTy = FixedVectorType::get( 3363 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3364 if (TruncatedTy == OriginalTy) 3365 continue; 3366 3367 IRBuilder<> B(cast<Instruction>(I)); 3368 auto ShrinkOperand = [&](Value *V) -> Value * { 3369 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3370 if (ZI->getSrcTy() == TruncatedTy) 3371 return ZI->getOperand(0); 3372 return B.CreateZExtOrTrunc(V, TruncatedTy); 3373 }; 3374 3375 // The actual instruction modification depends on the instruction type, 3376 // unfortunately. 3377 Value *NewI = nullptr; 3378 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3379 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3380 ShrinkOperand(BO->getOperand(1))); 3381 3382 // Any wrapping introduced by shrinking this operation shouldn't be 3383 // considered undefined behavior. So, we can't unconditionally copy 3384 // arithmetic wrapping flags to NewI. 3385 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3386 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3387 NewI = 3388 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3389 ShrinkOperand(CI->getOperand(1))); 3390 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3391 NewI = B.CreateSelect(SI->getCondition(), 3392 ShrinkOperand(SI->getTrueValue()), 3393 ShrinkOperand(SI->getFalseValue())); 3394 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3395 switch (CI->getOpcode()) { 3396 default: 3397 llvm_unreachable("Unhandled cast!"); 3398 case Instruction::Trunc: 3399 NewI = ShrinkOperand(CI->getOperand(0)); 3400 break; 3401 case Instruction::SExt: 3402 NewI = B.CreateSExtOrTrunc( 3403 CI->getOperand(0), 3404 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3405 break; 3406 case Instruction::ZExt: 3407 NewI = B.CreateZExtOrTrunc( 3408 CI->getOperand(0), 3409 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3410 break; 3411 } 3412 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3413 auto Elements0 = 3414 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3415 auto *O0 = B.CreateZExtOrTrunc( 3416 SI->getOperand(0), 3417 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3418 auto Elements1 = 3419 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3420 auto *O1 = B.CreateZExtOrTrunc( 3421 SI->getOperand(1), 3422 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3423 3424 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3425 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3426 // Don't do anything with the operands, just extend the result. 
3427 continue; 3428 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3429 auto Elements = 3430 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3431 auto *O0 = B.CreateZExtOrTrunc( 3432 IE->getOperand(0), 3433 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3434 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3435 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3436 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3437 auto Elements = 3438 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3439 auto *O0 = B.CreateZExtOrTrunc( 3440 EE->getOperand(0), 3441 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3442 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3443 } else { 3444 // If we don't know what to do, be conservative and don't do anything. 3445 continue; 3446 } 3447 3448 // Lastly, extend the result. 3449 NewI->takeName(cast<Instruction>(I)); 3450 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3451 I->replaceAllUsesWith(Res); 3452 cast<Instruction>(I)->eraseFromParent(); 3453 Erased.insert(I); 3454 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3455 } 3456 } 3457 3458 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3459 for (const auto &KV : Cost->getMinimalBitwidths()) { 3460 // If the value wasn't vectorized, we must maintain the original scalar 3461 // type. The absence of the value from VectorLoopValueMap indicates that it 3462 // wasn't vectorized. 3463 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3464 continue; 3465 for (unsigned Part = 0; Part < UF; ++Part) { 3466 Value *I = getOrCreateVectorValue(KV.first, Part); 3467 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3468 if (Inst && Inst->use_empty()) { 3469 Value *NewI = Inst->getOperand(0); 3470 Inst->eraseFromParent(); 3471 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3472 } 3473 } 3474 } 3475 } 3476 3477 void InnerLoopVectorizer::fixVectorizedLoop() { 3478 // Insert truncates and extends for any truncated instructions as hints to 3479 // InstCombine. 3480 if (VF > 1) 3481 truncateToMinimalBitwidths(); 3482 3483 // Fix widened non-induction PHIs by setting up the PHI operands. 3484 if (OrigPHIsToFix.size()) { 3485 assert(EnableVPlanNativePath && 3486 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3487 fixNonInductionPHIs(); 3488 } 3489 3490 // At this point every instruction in the original loop is widened to a 3491 // vector form. Now we need to fix the recurrences in the loop. These PHI 3492 // nodes are currently empty because we did not want to introduce cycles. 3493 // This is the second stage of vectorizing recurrences. 3494 fixCrossIterationPHIs(); 3495 3496 // Forget the original basic block. 3497 PSE.getSE()->forgetLoop(OrigLoop); 3498 3499 // Fix-up external users of the induction variables. 3500 for (auto &Entry : Legal->getInductionVars()) 3501 fixupIVUsers(Entry.first, Entry.second, 3502 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3503 IVEndValues[Entry.first], LoopMiddleBlock); 3504 3505 fixLCSSAPHIs(); 3506 for (Instruction *PI : PredicatedInstructions) 3507 sinkScalarOperands(&*PI); 3508 3509 // Remove redundant induction instructions. 3510 cse(LoopVectorBody); 3511 3512 // Set/update profile weights for the vector and remainder loops as original 3513 // loop iterations are now distributed among them. Note that original loop 3514 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3515   //
3516   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3517   // end up with a slightly less precise result, but that should be OK since
3518   // the profile is not inherently precise anyway. Note also that a possible
3519   // bypass of the vector code caused by legality checks is ignored, assigning
3520   // all the weight to the vector loop, optimistically.
3521   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3522                                LI->getLoopFor(LoopVectorBody),
3523                                LI->getLoopFor(LoopScalarBody), VF * UF);
3524 }
3525
3526 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3527   // In order to support recurrences we need to be able to vectorize Phi nodes.
3528   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3529   // stage #2: We now need to fix the recurrences by adding incoming edges to
3530   // the currently empty PHI nodes. At this point every instruction in the
3531   // original loop is widened to a vector form so we can use them to construct
3532   // the incoming edges.
3533   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3534     // Handle first-order recurrences and reductions that need to be fixed.
3535     if (Legal->isFirstOrderRecurrence(&Phi))
3536       fixFirstOrderRecurrence(&Phi);
3537     else if (Legal->isReductionVariable(&Phi))
3538       fixReduction(&Phi);
3539   }
3540 }
3541
3542 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3543   // This is the second phase of vectorizing first-order recurrences. An
3544   // overview of the transformation is described below. Suppose we have the
3545   // following loop.
3546   //
3547   //   for (int i = 0; i < n; ++i)
3548   //     b[i] = a[i] - a[i - 1];
3549   //
3550   // There is a first-order recurrence on "a". For this loop, the shorthand
3551   // scalar IR looks like:
3552   //
3553   //   scalar.ph:
3554   //     s_init = a[-1]
3555   //     br scalar.body
3556   //
3557   //   scalar.body:
3558   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3559   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3560   //     s2 = a[i]
3561   //     b[i] = s2 - s1
3562   //     br cond, scalar.body, ...
3563   //
3564   // In this example, s1 is a recurrence because its value depends on the
3565   // previous iteration. In the first phase of vectorization, we created a
3566   // temporary value for s1. We now complete the vectorization and produce the
3567   // shorthand vector IR shown below (for VF = 4, UF = 1).
3568   //
3569   //   vector.ph:
3570   //     v_init = vector(..., ..., ..., a[-1])
3571   //     br vector.body
3572   //
3573   //   vector.body
3574   //     i = phi [0, vector.ph], [i+4, vector.body]
3575   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3576   //     v2 = a[i, i+1, i+2, i+3];
3577   //     v3 = vector(v1(3), v2(0, 1, 2))
3578   //     b[i, i+1, i+2, i+3] = v2 - v3
3579   //     br cond, vector.body, middle.block
3580   //
3581   //   middle.block:
3582   //     x = v2(3)
3583   //     br scalar.ph
3584   //
3585   //   scalar.ph:
3586   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3587   //     br scalar.body
3588   //
3589   // After the vector loop completes execution, we extract the next value of
3590   // the recurrence (x) to use as the initial value in the scalar loop.
3591
3592   // Get the original loop preheader and single loop latch.
3593   auto *Preheader = OrigLoop->getLoopPreheader();
3594   auto *Latch = OrigLoop->getLoopLatch();
3595
3596   // Get the initial and previous values of the scalar recurrence.
3597   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3598   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3599
3600   // Create a vector from the initial value.
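  // Continuing the shorthand example above, for VF = 4 this builds
  //   vector.recur.init = <undef, undef, undef, s_init>
  // by inserting the scalar initial value into the last lane (VF - 1); only
  // that lane is read by the recurrence shuffle in the first iteration.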
3601 auto *VectorInit = ScalarInit; 3602 if (VF > 1) { 3603 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3604 VectorInit = Builder.CreateInsertElement( 3605 UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), 3606 VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); 3607 } 3608 3609 // We constructed a temporary phi node in the first phase of vectorization. 3610 // This phi node will eventually be deleted. 3611 Builder.SetInsertPoint( 3612 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3613 3614 // Create a phi node for the new recurrence. The current value will either be 3615 // the initial value inserted into a vector or loop-varying vector value. 3616 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3617 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3618 3619 // Get the vectorized previous value of the last part UF - 1. It appears last 3620 // among all unrolled iterations, due to the order of their construction. 3621 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3622 3623 // Find and set the insertion point after the previous value if it is an 3624 // instruction. 3625 BasicBlock::iterator InsertPt; 3626 // Note that the previous value may have been constant-folded so it is not 3627 // guaranteed to be an instruction in the vector loop. 3628 // FIXME: Loop invariant values do not form recurrences. We should deal with 3629 // them earlier. 3630 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3631 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3632 else { 3633 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3634 if (isa<PHINode>(PreviousLastPart)) 3635 // If the previous value is a phi node, we should insert after all the phi 3636 // nodes in the block containing the PHI to avoid breaking basic block 3637 // verification. Note that the basic block may be different to 3638 // LoopVectorBody, in case we predicate the loop. 3639 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3640 else 3641 InsertPt = ++PreviousInst->getIterator(); 3642 } 3643 Builder.SetInsertPoint(&*InsertPt); 3644 3645 // We will construct a vector for the recurrence by combining the values for 3646 // the current and previous iterations. This is the required shuffle mask. 3647 SmallVector<int, 8> ShuffleMask(VF); 3648 ShuffleMask[0] = VF - 1; 3649 for (unsigned I = 1; I < VF; ++I) 3650 ShuffleMask[I] = I + VF - 1; 3651 3652 // The vector from which to take the initial value for the current iteration 3653 // (actual or unrolled). Initially, this is the vector phi node. 3654 Value *Incoming = VecPhi; 3655 3656 // Shuffle the current and previous vector and update the vector parts. 3657 for (unsigned Part = 0; Part < UF; ++Part) { 3658 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3659 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3660 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3661 ShuffleMask) 3662 : Incoming; 3663 PhiPart->replaceAllUsesWith(Shuffle); 3664 cast<Instruction>(PhiPart)->eraseFromParent(); 3665 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3666 Incoming = PreviousPart; 3667 } 3668 3669 // Fix the latch value of the new recurrence in the vector loop. 3670 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3671 3672 // Extract the last vector element in the middle block. 
This will be the 3673 // initial value for the recurrence when jumping to the scalar loop. 3674 auto *ExtractForScalar = Incoming; 3675 if (VF > 1) { 3676 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3677 ExtractForScalar = Builder.CreateExtractElement( 3678 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3679 } 3680 // Extract the second last element in the middle block if the 3681 // Phi is used outside the loop. We need to extract the phi itself 3682 // and not the last element (the phi update in the current iteration). This 3683 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3684 // when the scalar loop is not run at all. 3685 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3686 if (VF > 1) 3687 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3688 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3689 // When loop is unrolled without vectorizing, initialize 3690 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3691 // `Incoming`. This is analogous to the vectorized case above: extracting the 3692 // second last element when VF > 1. 3693 else if (UF > 1) 3694 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3695 3696 // Fix the initial value of the original recurrence in the scalar loop. 3697 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3698 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3699 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3700 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3701 Start->addIncoming(Incoming, BB); 3702 } 3703 3704 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3705 Phi->setName("scalar.recur"); 3706 3707 // Finally, fix users of the recurrence outside the loop. The users will need 3708 // either the last value of the scalar recurrence or the last value of the 3709 // vector recurrence we extracted in the middle block. Since the loop is in 3710 // LCSSA form, we just need to find all the phi nodes for the original scalar 3711 // recurrence in the exit block, and then add an edge for the middle block. 3712 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3713 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3714 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3715 } 3716 } 3717 } 3718 3719 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3720 Constant *Zero = Builder.getInt32(0); 3721 3722 // Get it's reduction variable descriptor. 3723 assert(Legal->isReductionVariable(Phi) && 3724 "Unable to find the reduction variable"); 3725 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3726 3727 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3728 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3729 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3730 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3731 RdxDesc.getMinMaxRecurrenceKind(); 3732 setDebugLocFromInst(Builder, ReductionStartValue); 3733 3734 // We need to generate a reduction vector from the incoming scalar. 3735 // To do so, we need to generate the 'identity' vector and override 3736 // one of the elements with the incoming scalar reduction. We need 3737 // to do it in the vector-loop preheader. 3738 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3739 3740 // This is the vector-clone of the value that leaves the loop. 
3741   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3742
3743   // Find the reduction identity value: zero for addition, or and xor;
3744   // one for multiplication; -1 for and.
3745   Value *Identity;
3746   Value *VectorStart;
3747   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3748       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3749     // MinMax reductions have the start value as their identity.
3750     if (VF == 1) {
3751       VectorStart = Identity = ReductionStartValue;
3752     } else {
3753       VectorStart = Identity =
3754           Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3755     }
3756   } else {
3757     // Handle other reduction kinds:
3758     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3759         RK, VecTy->getScalarType());
3760     if (VF == 1) {
3761       Identity = Iden;
3762       // This vector is the Identity vector where the first element is the
3763       // incoming scalar reduction.
3764       VectorStart = ReductionStartValue;
3765     } else {
3766       Identity = ConstantVector::getSplat({VF, false}, Iden);
3767
3768       // This vector is the Identity vector where the first element is the
3769       // incoming scalar reduction.
3770       VectorStart =
3771           Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3772     }
3773   }
3774
3775   // Wrap flags are in general invalid after vectorization, clear them.
3776   clearReductionWrapFlags(RdxDesc);
3777
3778   // Fix the vector-loop phi.
3779
3780   // Reductions do not have to start at zero. They can start with
3781   // any loop invariant values.
3782   BasicBlock *Latch = OrigLoop->getLoopLatch();
3783   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3784
3785   for (unsigned Part = 0; Part < UF; ++Part) {
3786     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3787     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3788     // Make sure to add the reduction start value only to the
3789     // first unroll part.
3790     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3791     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3792     cast<PHINode>(VecRdxPhi)
3793         ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3794   }
3795
3796   // Before each round, move the insertion point right between
3797   // the PHIs and the values we are going to write.
3798   // This allows us to write both PHINodes and the extractelement
3799   // instructions.
3800   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3801
3802   setDebugLocFromInst(Builder, LoopExitInst);
3803
3804   // If the tail is folded by masking, the vector value to leave the loop
3805   // should be a Select choosing between the vectorized LoopExitInst and the
3806   // vectorized Phi, instead of the former.
3807   if (Cost->foldTailByMasking()) {
3808     for (unsigned Part = 0; Part < UF; ++Part) {
3809       Value *VecLoopExitInst =
3810           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3811       Value *Sel = nullptr;
3812       for (User *U : VecLoopExitInst->users()) {
3813         if (isa<SelectInst>(U)) {
3814           assert(!Sel && "Reduction exit feeding two selects");
3815           Sel = U;
3816         } else
3817           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3818       }
3819       assert(Sel && "Reduction exit feeds no select");
3820       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3821     }
3822   }
3823
3824   // If the vector reduction can be performed in a smaller type, we truncate
3825   // then extend the loop exit value to enable InstCombine to evaluate the
3826   // entire expression in the smaller type.
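  // For example (a hypothetical case): an i32 add reduction whose values are
  // known to fit in i8 is truncated to <VF x i8> here, reduced in the narrow
  // type, and the final scalar is sign- or zero-extended back to i32 further
  // below before it reaches the scalar loop and the LCSSA phis.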
3827 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3828 Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); 3829 Builder.SetInsertPoint( 3830 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3831 VectorParts RdxParts(UF); 3832 for (unsigned Part = 0; Part < UF; ++Part) { 3833 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3834 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3835 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3836 : Builder.CreateZExt(Trunc, VecTy); 3837 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3838 UI != RdxParts[Part]->user_end();) 3839 if (*UI != Trunc) { 3840 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3841 RdxParts[Part] = Extnd; 3842 } else { 3843 ++UI; 3844 } 3845 } 3846 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3847 for (unsigned Part = 0; Part < UF; ++Part) { 3848 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3849 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3850 } 3851 } 3852 3853 // Reduce all of the unrolled parts into a single vector. 3854 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3855 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3856 3857 // The middle block terminator has already been assigned a DebugLoc here (the 3858 // OrigLoop's single latch terminator). We want the whole middle block to 3859 // appear to execute on this line because: (a) it is all compiler generated, 3860 // (b) these instructions are always executed after evaluating the latch 3861 // conditional branch, and (c) other passes may add new predecessors which 3862 // terminate on this line. This is the easiest way to ensure we don't 3863 // accidentally cause an extra step back into the loop while debugging. 3864 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3865 for (unsigned Part = 1; Part < UF; ++Part) { 3866 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3867 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3868 // Floating point operations had to be 'fast' to enable the reduction. 3869 ReducedPartRdx = addFastMathFlag( 3870 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3871 ReducedPartRdx, "bin.rdx"), 3872 RdxDesc.getFastMathFlags()); 3873 else 3874 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3875 RdxPart); 3876 } 3877 3878 if (VF > 1) { 3879 bool NoNaN = Legal->hasFunNoNaNAttr(); 3880 ReducedPartRdx = 3881 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3882 // If the reduction can be performed in a smaller type, we need to extend 3883 // the reduction to the wider type before we branch to the original loop. 3884 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3885 ReducedPartRdx = 3886 RdxDesc.isSigned() 3887 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3888 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3889 } 3890 3891 // Create a phi node that merges control-flow from the backedge-taken check 3892 // block and the middle block. 
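  // In shorthand, the merge phi created below is
  //   bc.merge.rdx = phi [ ReductionStartValue, <bypass blocks> ],
  //                      [ ReducedPartRdx, middle.block ]
  // so the scalar remainder loop resumes from the partially reduced value
  // when the vector loop ran, and from the original start value otherwise.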
3893 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3894 LoopScalarPreHeader->getTerminator()); 3895 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3896 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3897 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3898 3899 // Now, we need to fix the users of the reduction variable 3900 // inside and outside of the scalar remainder loop. 3901 // We know that the loop is in LCSSA form. We need to update the 3902 // PHI nodes in the exit blocks. 3903 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3904 // All PHINodes need to have a single entry edge, or two if 3905 // we already fixed them. 3906 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3907 3908 // We found a reduction value exit-PHI. Update it with the 3909 // incoming bypass edge. 3910 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3911 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3912 } // end of the LCSSA phi scan. 3913 3914 // Fix the scalar loop reduction variable with the incoming reduction sum 3915 // from the vector body and from the backedge value. 3916 int IncomingEdgeBlockIdx = 3917 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3918 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3919 // Pick the other block. 3920 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3921 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3922 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3923 } 3924 3925 void InnerLoopVectorizer::clearReductionWrapFlags( 3926 RecurrenceDescriptor &RdxDesc) { 3927 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3928 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3929 RK != RecurrenceDescriptor::RK_IntegerMult) 3930 return; 3931 3932 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3933 assert(LoopExitInstr && "null loop exit instruction"); 3934 SmallVector<Instruction *, 8> Worklist; 3935 SmallPtrSet<Instruction *, 8> Visited; 3936 Worklist.push_back(LoopExitInstr); 3937 Visited.insert(LoopExitInstr); 3938 3939 while (!Worklist.empty()) { 3940 Instruction *Cur = Worklist.pop_back_val(); 3941 if (isa<OverflowingBinaryOperator>(Cur)) 3942 for (unsigned Part = 0; Part < UF; ++Part) { 3943 Value *V = getOrCreateVectorValue(Cur, Part); 3944 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3945 } 3946 3947 for (User *U : Cur->users()) { 3948 Instruction *UI = cast<Instruction>(U); 3949 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3950 Visited.insert(UI).second) 3951 Worklist.push_back(UI); 3952 } 3953 } 3954 } 3955 3956 void InnerLoopVectorizer::fixLCSSAPHIs() { 3957 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3958 if (LCSSAPhi.getNumIncomingValues() == 1) { 3959 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3960 // Non-instruction incoming values will have only one value. 3961 unsigned LastLane = 0; 3962 if (isa<Instruction>(IncomingValue)) 3963 LastLane = Cost->isUniformAfterVectorization( 3964 cast<Instruction>(IncomingValue), VF) 3965 ? 0 3966 : VF - 1; 3967 // Can be a loop invariant incoming value or the last scalar value to be 3968 // extracted from the vectorized loop. 
3969       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3970       Value *lastIncomingValue =
3971           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3972       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3973     }
3974   }
3975 }
3976
3977 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3978   // The basic block and loop containing the predicated instruction.
3979   auto *PredBB = PredInst->getParent();
3980   auto *VectorLoop = LI->getLoopFor(PredBB);
3981
3982   // Initialize a worklist with the operands of the predicated instruction.
3983   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3984
3985   // Holds instructions that we need to analyze again. An instruction may be
3986   // reanalyzed if we don't yet know if we can sink it or not.
3987   SmallVector<Instruction *, 8> InstsToReanalyze;
3988
3989   // Returns true if a given use occurs in the predicated block. Phi nodes use
3990   // their operands in their corresponding predecessor blocks.
3991   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3992     auto *I = cast<Instruction>(U.getUser());
3993     BasicBlock *BB = I->getParent();
3994     if (auto *Phi = dyn_cast<PHINode>(I))
3995       BB = Phi->getIncomingBlock(
3996           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3997     return BB == PredBB;
3998   };
3999
4000   // Iteratively sink the scalarized operands of the predicated instruction
4001   // into the block we created for it. When an instruction is sunk, its
4002   // operands are then added to the worklist. The algorithm ends when a pass
4003   // over the worklist does not sink a single instruction.
4004   bool Changed;
4005   do {
4006     // Add the instructions that need to be reanalyzed to the worklist, and
4007     // reset the changed indicator.
4008     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4009     InstsToReanalyze.clear();
4010     Changed = false;
4011
4012     while (!Worklist.empty()) {
4013       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4014
4015       // We can't sink an instruction if it is a phi node, is already in the
4016       // predicated block, is not in the loop, or may have side effects.
4017       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4018           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4019         continue;
4020
4021       // It's legal to sink the instruction if all its uses occur in the
4022       // predicated block. Otherwise, there's nothing to do yet, and we may
4023       // need to reanalyze the instruction.
4024       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4025         InstsToReanalyze.push_back(I);
4026         continue;
4027       }
4028
4029       // Move the instruction to the beginning of the predicated block, and add
4030       // its operands to the worklist.
4031       I->moveBefore(&*PredBB->getFirstInsertionPt());
4032       Worklist.insert(I->op_begin(), I->op_end());
4033
4034       // The sinking may have enabled other instructions to be sunk, so we will
4035       // need to iterate.
4036 Changed = true; 4037 } 4038 } while (Changed); 4039 } 4040 4041 void InnerLoopVectorizer::fixNonInductionPHIs() { 4042 for (PHINode *OrigPhi : OrigPHIsToFix) { 4043 PHINode *NewPhi = 4044 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4045 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4046 4047 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4048 predecessors(OrigPhi->getParent())); 4049 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4050 predecessors(NewPhi->getParent())); 4051 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4052 "Scalar and Vector BB should have the same number of predecessors"); 4053 4054 // The insertion point in Builder may be invalidated by the time we get 4055 // here. Force the Builder insertion point to something valid so that we do 4056 // not run into issues during insertion point restore in 4057 // getOrCreateVectorValue calls below. 4058 Builder.SetInsertPoint(NewPhi); 4059 4060 // The predecessor order is preserved and we can rely on mapping between 4061 // scalar and vector block predecessors. 4062 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4063 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4064 4065 // When looking up the new scalar/vector values to fix up, use incoming 4066 // values from original phi. 4067 Value *ScIncV = 4068 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4069 4070 // Scalar incoming value may need a broadcast 4071 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4072 NewPhi->addIncoming(NewIncV, NewPredBB); 4073 } 4074 } 4075 } 4076 4077 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4078 unsigned VF, bool IsPtrLoopInvariant, 4079 SmallBitVector &IsIndexLoopInvariant) { 4080 // Construct a vector GEP by widening the operands of the scalar GEP as 4081 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4082 // results in a vector of pointers when at least one operand of the GEP 4083 // is vector-typed. Thus, to keep the representation compact, we only use 4084 // vector-typed operands for loop-varying values. 4085 4086 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4087 // If we are vectorizing, but the GEP has only loop-invariant operands, 4088 // the GEP we build (by only using vector-typed operands for 4089 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4090 // produce a vector of pointers, we need to either arbitrarily pick an 4091 // operand to broadcast, or broadcast a clone of the original GEP. 4092 // Here, we broadcast a clone of the original. 4093 // 4094 // TODO: If at some point we decide to scalarize instructions having 4095 // loop-invariant operands, this special case will no longer be 4096 // required. We would add the scalarization decision to 4097 // collectLoopScalars() and teach getVectorValue() to broadcast 4098 // the lane-zero scalar value. 4099 auto *Clone = Builder.Insert(GEP->clone()); 4100 for (unsigned Part = 0; Part < UF; ++Part) { 4101 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4102 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4103 addMetadata(EntryPart, GEP); 4104 } 4105 } else { 4106 // If the GEP has at least one loop-varying operand, we are sure to 4107 // produce a vector of pointers. But if we are only unrolling, we want 4108 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4109 // produce with the code below will be scalar (if VF == 1) or vector 4110 // (otherwise). 
Note that for the unroll-only case, we still maintain 4111 // values in the vector mapping with initVector, as we do for other 4112 // instructions. 4113 for (unsigned Part = 0; Part < UF; ++Part) { 4114 // The pointer operand of the new GEP. If it's loop-invariant, we 4115 // won't broadcast it. 4116 auto *Ptr = IsPtrLoopInvariant 4117 ? GEP->getPointerOperand() 4118 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4119 4120 // Collect all the indices for the new GEP. If any index is 4121 // loop-invariant, we won't broadcast it. 4122 SmallVector<Value *, 4> Indices; 4123 for (auto Index : enumerate(GEP->indices())) { 4124 Value *User = Index.value().get(); 4125 if (IsIndexLoopInvariant[Index.index()]) 4126 Indices.push_back(User); 4127 else 4128 Indices.push_back(getOrCreateVectorValue(User, Part)); 4129 } 4130 4131 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4132 // but it should be a vector, otherwise. 4133 auto *NewGEP = 4134 GEP->isInBounds() 4135 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4136 Indices) 4137 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4138 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4139 "NewGEP is not a pointer vector"); 4140 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4141 addMetadata(NewGEP, GEP); 4142 } 4143 } 4144 } 4145 4146 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4147 unsigned VF) { 4148 PHINode *P = cast<PHINode>(PN); 4149 if (EnableVPlanNativePath) { 4150 // Currently we enter here in the VPlan-native path for non-induction 4151 // PHIs where all control flow is uniform. We simply widen these PHIs. 4152 // Create a vector phi with no operands - the vector phi operands will be 4153 // set at the end of vector code generation. 4154 Type *VecTy = 4155 (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4156 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4157 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4158 OrigPHIsToFix.push_back(P); 4159 4160 return; 4161 } 4162 4163 assert(PN->getParent() == OrigLoop->getHeader() && 4164 "Non-header phis should have been handled elsewhere"); 4165 4166 // In order to support recurrences we need to be able to vectorize Phi nodes. 4167 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4168 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4169 // this value when we vectorize all of the instructions that use the PHI. 4170 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4171 for (unsigned Part = 0; Part < UF; ++Part) { 4172 // This is phase one of vectorizing PHIs. 4173 Type *VecTy = 4174 (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); 4175 Value *EntryPart = PHINode::Create( 4176 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4177 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4178 } 4179 return; 4180 } 4181 4182 setDebugLocFromInst(Builder, P); 4183 4184 // This PHINode must be an induction variable. 4185 // Make sure that we know about it. 4186 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4187 4188 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4189 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4190 4191 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4192 // which can be found from the original scalar operations. 
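// As an illustrative sketch of the pointer-induction case below (assuming
// VF = 4 and UF = 1): for a phi over i32* that advances by one element per
// iteration, the vector body materializes four scalar addresses per part,
// p + 0, p + 1, p + 2 and p + 3 (named "next.gep"), rather than a single
// vector-of-pointers GEP, because scalar GEPs generate better code.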
4193 switch (II.getKind()) { 4194 case InductionDescriptor::IK_NoInduction: 4195 llvm_unreachable("Unknown induction"); 4196 case InductionDescriptor::IK_IntInduction: 4197 case InductionDescriptor::IK_FpInduction: 4198 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4199 case InductionDescriptor::IK_PtrInduction: { 4200 // Handle the pointer induction variable case. 4201 assert(P->getType()->isPointerTy() && "Unexpected type."); 4202 // This is the normalized GEP that starts counting at zero. 4203 Value *PtrInd = Induction; 4204 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4205 // Determine the number of scalars we need to generate for each unroll 4206 // iteration. If the instruction is uniform, we only need to generate the 4207 // first lane. Otherwise, we generate all VF values. 4208 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4209 // These are the scalar results. Notice that we don't generate vector GEPs 4210 // because scalar GEPs result in better code. 4211 for (unsigned Part = 0; Part < UF; ++Part) { 4212 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4213 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4214 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4215 Value *SclrGep = 4216 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4217 SclrGep->setName("next.gep"); 4218 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4219 } 4220 } 4221 return; 4222 } 4223 } 4224 } 4225 4226 /// A helper function for checking whether an integer division-related 4227 /// instruction may divide by zero (in which case it must be predicated if 4228 /// executed conditionally in the scalar code). 4229 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4230 /// Non-zero divisors that are non compile-time constants will not be 4231 /// converted into multiplication, so we will still end up scalarizing 4232 /// the division, but can do so w/o predication. 4233 static bool mayDivideByZero(Instruction &I) { 4234 assert((I.getOpcode() == Instruction::UDiv || 4235 I.getOpcode() == Instruction::SDiv || 4236 I.getOpcode() == Instruction::URem || 4237 I.getOpcode() == Instruction::SRem) && 4238 "Unexpected instruction"); 4239 Value *Divisor = I.getOperand(1); 4240 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4241 return !CInt || CInt->isZero(); 4242 } 4243 4244 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4245 VPTransformState &State) { 4246 switch (I.getOpcode()) { 4247 case Instruction::Call: 4248 case Instruction::Br: 4249 case Instruction::PHI: 4250 case Instruction::GetElementPtr: 4251 case Instruction::Select: 4252 llvm_unreachable("This instruction is handled by a different recipe."); 4253 case Instruction::UDiv: 4254 case Instruction::SDiv: 4255 case Instruction::SRem: 4256 case Instruction::URem: 4257 case Instruction::Add: 4258 case Instruction::FAdd: 4259 case Instruction::Sub: 4260 case Instruction::FSub: 4261 case Instruction::FNeg: 4262 case Instruction::Mul: 4263 case Instruction::FMul: 4264 case Instruction::FDiv: 4265 case Instruction::FRem: 4266 case Instruction::Shl: 4267 case Instruction::LShr: 4268 case Instruction::AShr: 4269 case Instruction::And: 4270 case Instruction::Or: 4271 case Instruction::Xor: { 4272 // Just widen unops and binops. 
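// For example (illustrative IR, assuming VF = 4 and UF = 1), the scalar
//   %r = add nsw i32 %a, %b
// is widened to
//   %r.vec = add nsw <4 x i32> %a.vec, %b.vec
// with the nsw flag carried over from the scalar instruction via copyIRFlags.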
4273 setDebugLocFromInst(Builder, &I); 4274 4275 for (unsigned Part = 0; Part < UF; ++Part) { 4276 SmallVector<Value *, 2> Ops; 4277 for (VPValue *VPOp : User.operands()) 4278 Ops.push_back(State.get(VPOp, Part)); 4279 4280 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4281 4282 if (auto *VecOp = dyn_cast<Instruction>(V)) 4283 VecOp->copyIRFlags(&I); 4284 4285 // Use this vector value for all users of the original instruction. 4286 VectorLoopValueMap.setVectorValue(&I, Part, V); 4287 addMetadata(V, &I); 4288 } 4289 4290 break; 4291 } 4292 case Instruction::ICmp: 4293 case Instruction::FCmp: { 4294 // Widen compares. Generate vector compares. 4295 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4296 auto *Cmp = cast<CmpInst>(&I); 4297 setDebugLocFromInst(Builder, Cmp); 4298 for (unsigned Part = 0; Part < UF; ++Part) { 4299 Value *A = State.get(User.getOperand(0), Part); 4300 Value *B = State.get(User.getOperand(1), Part); 4301 Value *C = nullptr; 4302 if (FCmp) { 4303 // Propagate fast math flags. 4304 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4305 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4306 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4307 } else { 4308 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4309 } 4310 VectorLoopValueMap.setVectorValue(&I, Part, C); 4311 addMetadata(C, &I); 4312 } 4313 4314 break; 4315 } 4316 4317 case Instruction::ZExt: 4318 case Instruction::SExt: 4319 case Instruction::FPToUI: 4320 case Instruction::FPToSI: 4321 case Instruction::FPExt: 4322 case Instruction::PtrToInt: 4323 case Instruction::IntToPtr: 4324 case Instruction::SIToFP: 4325 case Instruction::UIToFP: 4326 case Instruction::Trunc: 4327 case Instruction::FPTrunc: 4328 case Instruction::BitCast: { 4329 auto *CI = cast<CastInst>(&I); 4330 setDebugLocFromInst(Builder, CI); 4331 4332 /// Vectorize casts. 4333 Type *DestTy = 4334 (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); 4335 4336 for (unsigned Part = 0; Part < UF; ++Part) { 4337 Value *A = State.get(User.getOperand(0), Part); 4338 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4339 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4340 addMetadata(Cast, &I); 4341 } 4342 break; 4343 } 4344 default: 4345 // This instruction is not vectorized by simple widening. 4346 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4347 llvm_unreachable("Unhandled instruction!"); 4348 } // end of switch. 4349 } 4350 4351 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4352 VPTransformState &State) { 4353 assert(!isa<DbgInfoIntrinsic>(I) && 4354 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4355 setDebugLocFromInst(Builder, &I); 4356 4357 Module *M = I.getParent()->getParent()->getParent(); 4358 auto *CI = cast<CallInst>(&I); 4359 4360 SmallVector<Type *, 4> Tys; 4361 for (Value *ArgOperand : CI->arg_operands()) 4362 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4363 4364 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4365 4366 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4367 // version of the instruction. 4368 // Is it beneficial to perform intrinsic call compared to lib call? 
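// For example (illustrative): a call to llvm.sqrt.f32 may be widened to the
// llvm.sqrt.v4f32 intrinsic when its cost is no higher than the vector
// lib-call cost computed below; otherwise the vectorized library function
// found through VFDatabase is called instead.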
4369 bool NeedToScalarize = false; 4370 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4371 bool UseVectorIntrinsic = 4372 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4373 assert((UseVectorIntrinsic || !NeedToScalarize) && 4374 "Instruction should be scalarized elsewhere."); 4375 4376 for (unsigned Part = 0; Part < UF; ++Part) { 4377 SmallVector<Value *, 4> Args; 4378 for (auto &I : enumerate(ArgOperands.operands())) { 4379 // Some intrinsics have a scalar argument - don't replace it with a 4380 // vector. 4381 Value *Arg; 4382 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4383 Arg = State.get(I.value(), Part); 4384 else 4385 Arg = State.get(I.value(), {0, 0}); 4386 Args.push_back(Arg); 4387 } 4388 4389 Function *VectorF; 4390 if (UseVectorIntrinsic) { 4391 // Use vector version of the intrinsic. 4392 Type *TysForDecl[] = {CI->getType()}; 4393 if (VF > 1) 4394 TysForDecl[0] = 4395 FixedVectorType::get(CI->getType()->getScalarType(), VF); 4396 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4397 assert(VectorF && "Can't retrieve vector intrinsic."); 4398 } else { 4399 // Use vector version of the function call. 4400 const VFShape Shape = 4401 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4402 #ifndef NDEBUG 4403 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4404 "Can't create vector function."); 4405 #endif 4406 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4407 } 4408 SmallVector<OperandBundleDef, 1> OpBundles; 4409 CI->getOperandBundlesAsDefs(OpBundles); 4410 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4411 4412 if (isa<FPMathOperator>(V)) 4413 V->copyFastMathFlags(CI); 4414 4415 VectorLoopValueMap.setVectorValue(&I, Part, V); 4416 addMetadata(V, &I); 4417 } 4418 } 4419 4420 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4421 VPUser &Operands, 4422 bool InvariantCond, 4423 VPTransformState &State) { 4424 setDebugLocFromInst(Builder, &I); 4425 4426 // The condition can be loop invariant but still defined inside the 4427 // loop. This means that we can't just use the original 'cond' value. 4428 // We have to take the 'vectorized' value and pick the first lane. 4429 // Instcombine will make this a no-op. 4430 auto *InvarCond = 4431 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4432 4433 for (unsigned Part = 0; Part < UF; ++Part) { 4434 Value *Cond = 4435 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4436 Value *Op0 = State.get(Operands.getOperand(1), Part); 4437 Value *Op1 = State.get(Operands.getOperand(2), Part); 4438 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4439 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4440 addMetadata(Sel, &I); 4441 } 4442 } 4443 4444 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4445 // We should not collect Scalars more than once per VF. Right now, this 4446 // function is called from collectUniformsAndScalars(), which already does 4447 // this check. Collecting Scalars for VF=1 does not make any sense. 4448 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4449 "This function should not be visited twice for the same VF"); 4450 4451 SmallSetVector<Instruction *, 8> Worklist; 4452 4453 // These sets are used to seed the analysis with pointers used by memory 4454 // accesses that will remain scalar. 
4455 SmallSetVector<Instruction *, 8> ScalarPtrs; 4456 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4457 4458 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4459 // The pointer operands of loads and stores will be scalar as long as the 4460 // memory access is not a gather or scatter operation. The value operand of a 4461 // store will remain scalar if the store is scalarized. 4462 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4463 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4464 assert(WideningDecision != CM_Unknown && 4465 "Widening decision should be ready at this moment"); 4466 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4467 if (Ptr == Store->getValueOperand()) 4468 return WideningDecision == CM_Scalarize; 4469 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4470 "Ptr is neither a value or pointer operand"); 4471 return WideningDecision != CM_GatherScatter; 4472 }; 4473 4474 // A helper that returns true if the given value is a bitcast or 4475 // getelementptr instruction contained in the loop. 4476 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4477 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4478 isa<GetElementPtrInst>(V)) && 4479 !TheLoop->isLoopInvariant(V); 4480 }; 4481 4482 // A helper that evaluates a memory access's use of a pointer. If the use 4483 // will be a scalar use, and the pointer is only used by memory accesses, we 4484 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4485 // PossibleNonScalarPtrs. 4486 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4487 // We only care about bitcast and getelementptr instructions contained in 4488 // the loop. 4489 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4490 return; 4491 4492 // If the pointer has already been identified as scalar (e.g., if it was 4493 // also identified as uniform), there's nothing to do. 4494 auto *I = cast<Instruction>(Ptr); 4495 if (Worklist.count(I)) 4496 return; 4497 4498 // If the use of the pointer will be a scalar use, and all users of the 4499 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4500 // place the pointer in PossibleNonScalarPtrs. 4501 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4502 return isa<LoadInst>(U) || isa<StoreInst>(U); 4503 })) 4504 ScalarPtrs.insert(I); 4505 else 4506 PossibleNonScalarPtrs.insert(I); 4507 }; 4508 4509 // We seed the scalars analysis with three classes of instructions: (1) 4510 // instructions marked uniform-after-vectorization, (2) bitcast and 4511 // getelementptr instructions used by memory accesses requiring a scalar use, 4512 // and (3) pointer induction variables and their update instructions (we 4513 // currently only scalarize these). 4514 // 4515 // (1) Add to the worklist all instructions that have been identified as 4516 // uniform-after-vectorization. 4517 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4518 4519 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4520 // memory accesses requiring a scalar use. The pointer operands of loads and 4521 // stores will be scalar as long as the memory accesses is not a gather or 4522 // scatter operation. The value operand of a store will remain scalar if the 4523 // store is scalarized. 
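// For example (illustrative IR): in
//   store i32 %v, i32* %gep
// where the store will be scalarized, %gep is a scalar use and is a candidate
// for ScalarPtrs (and %v stays scalar too); if the same %gep also feeds an
// access that becomes a gather/scatter, it goes to PossibleNonScalarPtrs.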
4524 for (auto *BB : TheLoop->blocks()) 4525 for (auto &I : *BB) { 4526 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4527 evaluatePtrUse(Load, Load->getPointerOperand()); 4528 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4529 evaluatePtrUse(Store, Store->getPointerOperand()); 4530 evaluatePtrUse(Store, Store->getValueOperand()); 4531 } 4532 } 4533 for (auto *I : ScalarPtrs) 4534 if (!PossibleNonScalarPtrs.count(I)) { 4535 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4536 Worklist.insert(I); 4537 } 4538 4539 // (3) Add to the worklist all pointer induction variables and their update 4540 // instructions. 4541 // 4542 // TODO: Once we are able to vectorize pointer induction variables we should 4543 // no longer insert them into the worklist here. 4544 auto *Latch = TheLoop->getLoopLatch(); 4545 for (auto &Induction : Legal->getInductionVars()) { 4546 auto *Ind = Induction.first; 4547 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4548 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4549 continue; 4550 Worklist.insert(Ind); 4551 Worklist.insert(IndUpdate); 4552 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4553 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4554 << "\n"); 4555 } 4556 4557 // Insert the forced scalars. 4558 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4559 // induction variable when the PHI user is scalarized. 4560 auto ForcedScalar = ForcedScalars.find(VF); 4561 if (ForcedScalar != ForcedScalars.end()) 4562 for (auto *I : ForcedScalar->second) 4563 Worklist.insert(I); 4564 4565 // Expand the worklist by looking through any bitcasts and getelementptr 4566 // instructions we've already identified as scalar. This is similar to the 4567 // expansion step in collectLoopUniforms(); however, here we're only 4568 // expanding to include additional bitcasts and getelementptr instructions. 4569 unsigned Idx = 0; 4570 while (Idx != Worklist.size()) { 4571 Instruction *Dst = Worklist[Idx++]; 4572 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4573 continue; 4574 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4575 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4576 auto *J = cast<Instruction>(U); 4577 return !TheLoop->contains(J) || Worklist.count(J) || 4578 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4579 isScalarUse(J, Src)); 4580 })) { 4581 Worklist.insert(Src); 4582 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4583 } 4584 } 4585 4586 // An induction variable will remain scalar if all users of the induction 4587 // variable and induction variable update remain scalar. 4588 for (auto &Induction : Legal->getInductionVars()) { 4589 auto *Ind = Induction.first; 4590 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4591 4592 // We already considered pointer induction variables, so there's no reason 4593 // to look at their users again. 4594 // 4595 // TODO: Once we are able to vectorize pointer induction variables we 4596 // should no longer skip over them here. 4597 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4598 continue; 4599 4600 // If tail-folding is applied, the primary induction variable will be used 4601 // to feed a vector compare. 4602 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4603 continue; 4604 4605 // Determine if all users of the induction variable are scalar after 4606 // vectorization. 
4607 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4608 auto *I = cast<Instruction>(U); 4609 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4610 }); 4611 if (!ScalarInd) 4612 continue; 4613 4614 // Determine if all users of the induction variable update instruction are 4615 // scalar after vectorization. 4616 auto ScalarIndUpdate = 4617 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4618 auto *I = cast<Instruction>(U); 4619 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4620 }); 4621 if (!ScalarIndUpdate) 4622 continue; 4623 4624 // The induction variable and its update instruction will remain scalar. 4625 Worklist.insert(Ind); 4626 Worklist.insert(IndUpdate); 4627 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4628 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4629 << "\n"); 4630 } 4631 4632 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4633 } 4634 4635 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4636 if (!blockNeedsPredication(I->getParent())) 4637 return false; 4638 switch(I->getOpcode()) { 4639 default: 4640 break; 4641 case Instruction::Load: 4642 case Instruction::Store: { 4643 if (!Legal->isMaskRequired(I)) 4644 return false; 4645 auto *Ptr = getLoadStorePointerOperand(I); 4646 auto *Ty = getMemInstValueType(I); 4647 // We have already decided how to vectorize this instruction, get that 4648 // result. 4649 if (VF > 1) { 4650 InstWidening WideningDecision = getWideningDecision(I, VF); 4651 assert(WideningDecision != CM_Unknown && 4652 "Widening decision should be ready at this moment"); 4653 return WideningDecision == CM_Scalarize; 4654 } 4655 const Align Alignment = getLoadStoreAlignment(I); 4656 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4657 isLegalMaskedGather(Ty, Alignment)) 4658 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4659 isLegalMaskedScatter(Ty, Alignment)); 4660 } 4661 case Instruction::UDiv: 4662 case Instruction::SDiv: 4663 case Instruction::SRem: 4664 case Instruction::URem: 4665 return mayDivideByZero(*I); 4666 } 4667 return false; 4668 } 4669 4670 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4671 unsigned VF) { 4672 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4673 assert(getWideningDecision(I, VF) == CM_Unknown && 4674 "Decision should not be set yet."); 4675 auto *Group = getInterleavedAccessGroup(I); 4676 assert(Group && "Must have a group."); 4677 4678 // If the instruction's allocated size doesn't equal it's type size, it 4679 // requires padding and will be scalarized. 4680 auto &DL = I->getModule()->getDataLayout(); 4681 auto *ScalarTy = getMemInstValueType(I); 4682 if (hasIrregularType(ScalarTy, DL, VF)) 4683 return false; 4684 4685 // Check if masking is required. 4686 // A Group may need masking for one of two reasons: it resides in a block that 4687 // needs predication, or it was decided to use masking to deal with gaps. 
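// For example (illustrative): a load group with factor 3 whose members only
// cover indices 0 and 1 has a gap at index 2; its wide loads also read the
// gap elements, which near the end of the trip range may lie past the data
// the scalar loop would touch, so the group either keeps a scalar epilogue
// or, when no epilogue is allowed, must be executed under a mask.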
4688 bool PredicatedAccessRequiresMasking = 4689 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4690 bool AccessWithGapsRequiresMasking = 4691 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4692 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4693 return true; 4694 4695 // If masked interleaving is required, we expect that the user/target had 4696 // enabled it, because otherwise it either wouldn't have been created or 4697 // it should have been invalidated by the CostModel. 4698 assert(useMaskedInterleavedAccesses(TTI) && 4699 "Masked interleave-groups for predicated accesses are not enabled."); 4700 4701 auto *Ty = getMemInstValueType(I); 4702 const Align Alignment = getLoadStoreAlignment(I); 4703 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4704 : TTI.isLegalMaskedStore(Ty, Alignment); 4705 } 4706 4707 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4708 unsigned VF) { 4709 // Get and ensure we have a valid memory instruction. 4710 LoadInst *LI = dyn_cast<LoadInst>(I); 4711 StoreInst *SI = dyn_cast<StoreInst>(I); 4712 assert((LI || SI) && "Invalid memory instruction"); 4713 4714 auto *Ptr = getLoadStorePointerOperand(I); 4715 4716 // In order to be widened, the pointer should be consecutive, first of all. 4717 if (!Legal->isConsecutivePtr(Ptr)) 4718 return false; 4719 4720 // If the instruction is a store located in a predicated block, it will be 4721 // scalarized. 4722 if (isScalarWithPredication(I)) 4723 return false; 4724 4725 // If the instruction's allocated size doesn't equal it's type size, it 4726 // requires padding and will be scalarized. 4727 auto &DL = I->getModule()->getDataLayout(); 4728 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4729 if (hasIrregularType(ScalarTy, DL, VF)) 4730 return false; 4731 4732 return true; 4733 } 4734 4735 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4736 // We should not collect Uniforms more than once per VF. Right now, 4737 // this function is called from collectUniformsAndScalars(), which 4738 // already does this check. Collecting Uniforms for VF=1 does not make any 4739 // sense. 4740 4741 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4742 "This function should not be visited twice for the same VF"); 4743 4744 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4745 // not analyze again. Uniforms.count(VF) will return 1. 4746 Uniforms[VF].clear(); 4747 4748 // We now know that the loop is vectorizable! 4749 // Collect instructions inside the loop that will remain uniform after 4750 // vectorization. 4751 4752 // Global values, params and instructions outside of current loop are out of 4753 // scope. 4754 auto isOutOfScope = [&](Value *V) -> bool { 4755 Instruction *I = dyn_cast<Instruction>(V); 4756 return (!I || !TheLoop->contains(I)); 4757 }; 4758 4759 SetVector<Instruction *> Worklist; 4760 BasicBlock *Latch = TheLoop->getLoopLatch(); 4761 4762 // Instructions that are scalar with predication must not be considered 4763 // uniform after vectorization, because that would create an erroneous 4764 // replicating region where only a single instance out of VF should be formed. 4765 // TODO: optimize such seldom cases if found important, see PR40816. 
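// For example (illustrative): a conditional store that is scalarized and
// predicated must execute once per active lane; treating it as uniform would
// emit just one guarded instance for the whole vector iteration, which is why
// addToWorklistIfAllowed below rejects such instructions.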
4766 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4767 if (isScalarWithPredication(I, VF)) { 4768 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4769 << *I << "\n"); 4770 return; 4771 } 4772 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4773 Worklist.insert(I); 4774 }; 4775 4776 // Start with the conditional branch. If the branch condition is an 4777 // instruction contained in the loop that is only used by the branch, it is 4778 // uniform. 4779 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4780 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4781 addToWorklistIfAllowed(Cmp); 4782 4783 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4784 // are pointers that are treated like consecutive pointers during 4785 // vectorization. The pointer operands of interleaved accesses are an 4786 // example. 4787 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4788 4789 // Holds pointer operands of instructions that are possibly non-uniform. 4790 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4791 4792 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4793 InstWidening WideningDecision = getWideningDecision(I, VF); 4794 assert(WideningDecision != CM_Unknown && 4795 "Widening decision should be ready at this moment"); 4796 4797 return (WideningDecision == CM_Widen || 4798 WideningDecision == CM_Widen_Reverse || 4799 WideningDecision == CM_Interleave); 4800 }; 4801 // Iterate over the instructions in the loop, and collect all 4802 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4803 // that a consecutive-like pointer operand will be scalarized, we collect it 4804 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4805 // getelementptr instruction can be used by both vectorized and scalarized 4806 // memory instructions. For example, if a loop loads and stores from the same 4807 // location, but the store is conditional, the store will be scalarized, and 4808 // the getelementptr won't remain uniform. 4809 for (auto *BB : TheLoop->blocks()) 4810 for (auto &I : *BB) { 4811 // If there's no pointer operand, there's nothing to do. 4812 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4813 if (!Ptr) 4814 continue; 4815 4816 // True if all users of Ptr are memory accesses that have Ptr as their 4817 // pointer operand. 4818 auto UsersAreMemAccesses = 4819 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4820 return getLoadStorePointerOperand(U) == Ptr; 4821 }); 4822 4823 // Ensure the memory instruction will not be scalarized or used by 4824 // gather/scatter, making its pointer operand non-uniform. If the pointer 4825 // operand is used by any instruction other than a memory access, we 4826 // conservatively assume the pointer operand may be non-uniform. 4827 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4828 PossibleNonUniformPtrs.insert(Ptr); 4829 4830 // If the memory instruction will be vectorized and its pointer operand 4831 // is consecutive-like, or interleaving - the pointer operand should 4832 // remain uniform. 4833 else 4834 ConsecutiveLikePtrs.insert(Ptr); 4835 } 4836 4837 // Add to the Worklist all consecutive and consecutive-like pointers that 4838 // aren't also identified as possibly non-uniform. 
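// For example (illustrative): the pointer operand of a load that is widened
// as a consecutive access stays uniform, since each vector iteration issues
// one wide load from a single scalar address; the same pointer feeding a
// scalarized conditional store would instead sit in PossibleNonUniformPtrs.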
4839 for (auto *V : ConsecutiveLikePtrs) 4840 if (!PossibleNonUniformPtrs.count(V)) 4841 addToWorklistIfAllowed(V); 4842 4843 // Expand Worklist in topological order: whenever a new instruction 4844 // is added , its users should be already inside Worklist. It ensures 4845 // a uniform instruction will only be used by uniform instructions. 4846 unsigned idx = 0; 4847 while (idx != Worklist.size()) { 4848 Instruction *I = Worklist[idx++]; 4849 4850 for (auto OV : I->operand_values()) { 4851 // isOutOfScope operands cannot be uniform instructions. 4852 if (isOutOfScope(OV)) 4853 continue; 4854 // First order recurrence Phi's should typically be considered 4855 // non-uniform. 4856 auto *OP = dyn_cast<PHINode>(OV); 4857 if (OP && Legal->isFirstOrderRecurrence(OP)) 4858 continue; 4859 // If all the users of the operand are uniform, then add the 4860 // operand into the uniform worklist. 4861 auto *OI = cast<Instruction>(OV); 4862 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4863 auto *J = cast<Instruction>(U); 4864 return Worklist.count(J) || 4865 (OI == getLoadStorePointerOperand(J) && 4866 isUniformDecision(J, VF)); 4867 })) 4868 addToWorklistIfAllowed(OI); 4869 } 4870 } 4871 4872 // Returns true if Ptr is the pointer operand of a memory access instruction 4873 // I, and I is known to not require scalarization. 4874 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4875 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4876 }; 4877 4878 // For an instruction to be added into Worklist above, all its users inside 4879 // the loop should also be in Worklist. However, this condition cannot be 4880 // true for phi nodes that form a cyclic dependence. We must process phi 4881 // nodes separately. An induction variable will remain uniform if all users 4882 // of the induction variable and induction variable update remain uniform. 4883 // The code below handles both pointer and non-pointer induction variables. 4884 for (auto &Induction : Legal->getInductionVars()) { 4885 auto *Ind = Induction.first; 4886 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4887 4888 // Determine if all users of the induction variable are uniform after 4889 // vectorization. 4890 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4891 auto *I = cast<Instruction>(U); 4892 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4893 isVectorizedMemAccessUse(I, Ind); 4894 }); 4895 if (!UniformInd) 4896 continue; 4897 4898 // Determine if all users of the induction variable update instruction are 4899 // uniform after vectorization. 4900 auto UniformIndUpdate = 4901 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4902 auto *I = cast<Instruction>(U); 4903 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4904 isVectorizedMemAccessUse(I, IndUpdate); 4905 }); 4906 if (!UniformIndUpdate) 4907 continue; 4908 4909 // The induction variable and its update instruction will remain uniform. 4910 addToWorklistIfAllowed(Ind); 4911 addToWorklistIfAllowed(IndUpdate); 4912 } 4913 4914 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4915 } 4916 4917 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4918 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4919 4920 if (Legal->getRuntimePointerChecking()->Need) { 4921 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4922 "runtime pointer checks needed. 
Enable vectorization of this "
4923 "loop with '#pragma clang loop vectorize(enable)' when "
4924 "compiling with -Os/-Oz",
4925 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4926 return true;
4927 }
4928
4929 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4930 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4931 "runtime SCEV checks needed. Enable vectorization of this "
4932 "loop with '#pragma clang loop vectorize(enable)' when "
4933 "compiling with -Os/-Oz",
4934 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4935 return true;
4936 }
4937
4938 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4939 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4940 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4941 "runtime stride == 1 checks needed. Enable vectorization of "
4942 "this loop with '#pragma clang loop vectorize(enable)' when "
4943 "compiling with -Os/-Oz",
4944 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4945 return true;
4946 }
4947
4948 return false;
4949 }
4950
4951 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
4952 unsigned UserIC) {
4953 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4954 // TODO: It may be useful to do so, since it's still likely to be dynamically
4955 // uniform if the target can skip.
4956 reportVectorizationFailure(
4957 "Not inserting runtime ptr check for divergent target",
4958 "runtime pointer checks needed. Not enabled for divergent target",
4959 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4960 return None;
4961 }
4962
4963 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4964 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4965 if (TC == 1) {
4966 reportVectorizationFailure("Single iteration (non) loop",
4967 "loop trip count is one, irrelevant for vectorization",
4968 "SingleIterationLoop", ORE, TheLoop);
4969 return None;
4970 }
4971
4972 switch (ScalarEpilogueStatus) {
4973 case CM_ScalarEpilogueAllowed:
4974 return UserVF ? UserVF : computeFeasibleMaxVF(TC);
4975 case CM_ScalarEpilogueNotNeededUsePredicate:
4976 LLVM_DEBUG(
4977 dbgs() << "LV: vector predicate hint/switch found.\n"
4978 << "LV: Not allowing scalar epilogue, creating predicated "
4979 << "vector loop.\n");
4980 break;
4981 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4982 // fallthrough as a special case of OptForSize
4983 case CM_ScalarEpilogueNotAllowedOptSize:
4984 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4985 LLVM_DEBUG(
4986 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4987 else
4988 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4989 << "count.\n");
4990
4991 // Bail if runtime checks are required, which are not good when optimising
4992 // for size.
4993 if (runtimeChecksRequired())
4994 return None;
4995 break;
4996 }
4997
4998 // Now try the tail folding.
4999
5000 // Invalidate interleave groups that require an epilogue if we can't mask
5001 // the interleave-group.
5002 if (!useMaskedInterleavedAccesses(TTI)) {
5003 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5004 "No decisions should have been taken at this point");
5005 // Note: There is no need to invalidate any cost modeling decisions here, as
5006 // none were taken so far.
5007 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5008 }
5009
5010 unsigned MaxVF = UserVF ?
UserVF : computeFeasibleMaxVF(TC); 5011 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5012 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5013 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5014 // Accept MaxVF if we do not have a tail. 5015 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5016 return MaxVF; 5017 } 5018 5019 // If we don't know the precise trip count, or if the trip count that we 5020 // found modulo the vectorization factor is not zero, try to fold the tail 5021 // by masking. 5022 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5023 if (Legal->prepareToFoldTailByMasking()) { 5024 FoldTailByMasking = true; 5025 return MaxVF; 5026 } 5027 5028 if (TC == 0) { 5029 reportVectorizationFailure( 5030 "Unable to calculate the loop count due to complex control flow", 5031 "unable to calculate the loop count due to complex control flow", 5032 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5033 return None; 5034 } 5035 5036 reportVectorizationFailure( 5037 "Cannot optimize for size and vectorize at the same time.", 5038 "cannot optimize for size and vectorize at the same time. " 5039 "Enable vectorization of this loop with '#pragma clang loop " 5040 "vectorize(enable)' when compiling with -Os/-Oz", 5041 "NoTailLoopWithOptForSize", ORE, TheLoop); 5042 return None; 5043 } 5044 5045 unsigned 5046 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5047 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5048 unsigned SmallestType, WidestType; 5049 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5050 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5051 5052 // Get the maximum safe dependence distance in bits computed by LAA. 5053 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5054 // the memory accesses that is most restrictive (involved in the smallest 5055 // dependence distance). 5056 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5057 5058 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5059 5060 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5061 // Note that both WidestRegister and WidestType may not be a powers of 2. 5062 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5063 5064 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5065 << " / " << WidestType << " bits.\n"); 5066 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5067 << WidestRegister << " bits.\n"); 5068 5069 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5070 " into one vector!"); 5071 if (MaxVectorSize == 0) { 5072 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5073 MaxVectorSize = 1; 5074 return MaxVectorSize; 5075 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5076 isPowerOf2_32(ConstTripCount)) { 5077 // We need to clamp the VF to be the ConstTripCount. There is no point in 5078 // choosing a higher viable VF as done in the loop below. 
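// For example (illustrative numbers): with a 256-bit widest register and a
// widest element type of 32 bits, MaxVectorSize is 8; a known trip count of
// 4 (a power of two) clamps the returned VF to 4, since a VF of 8 could
// never be filled by this loop.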
5079 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5080 << ConstTripCount << "\n"); 5081 MaxVectorSize = ConstTripCount; 5082 return MaxVectorSize; 5083 } 5084 5085 unsigned MaxVF = MaxVectorSize; 5086 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5087 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5088 // Collect all viable vectorization factors larger than the default MaxVF 5089 // (i.e. MaxVectorSize). 5090 SmallVector<unsigned, 8> VFs; 5091 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5092 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5093 VFs.push_back(VS); 5094 5095 // For each VF calculate its register usage. 5096 auto RUs = calculateRegisterUsage(VFs); 5097 5098 // Select the largest VF which doesn't require more registers than existing 5099 // ones. 5100 for (int i = RUs.size() - 1; i >= 0; --i) { 5101 bool Selected = true; 5102 for (auto& pair : RUs[i].MaxLocalUsers) { 5103 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5104 if (pair.second > TargetNumRegisters) 5105 Selected = false; 5106 } 5107 if (Selected) { 5108 MaxVF = VFs[i]; 5109 break; 5110 } 5111 } 5112 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5113 if (MaxVF < MinVF) { 5114 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5115 << ") with target's minimum: " << MinVF << '\n'); 5116 MaxVF = MinVF; 5117 } 5118 } 5119 } 5120 return MaxVF; 5121 } 5122 5123 VectorizationFactor 5124 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5125 float Cost = expectedCost(1).first; 5126 const float ScalarCost = Cost; 5127 unsigned Width = 1; 5128 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5129 5130 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5131 if (ForceVectorization && MaxVF > 1) { 5132 // Ignore scalar width, because the user explicitly wants vectorization. 5133 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5134 // evaluation. 5135 Cost = std::numeric_limits<float>::max(); 5136 } 5137 5138 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5139 // Notice that the vector loop needs to be executed less times, so 5140 // we need to divide the cost of the vector loops by the width of 5141 // the vector elements. 
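// For example (illustrative numbers): if the scalar loop costs 8 per
// iteration and the VF = 4 body costs 20, the per-lane cost is 20 / 4 = 5,
// which beats the scalar cost of 8, so VF = 4 is preferred over VF = 1
// unless an even wider factor is cheaper still.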
5142 VectorizationCostTy C = expectedCost(i); 5143 float VectorCost = C.first / (float)i; 5144 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5145 << " costs: " << (int)VectorCost << ".\n"); 5146 if (!C.second && !ForceVectorization) { 5147 LLVM_DEBUG( 5148 dbgs() << "LV: Not considering vector loop of width " << i 5149 << " because it will not generate any vector instructions.\n"); 5150 continue; 5151 } 5152 if (VectorCost < Cost) { 5153 Cost = VectorCost; 5154 Width = i; 5155 } 5156 } 5157 5158 if (!EnableCondStoresVectorization && NumPredStores) { 5159 reportVectorizationFailure("There are conditional stores.", 5160 "store that is conditionally executed prevents vectorization", 5161 "ConditionalStore", ORE, TheLoop); 5162 Width = 1; 5163 Cost = ScalarCost; 5164 } 5165 5166 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5167 << "LV: Vectorization seems to be not beneficial, " 5168 << "but was forced by a user.\n"); 5169 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5170 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5171 return Factor; 5172 } 5173 5174 std::pair<unsigned, unsigned> 5175 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5176 unsigned MinWidth = -1U; 5177 unsigned MaxWidth = 8; 5178 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5179 5180 // For each block. 5181 for (BasicBlock *BB : TheLoop->blocks()) { 5182 // For each instruction in the loop. 5183 for (Instruction &I : BB->instructionsWithoutDebug()) { 5184 Type *T = I.getType(); 5185 5186 // Skip ignored values. 5187 if (ValuesToIgnore.count(&I)) 5188 continue; 5189 5190 // Only examine Loads, Stores and PHINodes. 5191 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5192 continue; 5193 5194 // Examine PHI nodes that are reduction variables. Update the type to 5195 // account for the recurrence type. 5196 if (auto *PN = dyn_cast<PHINode>(&I)) { 5197 if (!Legal->isReductionVariable(PN)) 5198 continue; 5199 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5200 T = RdxDesc.getRecurrenceType(); 5201 } 5202 5203 // Examine the stored values. 5204 if (auto *ST = dyn_cast<StoreInst>(&I)) 5205 T = ST->getValueOperand()->getType(); 5206 5207 // Ignore loaded pointer types and stored pointer types that are not 5208 // vectorizable. 5209 // 5210 // FIXME: The check here attempts to predict whether a load or store will 5211 // be vectorized. We only know this for certain after a VF has 5212 // been selected. Here, we assume that if an access can be 5213 // vectorized, it will be. We should also look at extending this 5214 // optimization to non-pointer types. 5215 // 5216 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5217 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5218 continue; 5219 5220 MinWidth = std::min(MinWidth, 5221 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5222 MaxWidth = std::max(MaxWidth, 5223 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5224 } 5225 } 5226 5227 return {MinWidth, MaxWidth}; 5228 } 5229 5230 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5231 unsigned LoopCost) { 5232 // -- The interleave heuristics -- 5233 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5234 // There are many micro-architectural considerations that we can't predict 5235 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5236 // code size, or the number and capabilities of the execution ports. 5237 // 5238 // We use the following heuristics to select the interleave count: 5239 // 1. If the code has reductions, then we interleave to break the cross 5240 // iteration dependency. 5241 // 2. If the loop is really small, then we interleave to reduce the loop 5242 // overhead. 5243 // 3. We don't interleave if we think that we will spill registers to memory 5244 // due to the increased register pressure. 5245 5246 if (!isScalarEpilogueAllowed()) 5247 return 1; 5248 5249 // We used the distance for the interleave count. 5250 if (Legal->getMaxSafeDepDistBytes() != -1U) 5251 return 1; 5252 5253 // Do not interleave loops with a relatively small known or estimated trip 5254 // count. 5255 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5256 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5257 return 1; 5258 5259 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5260 // We divide by these constants so assume that we have at least one 5261 // instruction that uses at least one register. 5262 for (auto& pair : R.MaxLocalUsers) { 5263 pair.second = std::max(pair.second, 1U); 5264 } 5265 5266 // We calculate the interleave count using the following formula. 5267 // Subtract the number of loop invariants from the number of available 5268 // registers. These registers are used by all of the interleaved instances. 5269 // Next, divide the remaining registers by the number of registers that is 5270 // required by the loop, in order to estimate how many parallel instances 5271 // fit without causing spills. All of this is rounded down if necessary to be 5272 // a power of two. We want power of two interleave count to simplify any 5273 // addressing operations or alignment considerations. 5274 // We also want power of two interleave counts to ensure that the induction 5275 // variable of the vector loop wraps to zero, when tail is folded by masking; 5276 // this currently happens when OptForSize, in which case IC is set to 1 above. 5277 unsigned IC = UINT_MAX; 5278 5279 for (auto& pair : R.MaxLocalUsers) { 5280 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5281 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5282 << " registers of " 5283 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5284 if (VF == 1) { 5285 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5286 TargetNumRegisters = ForceTargetNumScalarRegs; 5287 } else { 5288 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5289 TargetNumRegisters = ForceTargetNumVectorRegs; 5290 } 5291 unsigned MaxLocalUsers = pair.second; 5292 unsigned LoopInvariantRegs = 0; 5293 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5294 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5295 5296 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5297 // Don't count the induction variable as interleaved. 5298 if (EnableIndVarRegisterHeur) { 5299 TmpIC = 5300 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5301 std::max(1U, (MaxLocalUsers - 1))); 5302 } 5303 5304 IC = std::min(IC, TmpIC); 5305 } 5306 5307 // Clamp the interleave ranges to reasonable counts. 5308 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5309 5310 // Check if the user has overridden the max. 
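// (Illustrative numbers for the register heuristic above: with 32 registers
// in a class, 2 of them tied up by loop-invariant values and at most 5 local
// users, the induction-aware formula gives
// PowerOf2Floor((32 - 2 - 1) / (5 - 1)) = 4 as that class's candidate
// interleave count, before the clamping below is applied.)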
5311 if (VF == 1) { 5312 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5313 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5314 } else { 5315 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5316 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5317 } 5318 5319 // If trip count is known or estimated compile time constant, limit the 5320 // interleave count to be less than the trip count divided by VF. 5321 if (BestKnownTC) { 5322 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5323 } 5324 5325 // If we did not calculate the cost for VF (because the user selected the VF) 5326 // then we calculate the cost of VF here. 5327 if (LoopCost == 0) 5328 LoopCost = expectedCost(VF).first; 5329 5330 assert(LoopCost && "Non-zero loop cost expected"); 5331 5332 // Clamp the calculated IC to be between the 1 and the max interleave count 5333 // that the target and trip count allows. 5334 if (IC > MaxInterleaveCount) 5335 IC = MaxInterleaveCount; 5336 else if (IC < 1) 5337 IC = 1; 5338 5339 // Interleave if we vectorized this loop and there is a reduction that could 5340 // benefit from interleaving. 5341 if (VF > 1 && !Legal->getReductionVars().empty()) { 5342 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5343 return IC; 5344 } 5345 5346 // Note that if we've already vectorized the loop we will have done the 5347 // runtime check and so interleaving won't require further checks. 5348 bool InterleavingRequiresRuntimePointerCheck = 5349 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5350 5351 // We want to interleave small loops in order to reduce the loop overhead and 5352 // potentially expose ILP opportunities. 5353 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5354 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5355 // We assume that the cost overhead is 1 and we use the cost model 5356 // to estimate the cost of the loop and interleave until the cost of the 5357 // loop overhead is about 5% of the cost of the loop. 5358 unsigned SmallIC = 5359 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5360 5361 // Interleave until store/load ports (estimated by max interleave count) are 5362 // saturated. 5363 unsigned NumStores = Legal->getNumStores(); 5364 unsigned NumLoads = Legal->getNumLoads(); 5365 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5366 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5367 5368 // If we have a scalar reduction (vector reductions are already dealt with 5369 // by this point), we can increase the critical path length if the loop 5370 // we're interleaving is inside another loop. Limit, by default to 2, so the 5371 // critical path only gets increased by one reduction operation. 
5372 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5373 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5374 SmallIC = std::min(SmallIC, F);
5375 StoresIC = std::min(StoresIC, F);
5376 LoadsIC = std::min(LoadsIC, F);
5377 }
5378
5379 if (EnableLoadStoreRuntimeInterleave &&
5380 std::max(StoresIC, LoadsIC) > SmallIC) {
5381 LLVM_DEBUG(
5382 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5383 return std::max(StoresIC, LoadsIC);
5384 }
5385
5386 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5387 return SmallIC;
5388 }
5389
5390 // Interleave if this is a large loop (small loops are already dealt with by
5391 // this point) that could benefit from interleaving.
5392 bool HasReductions = !Legal->getReductionVars().empty();
5393 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5394 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5395 return IC;
5396 }
5397
5398 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5399 return 1;
5400 }
5401
5402 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5403 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5404 // This function calculates the register usage by measuring the highest number
5405 // of values that are alive at a single location. Obviously, this is a very
5406 // rough estimation. We scan the loop in topological order and
5407 // assign a number to each instruction. We use RPO to ensure that defs are
5408 // met before their users. We assume that each instruction that has in-loop
5409 // users starts an interval. We record every time that an in-loop value is
5410 // used, so we have a list of the first and last occurrences of each
5411 // instruction. Next, we transpose this data structure into a multi map that
5412 // holds the list of intervals that *end* at a specific location. This multi
5413 // map allows us to perform a linear search. We scan the instructions linearly
5414 // and record each time that a new interval starts, by placing it in a set.
5415 // If we find this value in the multi-map then we remove it from the set.
5416 // The max register usage is the maximum size of the set.
5417 // We also search for instructions that are defined outside the loop, but are
5418 // used inside the loop. We need this number separately from the max-interval
5419 // usage number because when we unroll, loop-invariant values do not take
5420 // more registers.
5421 LoopBlocksDFS DFS(TheLoop);
5422 DFS.perform(LI);
5423
5424 RegisterUsage RU;
5425
5426 // Each 'key' in the map opens a new interval. The values
5427 // of the map are the index of the 'last seen' usage of the
5428 // instruction that is the key.
5429 using IntervalMap = DenseMap<Instruction *, unsigned>;
5430
5431 // Maps instruction to its index.
5432 SmallVector<Instruction *, 64> IdxToInstr;
5433 // Marks the end of each interval.
5434 IntervalMap EndPoint;
5435 // Saves the list of instruction indices that are used in the loop.
5436 SmallPtrSet<Instruction *, 8> Ends;
5437 // Saves the list of values that are used in the loop but are
5438 // defined outside the loop, such as arguments and constants.
5439 SmallPtrSet<Value *, 8> LoopInvariants;
5440
5441 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5442 for (Instruction &I : BB->instructionsWithoutDebug()) {
5443 IdxToInstr.push_back(&I);
5444
5445 // Save the end location of each USE.
5446 for (Value *U : I.operands()) { 5447 auto *Instr = dyn_cast<Instruction>(U); 5448 5449 // Ignore non-instruction values such as arguments, constants, etc. 5450 if (!Instr) 5451 continue; 5452 5453 // If this instruction is outside the loop then record it and continue. 5454 if (!TheLoop->contains(Instr)) { 5455 LoopInvariants.insert(Instr); 5456 continue; 5457 } 5458 5459 // Overwrite previous end points. 5460 EndPoint[Instr] = IdxToInstr.size(); 5461 Ends.insert(Instr); 5462 } 5463 } 5464 } 5465 5466 // Saves the list of intervals that end with the index in 'key'. 5467 using InstrList = SmallVector<Instruction *, 2>; 5468 DenseMap<unsigned, InstrList> TransposeEnds; 5469 5470 // Transpose the EndPoints to a list of values that end at each index. 5471 for (auto &Interval : EndPoint) 5472 TransposeEnds[Interval.second].push_back(Interval.first); 5473 5474 SmallPtrSet<Instruction *, 8> OpenIntervals; 5475 5476 // Get the size of the widest register. 5477 unsigned MaxSafeDepDist = -1U; 5478 if (Legal->getMaxSafeDepDistBytes() != -1U) 5479 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5480 unsigned WidestRegister = 5481 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5482 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5483 5484 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5485 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5486 5487 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5488 5489 // A lambda that gets the register usage for the given type and VF. 5490 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5491 if (Ty->isTokenTy()) 5492 return 0U; 5493 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5494 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5495 }; 5496 5497 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5498 Instruction *I = IdxToInstr[i]; 5499 5500 // Remove all of the instructions that end at this location. 5501 InstrList &List = TransposeEnds[i]; 5502 for (Instruction *ToRemove : List) 5503 OpenIntervals.erase(ToRemove); 5504 5505 // Ignore instructions that are never used within the loop. 5506 if (!Ends.count(I)) 5507 continue; 5508 5509 // Skip ignored values. 5510 if (ValuesToIgnore.count(I)) 5511 continue; 5512 5513 // For each VF find the maximum usage of registers. 5514 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5515 // Count the number of live intervals. 5516 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5517 5518 if (VFs[j] == 1) { 5519 for (auto Inst : OpenIntervals) { 5520 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5521 if (RegUsage.find(ClassID) == RegUsage.end()) 5522 RegUsage[ClassID] = 1; 5523 else 5524 RegUsage[ClassID] += 1; 5525 } 5526 } else { 5527 collectUniformsAndScalars(VFs[j]); 5528 for (auto Inst : OpenIntervals) { 5529 // Skip ignored values for VF > 1. 
5530 if (VecValuesToIgnore.count(Inst)) 5531 continue; 5532 if (isScalarAfterVectorization(Inst, VFs[j])) { 5533 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5534 if (RegUsage.find(ClassID) == RegUsage.end()) 5535 RegUsage[ClassID] = 1; 5536 else 5537 RegUsage[ClassID] += 1; 5538 } else { 5539 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5540 if (RegUsage.find(ClassID) == RegUsage.end()) 5541 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5542 else 5543 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5544 } 5545 } 5546 } 5547 5548 for (auto& pair : RegUsage) { 5549 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5550 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5551 else 5552 MaxUsages[j][pair.first] = pair.second; 5553 } 5554 } 5555 5556 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5557 << OpenIntervals.size() << '\n'); 5558 5559 // Add the current instruction to the list of open intervals. 5560 OpenIntervals.insert(I); 5561 } 5562 5563 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5564 SmallMapVector<unsigned, unsigned, 4> Invariant; 5565 5566 for (auto Inst : LoopInvariants) { 5567 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5568 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5569 if (Invariant.find(ClassID) == Invariant.end()) 5570 Invariant[ClassID] = Usage; 5571 else 5572 Invariant[ClassID] += Usage; 5573 } 5574 5575 LLVM_DEBUG({ 5576 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5577 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5578 << " item\n"; 5579 for (const auto &pair : MaxUsages[i]) { 5580 dbgs() << "LV(REG): RegisterClass: " 5581 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5582 << " registers\n"; 5583 } 5584 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5585 << " item\n"; 5586 for (const auto &pair : Invariant) { 5587 dbgs() << "LV(REG): RegisterClass: " 5588 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5589 << " registers\n"; 5590 } 5591 }); 5592 5593 RU.LoopInvariantRegs = Invariant; 5594 RU.MaxLocalUsers = MaxUsages[i]; 5595 RUs[i] = RU; 5596 } 5597 5598 return RUs; 5599 } 5600 5601 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5602 // TODO: Cost model for emulated masked load/store is completely 5603 // broken. This hack guides the cost model to use an artificially 5604 // high enough value to practically disable vectorization with such 5605 // operations, except where previously deployed legality hack allowed 5606 // using very low cost values. This is to avoid regressions coming simply 5607 // from moving "masked load/store" check from legality to cost model. 5608 // Masked Load/Gather emulation was previously never allowed. 5609 // Limited number of Masked Store/Scatter emulation was allowed. 5610 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5611 return isa<LoadInst>(I) || 5612 (isa<StoreInst>(I) && 5613 NumPredStores > NumberOfStoresToPredicate); 5614 } 5615 5616 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5617 // If we aren't vectorizing the loop, or if we've already collected the 5618 // instructions to scalarize, there's nothing to do. Collection may already 5619 // have occurred if we have a user-selected VF and are now computing the 5620 // expected cost for interleaving. 
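// (Note that VF == 1 also takes the early exit below: when the loop stays
// scalar there is nothing to scalarize.)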
5621 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5622 return;
5623
5624 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5625 // not profitable to scalarize any instructions, the presence of VF in the
5626 // map will indicate that we've analyzed it already.
5627 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5628
5629 // Find all the instructions that are scalar with predication in the loop and
5630 // determine if it would be better to not if-convert the blocks they are in.
5631 // If so, we also record the instructions to scalarize.
5632 for (BasicBlock *BB : TheLoop->blocks()) {
5633 if (!blockNeedsPredication(BB))
5634 continue;
5635 for (Instruction &I : *BB)
5636 if (isScalarWithPredication(&I)) {
5637 ScalarCostsTy ScalarCosts;
5638 // Do not apply discount logic if hacked cost is needed
5639 // for emulated masked memrefs.
5640 if (!useEmulatedMaskMemRefHack(&I) &&
5641 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5642 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5643 // Remember that BB will remain after vectorization.
5644 PredicatedBBsAfterVectorization.insert(BB);
5645 }
5646 }
5647 }
5648
5649 int LoopVectorizationCostModel::computePredInstDiscount(
5650 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5651 unsigned VF) {
5652 assert(!isUniformAfterVectorization(PredInst, VF) &&
5653 "Instruction marked uniform-after-vectorization will be predicated");
5654
5655 // Initialize the discount to zero, meaning that the scalar version and the
5656 // vector version cost the same.
5657 int Discount = 0;
5658
5659 // Holds instructions to analyze. The instructions we visit are mapped in
5660 // ScalarCosts. Those instructions are the ones that would be scalarized if
5661 // we find that the scalar version costs less.
5662 SmallVector<Instruction *, 8> Worklist;
5663
5664 // Returns true if the given instruction can be scalarized.
5665 auto canBeScalarized = [&](Instruction *I) -> bool {
5666 // We only attempt to scalarize instructions forming a single-use chain
5667 // from the original predicated block that would otherwise be vectorized.
5668 // Although not strictly necessary, we give up on instructions we know will
5669 // already be scalar to avoid traversing chains that are unlikely to be
5670 // beneficial.
5671 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5672 isScalarAfterVectorization(I, VF))
5673 return false;
5674
5675 // If the instruction is scalar with predication, it will be analyzed
5676 // separately. We ignore it within the context of PredInst.
5677 if (isScalarWithPredication(I))
5678 return false;
5679
5680 // If any of the instruction's operands are uniform after vectorization,
5681 // the instruction cannot be scalarized. This prevents, for example, a
5682 // masked load from being scalarized.
5683 //
5684 // We assume we will only emit a value for lane zero of an instruction
5685 // marked uniform after vectorization, rather than VF identical values.
5686 // Thus, if we scalarize an instruction that uses a uniform, we would
5687 // create uses of values corresponding to the lanes we aren't emitting code
5688 // for. This behavior can be changed by allowing getScalarValue to clone
5689 // the lane zero values for uniforms rather than asserting.
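// (This is why the operand scan below gives up on the chain as soon as it
// sees a uniform-after-vectorization operand.)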
5690 for (Use &U : I->operands()) 5691 if (auto *J = dyn_cast<Instruction>(U.get())) 5692 if (isUniformAfterVectorization(J, VF)) 5693 return false; 5694 5695 // Otherwise, we can scalarize the instruction. 5696 return true; 5697 }; 5698 5699 // Compute the expected cost discount from scalarizing the entire expression 5700 // feeding the predicated instruction. We currently only consider expressions 5701 // that are single-use instruction chains. 5702 Worklist.push_back(PredInst); 5703 while (!Worklist.empty()) { 5704 Instruction *I = Worklist.pop_back_val(); 5705 5706 // If we've already analyzed the instruction, there's nothing to do. 5707 if (ScalarCosts.find(I) != ScalarCosts.end()) 5708 continue; 5709 5710 // Compute the cost of the vector instruction. Note that this cost already 5711 // includes the scalarization overhead of the predicated instruction. 5712 unsigned VectorCost = getInstructionCost(I, VF).first; 5713 5714 // Compute the cost of the scalarized instruction. This cost is the cost of 5715 // the instruction as if it wasn't if-converted and instead remained in the 5716 // predicated block. We will scale this cost by block probability after 5717 // computing the scalarization overhead. 5718 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5719 5720 // Compute the scalarization overhead of needed insertelement instructions 5721 // and phi nodes. 5722 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5723 ScalarCost += TTI.getScalarizationOverhead( 5724 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5725 APInt::getAllOnesValue(VF), true, false); 5726 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, 5727 TTI::TCK_RecipThroughput); 5728 } 5729 5730 // Compute the scalarization overhead of needed extractelement 5731 // instructions. For each of the instruction's operands, if the operand can 5732 // be scalarized, add it to the worklist; otherwise, account for the 5733 // overhead. 5734 for (Use &U : I->operands()) 5735 if (auto *J = dyn_cast<Instruction>(U.get())) { 5736 assert(VectorType::isValidElementType(J->getType()) && 5737 "Instruction has non-scalar type"); 5738 if (canBeScalarized(J)) 5739 Worklist.push_back(J); 5740 else if (needsExtract(J, VF)) 5741 ScalarCost += TTI.getScalarizationOverhead( 5742 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5743 APInt::getAllOnesValue(VF), false, true); 5744 } 5745 5746 // Scale the total scalar cost by block probability. 5747 ScalarCost /= getReciprocalPredBlockProb(); 5748 5749 // Compute the discount. A non-negative discount means the vector version 5750 // of the instruction costs more, and scalarizing would be beneficial. 5751 Discount += VectorCost - ScalarCost; 5752 ScalarCosts[I] = ScalarCost; 5753 } 5754 5755 return Discount; 5756 } 5757 5758 LoopVectorizationCostModel::VectorizationCostTy 5759 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5760 VectorizationCostTy Cost; 5761 5762 // For each block. 5763 for (BasicBlock *BB : TheLoop->blocks()) { 5764 VectorizationCostTy BlockCost; 5765 5766 // For each instruction in the old loop. 5767 for (Instruction &I : BB->instructionsWithoutDebug()) { 5768 // Skip ignored values. 5769 if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) 5770 continue; 5771 5772 VectorizationCostTy C = getInstructionCost(&I, VF); 5773 5774 // Check if we should override the cost. 
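// (When the ForceTargetInstructionCost option has been given on the command
// line, every per-instruction estimate is replaced by that fixed value below;
// presumably this is mainly useful for testing the cost-model decisions.)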
5775 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5776 C.first = ForceTargetInstructionCost; 5777 5778 BlockCost.first += C.first; 5779 BlockCost.second |= C.second; 5780 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5781 << " for VF " << VF << " For instruction: " << I 5782 << '\n'); 5783 } 5784 5785 // If we are vectorizing a predicated block, it will have been 5786 // if-converted. This means that the block's instructions (aside from 5787 // stores and instructions that may divide by zero) will now be 5788 // unconditionally executed. For the scalar case, we may not always execute 5789 // the predicated block. Thus, scale the block's cost by the probability of 5790 // executing it. 5791 if (VF == 1 && blockNeedsPredication(BB)) 5792 BlockCost.first /= getReciprocalPredBlockProb(); 5793 5794 Cost.first += BlockCost.first; 5795 Cost.second |= BlockCost.second; 5796 } 5797 5798 return Cost; 5799 } 5800 5801 /// Gets Address Access SCEV after verifying that the access pattern 5802 /// is loop invariant except the induction variable dependence. 5803 /// 5804 /// This SCEV can be sent to the Target in order to estimate the address 5805 /// calculation cost. 5806 static const SCEV *getAddressAccessSCEV( 5807 Value *Ptr, 5808 LoopVectorizationLegality *Legal, 5809 PredicatedScalarEvolution &PSE, 5810 const Loop *TheLoop) { 5811 5812 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5813 if (!Gep) 5814 return nullptr; 5815 5816 // We are looking for a gep with all loop invariant indices except for one 5817 // which should be an induction variable. 5818 auto SE = PSE.getSE(); 5819 unsigned NumOperands = Gep->getNumOperands(); 5820 for (unsigned i = 1; i < NumOperands; ++i) { 5821 Value *Opd = Gep->getOperand(i); 5822 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5823 !Legal->isInductionVariable(Opd)) 5824 return nullptr; 5825 } 5826 5827 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5828 return PSE.getSCEV(Ptr); 5829 } 5830 5831 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5832 return Legal->hasStride(I->getOperand(0)) || 5833 Legal->hasStride(I->getOperand(1)); 5834 } 5835 5836 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5837 unsigned VF) { 5838 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5839 Type *ValTy = getMemInstValueType(I); 5840 auto SE = PSE.getSE(); 5841 5842 unsigned AS = getLoadStoreAddressSpace(I); 5843 Value *Ptr = getLoadStorePointerOperand(I); 5844 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5845 5846 // Figure out whether the access is strided and get the stride value 5847 // if it's known in compile time 5848 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5849 5850 // Get the cost of the scalar memory instruction and address computation. 5851 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5852 5853 // Don't pass *I here, since it is scalar but will actually be part of a 5854 // vectorized loop where the user of it is a vectorized instruction. 5855 const Align Alignment = getLoadStoreAlignment(I); 5856 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5857 Alignment, AS, 5858 TTI::TCK_RecipThroughput); 5859 5860 // Get the overhead of the extractelement and insertelement instructions 5861 // we might create due to scalarization. 
5862 Cost += getScalarizationOverhead(I, VF); 5863 5864 // If we have a predicated store, it may not be executed for each vector 5865 // lane. Scale the cost by the probability of executing the predicated 5866 // block. 5867 if (isPredicatedInst(I)) { 5868 Cost /= getReciprocalPredBlockProb(); 5869 5870 if (useEmulatedMaskMemRefHack(I)) 5871 // Artificially setting to a high enough value to practically disable 5872 // vectorization with such operations. 5873 Cost = 3000000; 5874 } 5875 5876 return Cost; 5877 } 5878 5879 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5880 unsigned VF) { 5881 Type *ValTy = getMemInstValueType(I); 5882 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5883 Value *Ptr = getLoadStorePointerOperand(I); 5884 unsigned AS = getLoadStoreAddressSpace(I); 5885 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5886 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5887 5888 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5889 "Stride should be 1 or -1 for consecutive memory access"); 5890 const Align Alignment = getLoadStoreAlignment(I); 5891 unsigned Cost = 0; 5892 if (Legal->isMaskRequired(I)) 5893 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5894 Alignment.value(), AS, CostKind); 5895 else 5896 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5897 CostKind, I); 5898 5899 bool Reverse = ConsecutiveStride < 0; 5900 if (Reverse) 5901 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5902 return Cost; 5903 } 5904 5905 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5906 unsigned VF) { 5907 Type *ValTy = getMemInstValueType(I); 5908 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5909 const Align Alignment = getLoadStoreAlignment(I); 5910 unsigned AS = getLoadStoreAddressSpace(I); 5911 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5912 if (isa<LoadInst>(I)) { 5913 return TTI.getAddressComputationCost(ValTy) + 5914 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5915 CostKind) + 5916 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5917 } 5918 StoreInst *SI = cast<StoreInst>(I); 5919 5920 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5921 return TTI.getAddressComputationCost(ValTy) + 5922 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5923 CostKind) + 5924 (isLoopInvariantStoreValue 5925 ? 
0 5926 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5927 VF - 1)); 5928 } 5929 5930 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5931 unsigned VF) { 5932 Type *ValTy = getMemInstValueType(I); 5933 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5934 const Align Alignment = getLoadStoreAlignment(I); 5935 Value *Ptr = getLoadStorePointerOperand(I); 5936 5937 return TTI.getAddressComputationCost(VectorTy) + 5938 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5939 Legal->isMaskRequired(I), Alignment.value(), 5940 TargetTransformInfo::TCK_RecipThroughput, 5941 I); 5942 } 5943 5944 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5945 unsigned VF) { 5946 Type *ValTy = getMemInstValueType(I); 5947 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5948 unsigned AS = getLoadStoreAddressSpace(I); 5949 5950 auto Group = getInterleavedAccessGroup(I); 5951 assert(Group && "Fail to get an interleaved access group."); 5952 5953 unsigned InterleaveFactor = Group->getFactor(); 5954 auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); 5955 5956 // Holds the indices of existing members in an interleaved load group. 5957 // An interleaved store group doesn't need this as it doesn't allow gaps. 5958 SmallVector<unsigned, 4> Indices; 5959 if (isa<LoadInst>(I)) { 5960 for (unsigned i = 0; i < InterleaveFactor; i++) 5961 if (Group->getMember(i)) 5962 Indices.push_back(i); 5963 } 5964 5965 // Calculate the cost of the whole interleaved group. 5966 bool UseMaskForGaps = 5967 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5968 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5969 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5970 Group->getAlign().value(), AS, TTI::TCK_RecipThroughput, 5971 Legal->isMaskRequired(I), UseMaskForGaps); 5972 5973 if (Group->isReverse()) { 5974 // TODO: Add support for reversed masked interleaved access. 5975 assert(!Legal->isMaskRequired(I) && 5976 "Reverse masked interleaved access not supported."); 5977 Cost += Group->getNumMembers() * 5978 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5979 } 5980 return Cost; 5981 } 5982 5983 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5984 unsigned VF) { 5985 // Calculate scalar cost only. Vectorization cost should be ready at this 5986 // moment. 5987 if (VF == 1) { 5988 Type *ValTy = getMemInstValueType(I); 5989 const Align Alignment = getLoadStoreAlignment(I); 5990 unsigned AS = getLoadStoreAddressSpace(I); 5991 5992 return TTI.getAddressComputationCost(ValTy) + 5993 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 5994 TTI::TCK_RecipThroughput, I); 5995 } 5996 return getWideningCost(I, VF); 5997 } 5998 5999 LoopVectorizationCostModel::VectorizationCostTy 6000 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 6001 // If we know that this instruction will remain uniform, check the cost of 6002 // the scalar version. 6003 if (isUniformAfterVectorization(I, VF)) 6004 VF = 1; 6005 6006 if (VF > 1 && isProfitableToScalarize(I, VF)) 6007 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6008 6009 // Forced scalars do not have any scalarization overhead. 
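// (Their cost is simply VF copies of the scalar cost, as computed below.)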
6010 auto ForcedScalar = ForcedScalars.find(VF); 6011 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6012 auto InstSet = ForcedScalar->second; 6013 if (InstSet.count(I)) 6014 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6015 } 6016 6017 Type *VectorTy; 6018 unsigned C = getInstructionCost(I, VF, VectorTy); 6019 6020 bool TypeNotScalarized = 6021 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6022 return VectorizationCostTy(C, TypeNotScalarized); 6023 } 6024 6025 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6026 unsigned VF) { 6027 6028 if (VF == 1) 6029 return 0; 6030 6031 unsigned Cost = 0; 6032 Type *RetTy = ToVectorTy(I->getType(), VF); 6033 if (!RetTy->isVoidTy() && 6034 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6035 Cost += TTI.getScalarizationOverhead( 6036 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6037 6038 // Some targets keep addresses scalar. 6039 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6040 return Cost; 6041 6042 // Some targets support efficient element stores. 6043 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6044 return Cost; 6045 6046 // Collect operands to consider. 6047 CallInst *CI = dyn_cast<CallInst>(I); 6048 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6049 6050 // Skip operands that do not require extraction/scalarization and do not incur 6051 // any overhead. 6052 return Cost + TTI.getOperandsScalarizationOverhead( 6053 filterExtractingOperands(Ops, VF), VF); 6054 } 6055 6056 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6057 if (VF == 1) 6058 return; 6059 NumPredStores = 0; 6060 for (BasicBlock *BB : TheLoop->blocks()) { 6061 // For each instruction in the old loop. 6062 for (Instruction &I : *BB) { 6063 Value *Ptr = getLoadStorePointerOperand(&I); 6064 if (!Ptr) 6065 continue; 6066 6067 // TODO: We should generate better code and update the cost model for 6068 // predicated uniform stores. Today they are treated as any other 6069 // predicated store (see added test cases in 6070 // invariant-store-vectorization.ll). 6071 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6072 NumPredStores++; 6073 6074 if (Legal->isUniform(Ptr) && 6075 // Conditional loads and stores should be scalarized and predicated. 6076 // isScalarWithPredication cannot be used here since masked 6077 // gather/scatters are not considered scalar with predication. 6078 !Legal->blockNeedsPredication(I.getParent())) { 6079 // TODO: Avoid replicating loads and stores instead of 6080 // relying on instcombine to remove them. 6081 // Load: Scalar load + broadcast 6082 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6083 unsigned Cost = getUniformMemOpCost(&I, VF); 6084 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6085 continue; 6086 } 6087 6088 // We assume that widening is the best solution when possible. 6089 if (memoryInstructionCanBeWidened(&I, VF)) { 6090 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6091 int ConsecutiveStride = 6092 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6093 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6094 "Expected consecutive stride."); 6095 InstWidening Decision = 6096 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6097 setWideningDecision(&I, VF, Decision, Cost); 6098 continue; 6099 } 6100 6101 // Choose between Interleaving, Gather/Scatter or Scalarization. 
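// (The cheapest of the three candidate costs computed below wins, with a tie
// between interleaving and gather/scatter resolved in favor of interleaving.)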
6102 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6103 unsigned NumAccesses = 1;
6104 if (isAccessInterleaved(&I)) {
6105 auto Group = getInterleavedAccessGroup(&I);
6106 assert(Group && "Fail to get an interleaved access group.");
6107
6108 // Make one decision for the whole group.
6109 if (getWideningDecision(&I, VF) != CM_Unknown)
6110 continue;
6111
6112 NumAccesses = Group->getNumMembers();
6113 if (interleavedAccessCanBeWidened(&I, VF))
6114 InterleaveCost = getInterleaveGroupCost(&I, VF);
6115 }
6116
6117 unsigned GatherScatterCost =
6118 isLegalGatherOrScatter(&I)
6119 ? getGatherScatterCost(&I, VF) * NumAccesses
6120 : std::numeric_limits<unsigned>::max();
6121
6122 unsigned ScalarizationCost =
6123 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6124
6125 // Choose the best option for the current VF, record the decision, and
6126 // use it during vectorization.
6127 unsigned Cost;
6128 InstWidening Decision;
6129 if (InterleaveCost <= GatherScatterCost &&
6130 InterleaveCost < ScalarizationCost) {
6131 Decision = CM_Interleave;
6132 Cost = InterleaveCost;
6133 } else if (GatherScatterCost < ScalarizationCost) {
6134 Decision = CM_GatherScatter;
6135 Cost = GatherScatterCost;
6136 } else {
6137 Decision = CM_Scalarize;
6138 Cost = ScalarizationCost;
6139 }
6140 // If the instruction belongs to an interleave group, the whole group
6141 // receives the same decision. The whole group receives the cost, but
6142 // the cost will actually be assigned to one instruction.
6143 if (auto Group = getInterleavedAccessGroup(&I))
6144 setWideningDecision(Group, VF, Decision, Cost);
6145 else
6146 setWideningDecision(&I, VF, Decision, Cost);
6147 }
6148 }
6149
6150 // Make sure that any load of an address and any other address computation
6151 // remains scalar unless there is gather/scatter support. This avoids
6152 // inevitable extracts into address registers, and also has the benefit of
6153 // activating LSR more, since that pass can't optimize vectorized
6154 // addresses.
6155 if (TTI.prefersVectorizedAddressing())
6156 return;
6157
6158 // Start with all scalar pointer uses.
6159 SmallPtrSet<Instruction *, 8> AddrDefs;
6160 for (BasicBlock *BB : TheLoop->blocks())
6161 for (Instruction &I : *BB) {
6162 Instruction *PtrDef =
6163 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6164 if (PtrDef && TheLoop->contains(PtrDef) &&
6165 getWideningDecision(&I, VF) != CM_GatherScatter)
6166 AddrDefs.insert(PtrDef);
6167 }
6168
6169 // Add all instructions used to generate the addresses.
6170 SmallVector<Instruction *, 4> Worklist;
6171 for (auto *I : AddrDefs)
6172 Worklist.push_back(I);
6173 while (!Worklist.empty()) {
6174 Instruction *I = Worklist.pop_back_val();
6175 for (auto &Op : I->operands())
6176 if (auto *InstOp = dyn_cast<Instruction>(Op))
6177 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6178 AddrDefs.insert(InstOp).second)
6179 Worklist.push_back(InstOp);
6180 }
6181
6182 for (auto *I : AddrDefs) {
6183 if (isa<LoadInst>(I)) {
6184 // Setting the desired widening decision should ideally be handled
6185 // by cost functions, but since this involves the task of finding out
6186 // if the loaded register is involved in an address computation, it is
6187 // instead changed here when we know this is the case.
6188 InstWidening Decision = getWideningDecision(I, VF);
6189 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6190 // Scalarize a widened load of an address.
6191 setWideningDecision(I, VF, CM_Scalarize, 6192 (VF * getMemoryInstructionCost(I, 1))); 6193 else if (auto Group = getInterleavedAccessGroup(I)) { 6194 // Scalarize an interleave group of address loads. 6195 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6196 if (Instruction *Member = Group->getMember(I)) 6197 setWideningDecision(Member, VF, CM_Scalarize, 6198 (VF * getMemoryInstructionCost(Member, 1))); 6199 } 6200 } 6201 } else 6202 // Make sure I gets scalarized and a cost estimate without 6203 // scalarization overhead. 6204 ForcedScalars[VF].insert(I); 6205 } 6206 } 6207 6208 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6209 unsigned VF, 6210 Type *&VectorTy) { 6211 Type *RetTy = I->getType(); 6212 if (canTruncateToMinimalBitwidth(I, VF)) 6213 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6214 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6215 auto SE = PSE.getSE(); 6216 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6217 6218 // TODO: We need to estimate the cost of intrinsic calls. 6219 switch (I->getOpcode()) { 6220 case Instruction::GetElementPtr: 6221 // We mark this instruction as zero-cost because the cost of GEPs in 6222 // vectorized code depends on whether the corresponding memory instruction 6223 // is scalarized or not. Therefore, we handle GEPs with the memory 6224 // instruction cost. 6225 return 0; 6226 case Instruction::Br: { 6227 // In cases of scalarized and predicated instructions, there will be VF 6228 // predicated blocks in the vectorized loop. Each branch around these 6229 // blocks requires also an extract of its vector compare i1 element. 6230 bool ScalarPredicatedBB = false; 6231 BranchInst *BI = cast<BranchInst>(I); 6232 if (VF > 1 && BI->isConditional() && 6233 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6234 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6235 ScalarPredicatedBB = true; 6236 6237 if (ScalarPredicatedBB) { 6238 // Return cost for branches around scalarized and predicated blocks. 6239 auto *Vec_i1Ty = 6240 FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6241 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6242 false, true) + 6243 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); 6244 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6245 // The back-edge branch will remain, as will all scalar branches. 6246 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6247 else 6248 // This branch will be eliminated by if-conversion. 6249 return 0; 6250 // Note: We currently assume zero cost for an unconditional branch inside 6251 // a predicated block since it will become a fall-through, although we 6252 // may decide in the future to call TTI for all branches. 6253 } 6254 case Instruction::PHI: { 6255 auto *Phi = cast<PHINode>(I); 6256 6257 // First-order recurrences are replaced by vector shuffles inside the loop. 6258 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6259 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6260 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6261 cast<VectorType>(VectorTy), VF - 1, 6262 FixedVectorType::get(RetTy, 1)); 6263 6264 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6265 // converted into select instructions. We require N - 1 selects per phi 6266 // node, where N is the number of incoming values. 
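// Rough example of the lowering this models (value and mask names are
// hypothetical):
//   %p = phi [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
// becomes N - 1 = 2 selects for its N = 3 incoming values:
//   %s1 = select %mask2, %b, %a
//   %p  = select %mask3, %c, %s1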
6267 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6268 return (Phi->getNumIncomingValues() - 1) * 6269 TTI.getCmpSelInstrCost( 6270 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6271 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6272 CostKind); 6273 6274 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6275 } 6276 case Instruction::UDiv: 6277 case Instruction::SDiv: 6278 case Instruction::URem: 6279 case Instruction::SRem: 6280 // If we have a predicated instruction, it may not be executed for each 6281 // vector lane. Get the scalarization cost and scale this amount by the 6282 // probability of executing the predicated block. If the instruction is not 6283 // predicated, we fall through to the next case. 6284 if (VF > 1 && isScalarWithPredication(I)) { 6285 unsigned Cost = 0; 6286 6287 // These instructions have a non-void type, so account for the phi nodes 6288 // that we will create. This cost is likely to be zero. The phi node 6289 // cost, if any, should be scaled by the block probability because it 6290 // models a copy at the end of each predicated block. 6291 Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6292 6293 // The cost of the non-predicated instruction. 6294 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6295 6296 // The cost of insertelement and extractelement instructions needed for 6297 // scalarization. 6298 Cost += getScalarizationOverhead(I, VF); 6299 6300 // Scale the cost by the probability of executing the predicated blocks. 6301 // This assumes the predicated block for each vector lane is equally 6302 // likely. 6303 return Cost / getReciprocalPredBlockProb(); 6304 } 6305 LLVM_FALLTHROUGH; 6306 case Instruction::Add: 6307 case Instruction::FAdd: 6308 case Instruction::Sub: 6309 case Instruction::FSub: 6310 case Instruction::Mul: 6311 case Instruction::FMul: 6312 case Instruction::FDiv: 6313 case Instruction::FRem: 6314 case Instruction::Shl: 6315 case Instruction::LShr: 6316 case Instruction::AShr: 6317 case Instruction::And: 6318 case Instruction::Or: 6319 case Instruction::Xor: { 6320 // Since we will replace the stride by 1 the multiplication should go away. 6321 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6322 return 0; 6323 // Certain instructions can be cheaper to vectorize if they have a constant 6324 // second vector operand. One example of this are shifts on x86. 6325 Value *Op2 = I->getOperand(1); 6326 TargetTransformInfo::OperandValueProperties Op2VP; 6327 TargetTransformInfo::OperandValueKind Op2VK = 6328 TTI.getOperandInfo(Op2, Op2VP); 6329 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6330 Op2VK = TargetTransformInfo::OK_UniformValue; 6331 6332 SmallVector<const Value *, 4> Operands(I->operand_values()); 6333 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6334 return N * TTI.getArithmeticInstrCost( 6335 I->getOpcode(), VectorTy, CostKind, 6336 TargetTransformInfo::OK_AnyValue, 6337 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6338 } 6339 case Instruction::FNeg: { 6340 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6341 return N * TTI.getArithmeticInstrCost( 6342 I->getOpcode(), VectorTy, CostKind, 6343 TargetTransformInfo::OK_AnyValue, 6344 TargetTransformInfo::OK_AnyValue, 6345 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6346 I->getOperand(0), I); 6347 } 6348 case Instruction::Select: { 6349 SelectInst *SI = cast<SelectInst>(I); 6350 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6351 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6352 Type *CondTy = SI->getCondition()->getType(); 6353 if (!ScalarCond) 6354 CondTy = FixedVectorType::get(CondTy, VF); 6355 6356 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6357 CostKind, I); 6358 } 6359 case Instruction::ICmp: 6360 case Instruction::FCmp: { 6361 Type *ValTy = I->getOperand(0)->getType(); 6362 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6363 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6364 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6365 VectorTy = ToVectorTy(ValTy, VF); 6366 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6367 I); 6368 } 6369 case Instruction::Store: 6370 case Instruction::Load: { 6371 unsigned Width = VF; 6372 if (Width > 1) { 6373 InstWidening Decision = getWideningDecision(I, Width); 6374 assert(Decision != CM_Unknown && 6375 "CM decision should be taken at this point"); 6376 if (Decision == CM_Scalarize) 6377 Width = 1; 6378 } 6379 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6380 return getMemoryInstructionCost(I, VF); 6381 } 6382 case Instruction::ZExt: 6383 case Instruction::SExt: 6384 case Instruction::FPToUI: 6385 case Instruction::FPToSI: 6386 case Instruction::FPExt: 6387 case Instruction::PtrToInt: 6388 case Instruction::IntToPtr: 6389 case Instruction::SIToFP: 6390 case Instruction::UIToFP: 6391 case Instruction::Trunc: 6392 case Instruction::FPTrunc: 6393 case Instruction::BitCast: { 6394 // We optimize the truncation of induction variables having constant 6395 // integer steps. The cost of these truncations is the same as the scalar 6396 // operation. 6397 if (isOptimizableIVTruncate(I, VF)) { 6398 auto *Trunc = cast<TruncInst>(I); 6399 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6400 Trunc->getSrcTy(), CostKind, Trunc); 6401 } 6402 6403 Type *SrcScalarTy = I->getOperand(0)->getType(); 6404 Type *SrcVecTy = 6405 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6406 if (canTruncateToMinimalBitwidth(I, VF)) { 6407 // This cast is going to be shrunk. This may remove the cast or it might 6408 // turn it into slightly different cast. For example, if MinBW == 16, 6409 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6410 // 6411 // Calculate the modified src and dest types. 6412 Type *MinVecTy = VectorTy; 6413 if (I->getOpcode() == Instruction::Trunc) { 6414 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6415 VectorTy = 6416 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6417 } else if (I->getOpcode() == Instruction::ZExt || 6418 I->getOpcode() == Instruction::SExt) { 6419 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6420 VectorTy = 6421 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6422 } 6423 } 6424 6425 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6426 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, 6427 CostKind, I); 6428 } 6429 case Instruction::Call: { 6430 bool NeedToScalarize; 6431 CallInst *CI = cast<CallInst>(I); 6432 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6433 if (getVectorIntrinsicIDForCall(CI, TLI)) 6434 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6435 return CallCost; 6436 } 6437 default: 6438 // The cost of executing VF copies of the scalar instruction. This opcode 6439 // is unknown. Assume that it is the same as 'mul'. 6440 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6441 CostKind) + 6442 getScalarizationOverhead(I, VF); 6443 } // end of switch. 6444 } 6445 6446 char LoopVectorize::ID = 0; 6447 6448 static const char lv_name[] = "Loop Vectorization"; 6449 6450 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6451 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6452 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6453 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6454 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6455 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6456 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6457 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6458 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6459 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6460 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6461 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6462 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6463 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6464 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6465 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6466 6467 namespace llvm { 6468 6469 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6470 6471 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6472 bool VectorizeOnlyWhenForced) { 6473 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6474 } 6475 6476 } // end namespace llvm 6477 6478 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6479 // Check if the pointer operand of a load or store instruction is 6480 // consecutive. 6481 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6482 return Legal->isConsecutivePtr(Ptr); 6483 return false; 6484 } 6485 6486 void LoopVectorizationCostModel::collectValuesToIgnore() { 6487 // Ignore ephemeral values. 6488 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6489 6490 // Ignore type-promoting instructions we identified during reduction 6491 // detection. 6492 for (auto &Reduction : Legal->getReductionVars()) { 6493 RecurrenceDescriptor &RedDes = Reduction.second; 6494 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6495 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6496 } 6497 // Ignore type-casting instructions we identified during induction 6498 // detection. 6499 for (auto &Induction : Legal->getInductionVars()) { 6500 InductionDescriptor &IndDes = Induction.second; 6501 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6502 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6503 } 6504 } 6505 6506 // TODO: we could return a pair of values that specify the max VF and 6507 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6508 // `buildVPlans(VF, VF)`. 
We cannot do it because VPlan at the moment
6509 // doesn't have a cost model that can choose which plan to execute if
6510 // more than one is generated.
6511 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6512 LoopVectorizationCostModel &CM) {
6513 unsigned WidestType;
6514 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6515 return WidestVectorRegBits / WidestType;
6516 }
6517
6518 VectorizationFactor
6519 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6520 unsigned VF = UserVF;
6521 // Outer loop handling: outer loops may require CFG and instruction level
6522 // transformations before even evaluating whether vectorization is profitable.
6523 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6524 // the vectorization pipeline.
6525 if (!OrigLoop->empty()) {
6526 // If the user doesn't provide a vectorization factor, determine a
6527 // reasonable one.
6528 if (!UserVF) {
6529 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6530 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6531
6532 // Make sure we have a VF > 1 for stress testing.
6533 if (VPlanBuildStressTest && VF < 2) {
6534 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6535 << "overriding computed VF.\n");
6536 VF = 4;
6537 }
6538 }
6539 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6540 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6541 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6542 << " to build VPlans.\n");
6543 buildVPlans(VF, VF);
6544
6545 // For VPlan build stress testing, we bail out after VPlan construction.
6546 if (VPlanBuildStressTest)
6547 return VectorizationFactor::Disabled();
6548
6549 return {VF, 0};
6550 }
6551
6552 LLVM_DEBUG(
6553 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6554 "VPlan-native path.\n");
6555 return VectorizationFactor::Disabled();
6556 }
6557
6558 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6559 unsigned UserIC) {
6560 assert(OrigLoop->empty() && "Inner loop expected.");
6561 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
6562 if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6563 return None;
6564
6565 // Invalidate interleave groups if all blocks of the loop will be predicated.
6566 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6567 !useMaskedInterleavedAccesses(*TTI)) {
6568 LLVM_DEBUG(
6569 dbgs()
6570 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6571 "which requires masked-interleaved support.\n");
6572 if (CM.InterleaveInfo.invalidateGroups())
6573 // Invalidating interleave groups also requires invalidating all decisions
6574 // based on them, which includes widening decisions and uniform and scalar
6575 // values.
6576 CM.invalidateCostModelingDecisions();
6577 }
6578
6579 if (UserVF) {
6580 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6581 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6582 // Collect the instructions (and their associated costs) that will be more
6583 // profitable to scalarize.
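// (With a user-forced VF there is no factor selection to perform; the calls
// below collect the costs for that VF and build VPlans for it alone.)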
6584 CM.selectUserVectorizationFactor(UserVF); 6585 buildVPlansWithVPRecipes(UserVF, UserVF); 6586 LLVM_DEBUG(printPlans(dbgs())); 6587 return {{UserVF, 0}}; 6588 } 6589 6590 unsigned MaxVF = MaybeMaxVF.getValue(); 6591 assert(MaxVF != 0 && "MaxVF is zero."); 6592 6593 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6594 // Collect Uniform and Scalar instructions after vectorization with VF. 6595 CM.collectUniformsAndScalars(VF); 6596 6597 // Collect the instructions (and their associated costs) that will be more 6598 // profitable to scalarize. 6599 if (VF > 1) 6600 CM.collectInstsToScalarize(VF); 6601 } 6602 6603 buildVPlansWithVPRecipes(1, MaxVF); 6604 LLVM_DEBUG(printPlans(dbgs())); 6605 if (MaxVF == 1) 6606 return VectorizationFactor::Disabled(); 6607 6608 // Select the optimal vectorization factor. 6609 return CM.selectVectorizationFactor(MaxVF); 6610 } 6611 6612 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6613 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6614 << '\n'); 6615 BestVF = VF; 6616 BestUF = UF; 6617 6618 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6619 return !Plan->hasVF(VF); 6620 }); 6621 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6622 } 6623 6624 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6625 DominatorTree *DT) { 6626 // Perform the actual loop transformation. 6627 6628 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6629 VPCallbackILV CallbackILV(ILV); 6630 6631 VPTransformState State{BestVF, BestUF, LI, 6632 DT, ILV.Builder, ILV.VectorLoopValueMap, 6633 &ILV, CallbackILV}; 6634 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6635 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6636 State.CanonicalIV = ILV.Induction; 6637 6638 //===------------------------------------------------===// 6639 // 6640 // Notice: any optimization or new instruction that go 6641 // into the code below should also be implemented in 6642 // the cost-model. 6643 // 6644 //===------------------------------------------------===// 6645 6646 // 2. Copy and widen instructions from the old loop into the new loop. 6647 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6648 VPlans.front()->execute(&State); 6649 6650 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6651 // predication, updating analyses. 6652 ILV.fixVectorizedLoop(); 6653 } 6654 6655 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6656 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6657 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6658 6659 // We create new control-flow for the vectorized loop, so the original 6660 // condition will be dead after vectorization if it's only used by the 6661 // branch. 6662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6663 if (Cmp && Cmp->hasOneUse()) 6664 DeadInstructions.insert(Cmp); 6665 6666 // We create new "steps" for induction variable updates to which the original 6667 // induction variables map. An original update instruction will be dead if 6668 // all its users except the induction variable are dead. 
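// (For example, an update such as %iv.next = add %iv, 1 is recorded as dead
// once its only remaining user is the %iv phi itself.)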
6669 for (auto &Induction : Legal->getInductionVars()) { 6670 PHINode *Ind = Induction.first; 6671 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6672 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6673 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 6674 })) 6675 DeadInstructions.insert(IndUpdate); 6676 6677 // We record as "Dead" also the type-casting instructions we had identified 6678 // during induction analysis. We don't need any handling for them in the 6679 // vectorized loop because we have proven that, under a proper runtime 6680 // test guarding the vectorized loop, the value of the phi, and the casted 6681 // value of the phi, are the same. The last instruction in this casting chain 6682 // will get its scalar/vector/widened def from the scalar/vector/widened def 6683 // of the respective phi node. Any other casts in the induction def-use chain 6684 // have no other uses outside the phi update chain, and will be ignored. 6685 InductionDescriptor &IndDes = Induction.second; 6686 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6687 DeadInstructions.insert(Casts.begin(), Casts.end()); 6688 } 6689 } 6690 6691 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6692 6693 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6694 6695 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6696 Instruction::BinaryOps BinOp) { 6697 // When unrolling and the VF is 1, we only need to add a simple scalar. 6698 Type *Ty = Val->getType(); 6699 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6700 6701 if (Ty->isFloatingPointTy()) { 6702 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6703 6704 // Floating point operations had to be 'fast' to enable the unrolling. 6705 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6706 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6707 } 6708 Constant *C = ConstantInt::get(Ty, StartIdx); 6709 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6710 } 6711 6712 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6713 SmallVector<Metadata *, 4> MDs; 6714 // Reserve first location for self reference to the LoopID metadata node. 6715 MDs.push_back(nullptr); 6716 bool IsUnrollMetadata = false; 6717 MDNode *LoopID = L->getLoopID(); 6718 if (LoopID) { 6719 // First find existing loop unrolling disable metadata. 6720 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6721 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6722 if (MD) { 6723 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6724 IsUnrollMetadata = 6725 S && S->getString().startswith("llvm.loop.unroll.disable"); 6726 } 6727 MDs.push_back(LoopID->getOperand(i)); 6728 } 6729 } 6730 6731 if (!IsUnrollMetadata) { 6732 // Add runtime unroll disable metadata. 6733 LLVMContext &Context = L->getHeader()->getContext(); 6734 SmallVector<Metadata *, 1> DisableOperands; 6735 DisableOperands.push_back( 6736 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6737 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6738 MDs.push_back(DisableNode); 6739 MDNode *NewLoopID = MDNode::get(Context, MDs); 6740 // Set operand 0 to refer to the loop id itself. 
6741 NewLoopID->replaceOperandWith(0, NewLoopID); 6742 L->setLoopID(NewLoopID); 6743 } 6744 } 6745 6746 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6747 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6748 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6749 bool PredicateAtRangeStart = Predicate(Range.Start); 6750 6751 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6752 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6753 Range.End = TmpVF; 6754 break; 6755 } 6756 6757 return PredicateAtRangeStart; 6758 } 6759 6760 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6761 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6762 /// of VF's starting at a given VF and extending it as much as possible. Each 6763 /// vectorization decision can potentially shorten this sub-range during 6764 /// buildVPlan(). 6765 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6766 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6767 VFRange SubRange = {VF, MaxVF + 1}; 6768 VPlans.push_back(buildVPlan(SubRange)); 6769 VF = SubRange.End; 6770 } 6771 } 6772 6773 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6774 VPlanPtr &Plan) { 6775 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6776 6777 // Look for cached value. 6778 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6779 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6780 if (ECEntryIt != EdgeMaskCache.end()) 6781 return ECEntryIt->second; 6782 6783 VPValue *SrcMask = createBlockInMask(Src, Plan); 6784 6785 // The terminator has to be a branch inst! 6786 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6787 assert(BI && "Unexpected terminator found"); 6788 6789 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6790 return EdgeMaskCache[Edge] = SrcMask; 6791 6792 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6793 assert(EdgeMask && "No Edge Mask found for condition"); 6794 6795 if (BI->getSuccessor(0) != Dst) 6796 EdgeMask = Builder.createNot(EdgeMask); 6797 6798 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6799 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6800 6801 return EdgeMaskCache[Edge] = EdgeMask; 6802 } 6803 6804 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6805 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6806 6807 // Look for cached value. 6808 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6809 if (BCEntryIt != BlockMaskCache.end()) 6810 return BCEntryIt->second; 6811 6812 // All-one mask is modelled as no-mask following the convention for masked 6813 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6814 VPValue *BlockMask = nullptr; 6815 6816 if (OrigLoop->getHeader() == BB) { 6817 if (!CM.blockNeedsPredication(BB)) 6818 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6819 6820 // Introduce the early-exit compare IV <= BTC to form header block mask. 6821 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6822 // Start by constructing the desired canonical IV. 
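// (BTC here is the backedge-taken count, i.e. the trip count minus one, so
// comparing IV <= BTC stays correct even when the trip count itself would
// wrap to zero.)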
6823 VPValue *IV = nullptr; 6824 if (Legal->getPrimaryInduction()) 6825 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6826 else { 6827 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6828 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6829 IV = IVRecipe->getVPValue(); 6830 } 6831 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6832 bool TailFolded = !CM.isScalarEpilogueAllowed(); 6833 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) 6834 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); 6835 else 6836 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6837 return BlockMaskCache[BB] = BlockMask; 6838 } 6839 6840 // This is the block mask. We OR all incoming edges. 6841 for (auto *Predecessor : predecessors(BB)) { 6842 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6843 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6844 return BlockMaskCache[BB] = EdgeMask; 6845 6846 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6847 BlockMask = EdgeMask; 6848 continue; 6849 } 6850 6851 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6852 } 6853 6854 return BlockMaskCache[BB] = BlockMask; 6855 } 6856 6857 VPWidenMemoryInstructionRecipe * 6858 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6859 VPlanPtr &Plan) { 6860 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6861 "Must be called with either a load or store"); 6862 6863 auto willWiden = [&](unsigned VF) -> bool { 6864 if (VF == 1) 6865 return false; 6866 LoopVectorizationCostModel::InstWidening Decision = 6867 CM.getWideningDecision(I, VF); 6868 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6869 "CM decision should be taken at this point."); 6870 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6871 return true; 6872 if (CM.isScalarAfterVectorization(I, VF) || 6873 CM.isProfitableToScalarize(I, VF)) 6874 return false; 6875 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6876 }; 6877 6878 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6879 return nullptr; 6880 6881 VPValue *Mask = nullptr; 6882 if (Legal->isMaskRequired(I)) 6883 Mask = createBlockInMask(I->getParent(), Plan); 6884 6885 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6886 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6887 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6888 6889 StoreInst *Store = cast<StoreInst>(I); 6890 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6891 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6892 } 6893 6894 VPWidenIntOrFpInductionRecipe * 6895 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 6896 // Check if this is an integer or fp induction. If so, build the recipe that 6897 // produces its scalar and vector values. 6898 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6899 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6900 II.getKind() == InductionDescriptor::IK_FpInduction) 6901 return new VPWidenIntOrFpInductionRecipe(Phi); 6902 6903 return nullptr; 6904 } 6905 6906 VPWidenIntOrFpInductionRecipe * 6907 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 6908 VFRange &Range) const { 6909 // Optimize the special case where the source is a constant integer 6910 // induction variable. 
Notice that we can only optimize the 'trunc' case 6911 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6912 // (c) other casts depend on pointer size. 6913 6914 // Determine whether \p K is a truncation based on an induction variable that 6915 // can be optimized. 6916 auto isOptimizableIVTruncate = 6917 [&](Instruction *K) -> std::function<bool(unsigned)> { 6918 return 6919 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6920 }; 6921 6922 if (LoopVectorizationPlanner::getDecisionAndClampRange( 6923 isOptimizableIVTruncate(I), Range)) 6924 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6925 I); 6926 return nullptr; 6927 } 6928 6929 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 6930 // We know that all PHIs in non-header blocks are converted into selects, so 6931 // we don't have to worry about the insertion order and we can just use the 6932 // builder. At this point we generate the predication tree. There may be 6933 // duplications since this is a simple recursive scan, but future 6934 // optimizations will clean it up. 6935 6936 SmallVector<VPValue *, 2> Operands; 6937 unsigned NumIncoming = Phi->getNumIncomingValues(); 6938 for (unsigned In = 0; In < NumIncoming; In++) { 6939 VPValue *EdgeMask = 6940 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6941 assert((EdgeMask || NumIncoming == 1) && 6942 "Multiple predecessors with one having a full mask"); 6943 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6944 if (EdgeMask) 6945 Operands.push_back(EdgeMask); 6946 } 6947 return new VPBlendRecipe(Phi, Operands); 6948 } 6949 6950 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 6951 VPlan &Plan) const { 6952 6953 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6954 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 6955 Range); 6956 6957 if (IsPredicated) 6958 return nullptr; 6959 6960 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6961 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6962 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6963 return nullptr; 6964 6965 auto willWiden = [&](unsigned VF) -> bool { 6966 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6967 // The following case may be scalarized depending on the VF. 6968 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6969 // version of the instruction. 6970 // Is it beneficial to perform intrinsic call compared to lib call? 6971 bool NeedToScalarize = false; 6972 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6973 bool UseVectorIntrinsic = 6974 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6975 return UseVectorIntrinsic || !NeedToScalarize; 6976 }; 6977 6978 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6979 return nullptr; 6980 6981 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 6982 } 6983 6984 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6985 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6986 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6987 // Instruction should be widened, unless it is scalar after vectorization, 6988 // scalarization is profitable or it is predicated. 
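// (The WillScalarize lambda below checks exactly these three conditions;
// widening is chosen only for the part of the VF range where none of them
// hold.)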
6989 auto WillScalarize = [this, I](unsigned VF) -> bool { 6990 return CM.isScalarAfterVectorization(I, VF) || 6991 CM.isProfitableToScalarize(I, VF) || 6992 CM.isScalarWithPredication(I, VF); 6993 }; 6994 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 6995 Range); 6996 } 6997 6998 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 6999 auto IsVectorizableOpcode = [](unsigned Opcode) { 7000 switch (Opcode) { 7001 case Instruction::Add: 7002 case Instruction::And: 7003 case Instruction::AShr: 7004 case Instruction::BitCast: 7005 case Instruction::FAdd: 7006 case Instruction::FCmp: 7007 case Instruction::FDiv: 7008 case Instruction::FMul: 7009 case Instruction::FNeg: 7010 case Instruction::FPExt: 7011 case Instruction::FPToSI: 7012 case Instruction::FPToUI: 7013 case Instruction::FPTrunc: 7014 case Instruction::FRem: 7015 case Instruction::FSub: 7016 case Instruction::ICmp: 7017 case Instruction::IntToPtr: 7018 case Instruction::LShr: 7019 case Instruction::Mul: 7020 case Instruction::Or: 7021 case Instruction::PtrToInt: 7022 case Instruction::SDiv: 7023 case Instruction::Select: 7024 case Instruction::SExt: 7025 case Instruction::Shl: 7026 case Instruction::SIToFP: 7027 case Instruction::SRem: 7028 case Instruction::Sub: 7029 case Instruction::Trunc: 7030 case Instruction::UDiv: 7031 case Instruction::UIToFP: 7032 case Instruction::URem: 7033 case Instruction::Xor: 7034 case Instruction::ZExt: 7035 return true; 7036 } 7037 return false; 7038 }; 7039 7040 if (!IsVectorizableOpcode(I->getOpcode())) 7041 return nullptr; 7042 7043 // Success: widen this instruction. 7044 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7045 } 7046 7047 VPBasicBlock *VPRecipeBuilder::handleReplication( 7048 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7049 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7050 VPlanPtr &Plan) { 7051 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7052 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7053 Range); 7054 7055 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7056 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7057 7058 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7059 IsUniform, IsPredicated); 7060 setRecipe(I, Recipe); 7061 7062 // Find if I uses a predicated instruction. If so, it will use its scalar 7063 // value. Avoid hoisting the insert-element which packs the scalar value into 7064 // a vector value, as that happens iff all users use the vector value. 7065 for (auto &Op : I->operands()) 7066 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7067 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7068 PredInst2Recipe[PredInst]->setAlsoPack(false); 7069 7070 // Finalize the recipe for Instr, first if it is not predicated. 7071 if (!IsPredicated) { 7072 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7073 VPBB->appendRecipe(Recipe); 7074 return VPBB; 7075 } 7076 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7077 assert(VPBB->getSuccessors().empty() && 7078 "VPBB has successors when handling predicated replication."); 7079 // Record predicated instructions for above packing optimizations. 
7080 PredInst2Recipe[I] = Recipe; 7081 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7082 VPBlockUtils::insertBlockAfter(Region, VPBB); 7083 auto *RegSucc = new VPBasicBlock(); 7084 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7085 return RegSucc; 7086 } 7087 7088 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7089 VPRecipeBase *PredRecipe, 7090 VPlanPtr &Plan) { 7091 // Instructions marked for predication are replicated and placed under an 7092 // if-then construct to prevent side-effects. 7093 7094 // Generate recipes to compute the block mask for this region. 7095 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7096 7097 // Build the triangular if-then region. 7098 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7099 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7100 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7101 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7102 auto *PHIRecipe = 7103 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7104 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7105 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7106 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7107 7108 // Note: first set Entry as region entry and then connect successors starting 7109 // from it in order, to propagate the "parent" of each VPBasicBlock. 7110 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7111 VPBlockUtils::connectBlocks(Pred, Exit); 7112 7113 return Region; 7114 } 7115 7116 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7117 VFRange &Range, 7118 VPlanPtr &Plan) { 7119 // First, check for specific widening recipes that deal with calls, memory 7120 // operations, inductions and Phi nodes. 7121 if (auto *CI = dyn_cast<CallInst>(Instr)) 7122 return tryToWidenCall(CI, Range, *Plan); 7123 7124 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7125 return tryToWidenMemory(Instr, Range, Plan); 7126 7127 VPRecipeBase *Recipe; 7128 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7129 if (Phi->getParent() != OrigLoop->getHeader()) 7130 return tryToBlend(Phi, Plan); 7131 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7132 return Recipe; 7133 return new VPWidenPHIRecipe(Phi); 7134 } 7135 7136 if (isa<TruncInst>(Instr) && 7137 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7138 return Recipe; 7139 7140 if (!shouldWiden(Instr, Range)) 7141 return nullptr; 7142 7143 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7144 return new VPWidenGEPRecipe(GEP, OrigLoop); 7145 7146 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7147 bool InvariantCond = 7148 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7149 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7150 InvariantCond); 7151 } 7152 7153 return tryToWiden(Instr, *Plan); 7154 } 7155 7156 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7157 unsigned MaxVF) { 7158 assert(OrigLoop->empty() && "Inner loop expected."); 7159 7160 // Collect conditions feeding internal conditional branches; they need to be 7161 // represented in VPlan for it to model masking. 
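  // For example, given a loop body containing "if (a[i] > 42) b[i] = 0;", the
  // compare feeding that conditional branch must get a VPValue so that the
  // edge and block masks created later can refer to it (illustrative example).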
7162   SmallPtrSet<Value *, 1> NeedDef;
7163 
7164   auto *Latch = OrigLoop->getLoopLatch();
7165   for (BasicBlock *BB : OrigLoop->blocks()) {
7166     if (BB == Latch)
7167       continue;
7168     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7169     if (Branch && Branch->isConditional())
7170       NeedDef.insert(Branch->getCondition());
7171   }
7172 
7173   // If the tail is to be folded by masking, the primary induction variable,
7174   // if it exists, needs to be represented in VPlan for it to model early-exit
7175   // masking. Also, both the Phi and the live-out instruction of each reduction
7176   // are required in order to introduce a select between them in VPlan.
7177   if (CM.foldTailByMasking()) {
7178     if (Legal->getPrimaryInduction())
7179       NeedDef.insert(Legal->getPrimaryInduction());
7180     for (auto &Reduction : Legal->getReductionVars()) {
7181       NeedDef.insert(Reduction.first);
7182       NeedDef.insert(Reduction.second.getLoopExitInstr());
7183     }
7184   }
7185 
7186   // Collect instructions from the original loop that will become trivially dead
7187   // in the vectorized loop. We don't need to vectorize these instructions. For
7188   // example, original induction update instructions can become dead because we
7189   // separately emit induction "steps" when generating code for the new loop.
7190   // Similarly, we create a new latch condition when setting up the structure
7191   // of the new loop, so the old one can become dead.
7192   SmallPtrSet<Instruction *, 4> DeadInstructions;
7193   collectTriviallyDeadInstructions(DeadInstructions);
7194 
7195   // Add assume instructions we need to drop to DeadInstructions, to prevent
7196   // them from being added to the VPlan.
7197   // TODO: We only need to drop assumes in blocks that get flattened. If the
7198   // control flow is preserved, we should keep them.
7199   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7200   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7201 
7202   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7203   // Dead instructions do not need sinking. Remove them from SinkAfter.
7204   for (Instruction *I : DeadInstructions)
7205     SinkAfter.erase(I);
7206 
7207   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7208     VFRange SubRange = {VF, MaxVF + 1};
7209     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7210                                              DeadInstructions, SinkAfter));
7211     VF = SubRange.End;
7212   }
7213 }
7214 
7215 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7216     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7217     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7218     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7219 
7220   // Hold a mapping from predicated instructions to their recipes, in order to
7221   // fix their AlsoPack behavior if a user is determined to replicate and use a
7222   // scalar instead of vector value.
7223   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7224 
7225   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7226 
7227   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7228 
7229   // ---------------------------------------------------------------------------
7230   // Pre-construction: record ingredients whose recipes we'll need to further
7231   // process after constructing the initial VPlan.
7232   // ---------------------------------------------------------------------------
7233 
7234   // Mark instructions we'll need to sink later and their targets as
7235   // ingredients whose recipe we'll need to record.
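  // SinkAfter maps each instruction that must be sunk to the instruction it
  // has to be sunk after; at this point these entries stem from first-order
  // recurrences detected during legality analysis.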
7236   for (auto &Entry : SinkAfter) {
7237     RecipeBuilder.recordRecipeOf(Entry.first);
7238     RecipeBuilder.recordRecipeOf(Entry.second);
7239   }
7240 
7241   // For each interleave group which is relevant for this (possibly trimmed)
7242   // Range, add it to the set of groups to be later applied to the VPlan and add
7243   // placeholders for its members' Recipes which we'll be replacing with a
7244   // single VPInterleaveRecipe.
7245   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7246     auto applyIG = [IG, this](unsigned VF) -> bool {
7247       return (VF >= 2 && // Query is illegal for VF == 1
7248               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7249                   LoopVectorizationCostModel::CM_Interleave);
7250     };
7251     if (!getDecisionAndClampRange(applyIG, Range))
7252       continue;
7253     InterleaveGroups.insert(IG);
7254     for (unsigned i = 0; i < IG->getFactor(); i++)
7255       if (Instruction *Member = IG->getMember(i))
7256         RecipeBuilder.recordRecipeOf(Member);
7257   }
7258 
7259   // ---------------------------------------------------------------------------
7260   // Build initial VPlan: Scan the body of the loop in a topological order to
7261   // visit each basic block after having visited its predecessor basic blocks.
7262   // ---------------------------------------------------------------------------
7263 
7264   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7265   auto Plan = std::make_unique<VPlan>();
7266   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7267   Plan->setEntry(VPBB);
7268 
7269   // Represent values that will have defs inside VPlan.
7270   for (Value *V : NeedDef)
7271     Plan->addVPValue(V);
7272 
7273   // Scan the body of the loop in a topological order to visit each basic block
7274   // after having visited its predecessor basic blocks.
7275   LoopBlocksDFS DFS(OrigLoop);
7276   DFS.perform(LI);
7277 
7278   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7279     // Relevant instructions from basic block BB will be grouped into VPRecipe
7280     // ingredients and fill a new VPBasicBlock.
7281     unsigned VPBBsForBB = 0;
7282     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7283     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7284     VPBB = FirstVPBBForBB;
7285     Builder.setInsertPoint(VPBB);
7286 
7287     // Introduce each ingredient into VPlan.
7288     // TODO: Model and preserve debug intrinsics in VPlan.
7289     for (Instruction &I : BB->instructionsWithoutDebug()) {
7290       Instruction *Instr = &I;
7291 
7292       // First filter out irrelevant instructions, to ensure no recipes are
7293       // built for them.
7294       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7295         continue;
7296 
7297       if (auto Recipe =
7298               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7299         RecipeBuilder.setRecipe(Instr, Recipe);
7300         VPBB->appendRecipe(Recipe);
7301         continue;
7302       }
7303 
7304       // Otherwise, if all widening options failed, Instruction is to be
7305       // replicated. This may create a successor for VPBB.
7306       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7307           Instr, Range, VPBB, PredInst2Recipe, Plan);
7308       if (NextVPBB != VPBB) {
7309         VPBB = NextVPBB;
7310         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7311                                     : "");
7312       }
7313     }
7314   }
7315 
7316   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7317   // may also be empty, such as the last one, VPBB, reflecting original
7318   // basic-blocks with no recipes.
7319 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7320 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7321 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7322 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7323 delete PreEntry; 7324 7325 // --------------------------------------------------------------------------- 7326 // Transform initial VPlan: Apply previously taken decisions, in order, to 7327 // bring the VPlan to its final state. 7328 // --------------------------------------------------------------------------- 7329 7330 // Apply Sink-After legal constraints. 7331 for (auto &Entry : SinkAfter) { 7332 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7333 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7334 Sink->moveAfter(Target); 7335 } 7336 7337 // Interleave memory: for each Interleave Group we marked earlier as relevant 7338 // for this VPlan, replace the Recipes widening its memory instructions with a 7339 // single VPInterleaveRecipe at its insertion point. 7340 for (auto IG : InterleaveGroups) { 7341 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7342 RecipeBuilder.getRecipe(IG->getInsertPos())); 7343 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7344 ->insertBefore(Recipe); 7345 7346 for (unsigned i = 0; i < IG->getFactor(); ++i) 7347 if (Instruction *Member = IG->getMember(i)) { 7348 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7349 } 7350 } 7351 7352 // Finally, if tail is folded by masking, introduce selects between the phi 7353 // and the live-out instruction of each reduction, at the end of the latch. 7354 if (CM.foldTailByMasking()) { 7355 Builder.setInsertPoint(VPBB); 7356 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7357 for (auto &Reduction : Legal->getReductionVars()) { 7358 VPValue *Phi = Plan->getVPValue(Reduction.first); 7359 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7360 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7361 } 7362 } 7363 7364 std::string PlanName; 7365 raw_string_ostream RSO(PlanName); 7366 unsigned VF = Range.Start; 7367 Plan->addVF(VF); 7368 RSO << "Initial VPlan for VF={" << VF; 7369 for (VF *= 2; VF < Range.End; VF *= 2) { 7370 Plan->addVF(VF); 7371 RSO << "," << VF; 7372 } 7373 RSO << "},UF>=1"; 7374 RSO.flush(); 7375 Plan->setName(PlanName); 7376 7377 return Plan; 7378 } 7379 7380 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7381 // Outer loop handling: They may require CFG and instruction level 7382 // transformations before even evaluating whether vectorization is profitable. 7383 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7384 // the vectorization pipeline. 7385 assert(!OrigLoop->empty()); 7386 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7387 7388 // Create new empty VPlan 7389 auto Plan = std::make_unique<VPlan>(); 7390 7391 // Build hierarchical CFG 7392 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7393 HCFGBuilder.buildHierarchicalCFG(); 7394 7395 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7396 Plan->addVF(VF); 7397 7398 if (EnableVPlanPredication) { 7399 VPlanPredicator VPP(*Plan); 7400 VPP.predicate(); 7401 7402 // Avoid running transformation to recipes until masked code generation in 7403 // VPlan-native path is in place. 
7404 return Plan; 7405 } 7406 7407 SmallPtrSet<Instruction *, 1> DeadInstructions; 7408 VPlanTransforms::VPInstructionsToVPRecipes( 7409 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7410 return Plan; 7411 } 7412 7413 Value* LoopVectorizationPlanner::VPCallbackILV:: 7414 getOrCreateVectorValues(Value *V, unsigned Part) { 7415 return ILV.getOrCreateVectorValue(V, Part); 7416 } 7417 7418 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7419 Value *V, const VPIteration &Instance) { 7420 return ILV.getOrCreateScalarValue(V, Instance); 7421 } 7422 7423 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7424 VPSlotTracker &SlotTracker) const { 7425 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7426 IG->getInsertPos()->printAsOperand(O, false); 7427 O << ", "; 7428 getAddr()->printAsOperand(O, SlotTracker); 7429 VPValue *Mask = getMask(); 7430 if (Mask) { 7431 O << ", "; 7432 Mask->printAsOperand(O, SlotTracker); 7433 } 7434 for (unsigned i = 0; i < IG->getFactor(); ++i) 7435 if (Instruction *I = IG->getMember(i)) 7436 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7437 } 7438 7439 void VPWidenCallRecipe::execute(VPTransformState &State) { 7440 State.ILV->widenCallInstruction(Ingredient, User, State); 7441 } 7442 7443 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7444 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7445 } 7446 7447 void VPWidenRecipe::execute(VPTransformState &State) { 7448 State.ILV->widenInstruction(Ingredient, User, State); 7449 } 7450 7451 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7452 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7453 IsIndexLoopInvariant); 7454 } 7455 7456 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7457 assert(!State.Instance && "Int or FP induction being replicated."); 7458 State.ILV->widenIntOrFpInduction(IV, Trunc); 7459 } 7460 7461 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7462 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7463 } 7464 7465 void VPBlendRecipe::execute(VPTransformState &State) { 7466 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7467 // We know that all PHIs in non-header blocks are converted into 7468 // selects, so we don't have to worry about the insertion order and we 7469 // can just use the builder. 7470 // At this point we generate the predication tree. There may be 7471 // duplications since this is a simple recursive scan, but future 7472 // optimizations will clean it up. 7473 7474 unsigned NumIncoming = getNumIncomingValues(); 7475 7476 // Generate a sequence of selects of the form: 7477 // SELECT(Mask3, In3, 7478 // SELECT(Mask2, In2, 7479 // SELECT(Mask1, In1, 7480 // In0))) 7481 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7482 // are essentially undef are taken from In0. 7483 InnerLoopVectorizer::VectorParts Entry(State.UF); 7484 for (unsigned In = 0; In < NumIncoming; ++In) { 7485 for (unsigned Part = 0; Part < State.UF; ++Part) { 7486 // We might have single edge PHIs (blocks) - use an identity 7487 // 'select' for the first PHI operand. 7488 Value *In0 = State.get(getIncomingValue(In), Part); 7489 if (In == 0) 7490 Entry[Part] = In0; // Initialize with the first incoming value. 7491 else { 7492 // Select between the current value and the previous incoming edge 7493 // based on the incoming mask. 
7494         Value *Cond = State.get(getMask(In), Part);
7495         Entry[Part] =
7496             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7497       }
7498     }
7499   }
7500   for (unsigned Part = 0; Part < State.UF; ++Part)
7501     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7502 }
7503 
7504 void VPInterleaveRecipe::execute(VPTransformState &State) {
7505   assert(!State.Instance && "Interleave group being replicated.");
7506   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7507 }
7508 
7509 void VPReplicateRecipe::execute(VPTransformState &State) {
7510   if (State.Instance) { // Generate a single instance.
7511     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7512                                     IsPredicated, State);
7513     // Insert the scalar instance, packing it into a vector.
7514     if (AlsoPack && State.VF > 1) {
7515       // If we're constructing lane 0, initialize to start from undef.
7516       if (State.Instance->Lane == 0) {
7517         Value *Undef = UndefValue::get(
7518             FixedVectorType::get(Ingredient->getType(), State.VF));
7519         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7520       }
7521       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7522     }
7523     return;
7524   }
7525 
7526   // Generate scalar instances for all VF lanes of all UF parts, unless the
7527   // instruction is uniform, in which case generate only the first lane for each
7528   // of the UF parts.
7529   unsigned EndLane = IsUniform ? 1 : State.VF;
7530   for (unsigned Part = 0; Part < State.UF; ++Part)
7531     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7532       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7533                                       IsPredicated, State);
7534 }
7535 
7536 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7537   assert(State.Instance && "Branch on Mask works only on single instance.");
7538 
7539   unsigned Part = State.Instance->Part;
7540   unsigned Lane = State.Instance->Lane;
7541 
7542   Value *ConditionBit = nullptr;
7543   VPValue *BlockInMask = getMask();
7544   if (BlockInMask) {
7545     ConditionBit = State.get(BlockInMask, Part);
7546     if (ConditionBit->getType()->isVectorTy())
7547       ConditionBit = State.Builder.CreateExtractElement(
7548           ConditionBit, State.Builder.getInt32(Lane));
7549   } else // Block in mask is all-one.
7550     ConditionBit = State.Builder.getTrue();
7551 
7552   // Replace the temporary unreachable terminator with a new conditional branch,
7553   // whose two destinations will be set later when they are created.
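  // Specifically, the successors are hooked up once the replicate region's
  // "if" and "continue" blocks are generated.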
7554 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7555 assert(isa<UnreachableInst>(CurrentTerminator) && 7556 "Expected to replace unreachable terminator with conditional branch."); 7557 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7558 CondBr->setSuccessor(0, nullptr); 7559 ReplaceInstWithInst(CurrentTerminator, CondBr); 7560 } 7561 7562 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7563 assert(State.Instance && "Predicated instruction PHI works per instance."); 7564 Instruction *ScalarPredInst = cast<Instruction>( 7565 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7566 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7567 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7568 assert(PredicatingBB && "Predicated block has no single predecessor."); 7569 7570 // By current pack/unpack logic we need to generate only a single phi node: if 7571 // a vector value for the predicated instruction exists at this point it means 7572 // the instruction has vector users only, and a phi for the vector value is 7573 // needed. In this case the recipe of the predicated instruction is marked to 7574 // also do that packing, thereby "hoisting" the insert-element sequence. 7575 // Otherwise, a phi node for the scalar value is needed. 7576 unsigned Part = State.Instance->Part; 7577 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7578 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7579 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7580 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7581 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7582 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7583 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7584 } else { 7585 Type *PredInstType = PredInst->getType(); 7586 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7587 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7588 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7589 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7590 } 7591 } 7592 7593 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7594 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7595 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7596 getMask()); 7597 } 7598 7599 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7600 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7601 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7602 // for predication. 7603 static ScalarEpilogueLowering getScalarEpilogueLowering( 7604 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7605 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7606 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7607 LoopVectorizationLegality &LVL) { 7608 bool OptSize = 7609 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7610 PGSOQueryType::IRPass); 7611 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7612 // don't look at hints or options, and don't request a scalar epilogue. 
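  // The one exception is explicitly forced vectorization (a loop hint /
  // pragma), which is why the check below still consults Hints.getForce().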
7613   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7614     return CM_ScalarEpilogueNotAllowedOptSize;
7615 
7616   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7617                               !PreferPredicateOverEpilog;
7618 
7619   // 2) Next, if disabling predication is requested on the command line, honour
7620   // this and request a scalar epilogue.
7621   if (PredicateOptDisabled)
7622     return CM_ScalarEpilogueAllowed;
7623 
7624   // 3) and 4) check if enabling predication is requested on the command line or
7625   // with a loop hint, or if the TTI hook indicates this is profitable; if so,
7626   // request predication.
7627   if (PreferPredicateOverEpilog ||
7628       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7629       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7630                                         LVL.getLAI()) &&
7631        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7632     return CM_ScalarEpilogueNotNeededUsePredicate;
7633 
7634   return CM_ScalarEpilogueAllowed;
7635 }
7636 
7637 // Process the loop in the VPlan-native vectorization path. This path builds
7638 // VPlan upfront in the vectorization pipeline, which allows applying
7639 // VPlan-to-VPlan transformations from the very beginning without modifying the
7640 // input LLVM IR.
7641 static bool processLoopInVPlanNativePath(
7642     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7643     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7644     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7645     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7646     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7647 
7648   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7649     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7650     return false;
7651   }
7652   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7653   Function *F = L->getHeader()->getParent();
7654   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7655 
7656   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7657       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7658 
7659   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7660                                 &Hints, IAI);
7661   // Use the planner for outer loop vectorization.
7662   // TODO: CM is not used at this point inside the planner. Turn CM into an
7663   // optional argument if we don't need it in the future.
7664   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7665 
7666   // Get user vectorization factor.
7667   const unsigned UserVF = Hints.getWidth();
7668 
7669   // Plan how to best vectorize, return the best VF and its cost.
7670   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7671 
7672   // If we are stress testing VPlan builds, do not attempt to generate vector
7673   // code. Masked vector code generation support will follow soon.
7674   // Also, do not attempt to vectorize if no vector code will be produced.
7675   if (VPlanBuildStressTest || EnableVPlanPredication ||
7676       VectorizationFactor::Disabled() == VF)
7677     return false;
7678 
7679   LVP.setBestPlan(VF.Width, 1);
7680 
7681   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7682                          &CM);
7683   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7684                     << L->getHeader()->getParent()->getName() << "\"\n");
7685   LVP.executePlan(LB, DT);
7686 
7687   // Mark the loop as already vectorized to avoid vectorizing again.
7688 Hints.setAlreadyVectorized(); 7689 7690 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7691 return true; 7692 } 7693 7694 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7695 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7696 !EnableLoopInterleaving), 7697 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7698 !EnableLoopVectorization) {} 7699 7700 bool LoopVectorizePass::processLoop(Loop *L) { 7701 assert((EnableVPlanNativePath || L->empty()) && 7702 "VPlan-native path is not enabled. Only process inner loops."); 7703 7704 #ifndef NDEBUG 7705 const std::string DebugLocStr = getDebugLocString(L); 7706 #endif /* NDEBUG */ 7707 7708 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7709 << L->getHeader()->getParent()->getName() << "\" from " 7710 << DebugLocStr << "\n"); 7711 7712 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7713 7714 LLVM_DEBUG( 7715 dbgs() << "LV: Loop hints:" 7716 << " force=" 7717 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7718 ? "disabled" 7719 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7720 ? "enabled" 7721 : "?")) 7722 << " width=" << Hints.getWidth() 7723 << " unroll=" << Hints.getInterleave() << "\n"); 7724 7725 // Function containing loop 7726 Function *F = L->getHeader()->getParent(); 7727 7728 // Looking at the diagnostic output is the only way to determine if a loop 7729 // was vectorized (other than looking at the IR or machine code), so it 7730 // is important to generate an optimization remark for each loop. Most of 7731 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7732 // generated as OptimizationRemark and OptimizationRemarkMissed are 7733 // less verbose reporting vectorized loops and unvectorized loops that may 7734 // benefit from vectorization, respectively. 7735 7736 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7737 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7738 return false; 7739 } 7740 7741 PredicatedScalarEvolution PSE(*SE, *L); 7742 7743 // Check if it is legal to vectorize the loop. 7744 LoopVectorizationRequirements Requirements(*ORE); 7745 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7746 &Requirements, &Hints, DB, AC); 7747 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7748 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7749 Hints.emitRemarkWithHints(); 7750 return false; 7751 } 7752 7753 // Check the function attributes and profiles to find out if this function 7754 // should be optimized for size. 7755 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7756 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7757 7758 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7759 // here. They may require CFG and instruction level transformations before 7760 // even evaluating whether vectorization is profitable. Since we cannot modify 7761 // the incoming IR, we need to build VPlan upfront in the vectorization 7762 // pipeline. 7763 if (!L->empty()) 7764 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7765 ORE, BFI, PSI, Hints); 7766 7767 assert(L->empty() && "Inner loop expected."); 7768 7769 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7770 // count by optimizing for size, to minimize overheads. 
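  // getSmallBestKnownTC returns the exact trip count when it is a known small
  // constant and may otherwise fall back to an estimate (e.g. from profile
  // data), so this also catches loops that are merely expected to be short.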
7771 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7772 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7773 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7774 << "This loop is worth vectorizing only if no scalar " 7775 << "iteration overheads are incurred."); 7776 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7777 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7778 else { 7779 LLVM_DEBUG(dbgs() << "\n"); 7780 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7781 } 7782 } 7783 7784 // Check the function attributes to see if implicit floats are allowed. 7785 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7786 // an integer loop and the vector instructions selected are purely integer 7787 // vector instructions? 7788 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7789 reportVectorizationFailure( 7790 "Can't vectorize when the NoImplicitFloat attribute is used", 7791 "loop not vectorized due to NoImplicitFloat attribute", 7792 "NoImplicitFloat", ORE, L); 7793 Hints.emitRemarkWithHints(); 7794 return false; 7795 } 7796 7797 // Check if the target supports potentially unsafe FP vectorization. 7798 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7799 // for the target we're vectorizing for, to make sure none of the 7800 // additional fp-math flags can help. 7801 if (Hints.isPotentiallyUnsafe() && 7802 TTI->isFPVectorizationPotentiallyUnsafe()) { 7803 reportVectorizationFailure( 7804 "Potentially unsafe FP op prevents vectorization", 7805 "loop not vectorized due to unsafe FP support.", 7806 "UnsafeFP", ORE, L); 7807 Hints.emitRemarkWithHints(); 7808 return false; 7809 } 7810 7811 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7812 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7813 7814 // If an override option has been passed in for interleaved accesses, use it. 7815 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7816 UseInterleaved = EnableInterleavedMemAccesses; 7817 7818 // Analyze interleaved memory accesses. 7819 if (UseInterleaved) { 7820 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7821 } 7822 7823 // Use the cost model. 7824 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7825 F, &Hints, IAI); 7826 CM.collectValuesToIgnore(); 7827 7828 // Use the planner for vectorization. 7829 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 7830 7831 // Get user vectorization factor and interleave count. 7832 unsigned UserVF = Hints.getWidth(); 7833 unsigned UserIC = Hints.getInterleave(); 7834 7835 // Plan how to best vectorize, return the best VF and its cost. 7836 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 7837 7838 VectorizationFactor VF = VectorizationFactor::Disabled(); 7839 unsigned IC = 1; 7840 7841 if (MaybeVF) { 7842 VF = *MaybeVF; 7843 // Select the interleave count. 7844 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7845 } 7846 7847 // Identify the diagnostic messages that should be produced. 
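  // VectorizeLoop and InterleaveLoop track whether each transformation is
  // still considered worthwhile; the four (vectorize, interleave) combinations
  // are reported and acted upon further below.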
7848   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7849   bool VectorizeLoop = true, InterleaveLoop = true;
7850   if (Requirements.doesNotMeet(F, L, Hints)) {
7851     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7852                          "requirements.\n");
7853     Hints.emitRemarkWithHints();
7854     return false;
7855   }
7856 
7857   if (VF.Width == 1) {
7858     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7859     VecDiagMsg = std::make_pair(
7860         "VectorizationNotBeneficial",
7861         "the cost-model indicates that vectorization is not beneficial");
7862     VectorizeLoop = false;
7863   }
7864 
7865   if (!MaybeVF && UserIC > 1) {
7866     // Tell the user interleaving was avoided up-front, despite being explicitly
7867     // requested.
7868     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7869                          "interleaving should be avoided up front\n");
7870     IntDiagMsg = std::make_pair(
7871         "InterleavingAvoided",
7872         "Ignoring UserIC, because interleaving was avoided up front");
7873     InterleaveLoop = false;
7874   } else if (IC == 1 && UserIC <= 1) {
7875     // Tell the user interleaving is not beneficial.
7876     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7877     IntDiagMsg = std::make_pair(
7878         "InterleavingNotBeneficial",
7879         "the cost-model indicates that interleaving is not beneficial");
7880     InterleaveLoop = false;
7881     if (UserIC == 1) {
7882       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7883       IntDiagMsg.second +=
7884           " and is explicitly disabled or interleave count is set to 1";
7885     }
7886   } else if (IC > 1 && UserIC == 1) {
7887     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7888     LLVM_DEBUG(
7889         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7890     IntDiagMsg = std::make_pair(
7891         "InterleavingBeneficialButDisabled",
7892         "the cost-model indicates that interleaving is beneficial "
7893         "but is explicitly disabled or interleave count is set to 1");
7894     InterleaveLoop = false;
7895   }
7896 
7897   // Override IC if user provided an interleave count.
7898   IC = UserIC > 0 ? UserIC : IC;
7899 
7900   // Emit diagnostic messages, if any.
7901   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7902   if (!VectorizeLoop && !InterleaveLoop) {
7903     // Do not vectorize or interleave the loop.
7904 ORE->emit([&]() { 7905 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7906 L->getStartLoc(), L->getHeader()) 7907 << VecDiagMsg.second; 7908 }); 7909 ORE->emit([&]() { 7910 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7911 L->getStartLoc(), L->getHeader()) 7912 << IntDiagMsg.second; 7913 }); 7914 return false; 7915 } else if (!VectorizeLoop && InterleaveLoop) { 7916 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7917 ORE->emit([&]() { 7918 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7919 L->getStartLoc(), L->getHeader()) 7920 << VecDiagMsg.second; 7921 }); 7922 } else if (VectorizeLoop && !InterleaveLoop) { 7923 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7924 << ") in " << DebugLocStr << '\n'); 7925 ORE->emit([&]() { 7926 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7927 L->getStartLoc(), L->getHeader()) 7928 << IntDiagMsg.second; 7929 }); 7930 } else if (VectorizeLoop && InterleaveLoop) { 7931 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7932 << ") in " << DebugLocStr << '\n'); 7933 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7934 } 7935 7936 LVP.setBestPlan(VF.Width, IC); 7937 7938 using namespace ore; 7939 bool DisableRuntimeUnroll = false; 7940 MDNode *OrigLoopID = L->getLoopID(); 7941 7942 if (!VectorizeLoop) { 7943 assert(IC > 1 && "interleave count should not be 1 or 0"); 7944 // If we decided that it is not legal to vectorize the loop, then 7945 // interleave it. 7946 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7947 &CM); 7948 LVP.executePlan(Unroller, DT); 7949 7950 ORE->emit([&]() { 7951 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7952 L->getHeader()) 7953 << "interleaved loop (interleaved count: " 7954 << NV("InterleaveCount", IC) << ")"; 7955 }); 7956 } else { 7957 // If we decided that it is *legal* to vectorize the loop, then do it. 7958 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7959 &LVL, &CM); 7960 LVP.executePlan(LB, DT); 7961 ++LoopsVectorized; 7962 7963 // Add metadata to disable runtime unrolling a scalar loop when there are 7964 // no runtime checks about strides and memory. A scalar loop that is 7965 // rarely used is not worth unrolling. 7966 if (!LB.areSafetyChecksAdded()) 7967 DisableRuntimeUnroll = true; 7968 7969 // Report the vectorization decision. 7970 ORE->emit([&]() { 7971 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7972 L->getHeader()) 7973 << "vectorized loop (vectorization width: " 7974 << NV("VectorizationFactor", VF.Width) 7975 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7976 }); 7977 } 7978 7979 Optional<MDNode *> RemainderLoopID = 7980 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7981 LLVMLoopVectorizeFollowupEpilogue}); 7982 if (RemainderLoopID.hasValue()) { 7983 L->setLoopID(RemainderLoopID.getValue()); 7984 } else { 7985 if (DisableRuntimeUnroll) 7986 AddRuntimeUnrollDisableMetaData(L); 7987 7988 // Mark the loop as already vectorized to avoid vectorizing again. 
7989     Hints.setAlreadyVectorized();
7990   }
7991 
7992   assert(!verifyFunction(*L->getHeader()->getParent()));
7993   return true;
7994 }
7995 
7996 LoopVectorizeResult LoopVectorizePass::runImpl(
7997     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7998     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7999     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
8000     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8001     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8002   SE = &SE_;
8003   LI = &LI_;
8004   TTI = &TTI_;
8005   DT = &DT_;
8006   BFI = &BFI_;
8007   TLI = TLI_;
8008   AA = &AA_;
8009   AC = &AC_;
8010   GetLAA = &GetLAA_;
8011   DB = &DB_;
8012   ORE = &ORE_;
8013   PSI = PSI_;
8014 
8015   // Don't attempt if
8016   // 1. the target claims to have no vector registers, and
8017   // 2. interleaving won't help ILP.
8018   //
8019   // The second condition is necessary because, even if the target has no
8020   // vector registers, loop vectorization may still enable scalar
8021   // interleaving.
8022   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8023       TTI->getMaxInterleaveFactor(1) < 2)
8024     return LoopVectorizeResult(false, false);
8025 
8026   bool Changed = false, CFGChanged = false;
8027 
8028   // The vectorizer requires loops to be in simplified form.
8029   // Since simplification may add new inner loops, it has to run before the
8030   // legality and profitability checks. This means running the loop vectorizer
8031   // will simplify all loops, regardless of whether anything ends up being
8032   // vectorized.
8033   for (auto &L : *LI)
8034     Changed |= CFGChanged |=
8035         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8036 
8037   // Build up a worklist of inner-loops to vectorize. This is necessary as
8038   // the act of vectorizing or partially unrolling a loop creates new loops
8039   // and can invalidate iterators across the loops.
8040   SmallVector<Loop *, 8> Worklist;
8041 
8042   for (Loop *L : *LI)
8043     collectSupportedLoops(*L, LI, ORE, Worklist);
8044 
8045   LoopsAnalyzed += Worklist.size();
8046 
8047   // Now walk the identified inner loops.
8048   while (!Worklist.empty()) {
8049     Loop *L = Worklist.pop_back_val();
8050 
8051     // For the inner loops we actually process, form LCSSA to simplify the
8052     // transform.
8053     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8054 
8055     Changed |= CFGChanged |= processLoop(L);
8056   }
8057 
8058   // Process each loop nest in the function.
8059   return LoopVectorizeResult(Changed, CFGChanged);
8060 }
8061 
8062 PreservedAnalyses LoopVectorizePass::run(Function &F,
8063                                          FunctionAnalysisManager &AM) {
8064   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8065   auto &LI = AM.getResult<LoopAnalysis>(F);
8066   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8067   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8068   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8069   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8070   auto &AA = AM.getResult<AAManager>(F);
8071   auto &AC = AM.getResult<AssumptionAnalysis>(F);
8072   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8073   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8074   MemorySSA *MSSA = EnableMSSALoopDependency
8075                         ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8076 : nullptr; 8077 8078 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8079 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8080 [&](Loop &L) -> const LoopAccessInfo & { 8081 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8082 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8083 }; 8084 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8085 ProfileSummaryInfo *PSI = 8086 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8087 LoopVectorizeResult Result = 8088 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8089 if (!Result.MadeAnyChange) 8090 return PreservedAnalyses::all(); 8091 PreservedAnalyses PA; 8092 8093 // We currently do not preserve loopinfo/dominator analyses with outer loop 8094 // vectorization. Until this is addressed, mark these analyses as preserved 8095 // only for non-VPlan-native path. 8096 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8097 if (!EnableVPlanNativePath) { 8098 PA.preserve<LoopAnalysis>(); 8099 PA.preserve<DominatorTreeAnalysis>(); 8100 } 8101 PA.preserve<BasicAA>(); 8102 PA.preserve<GlobalsAA>(); 8103 if (!Result.MadeCFGChange) 8104 PA.preserveSet<CFGAnalyses>(); 8105 return PA; 8106 } 8107