//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
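//
// As a purely illustrative sketch (not part of the pass itself): with a
// vectorization factor of 4, a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1.0f;
// is conceptually rewritten so that each iteration of the new loop processes
// a[i..i+3] and b[i..i+3] with vector instructions and increments the index
// by 4, with a scalar epilogue loop handling the remaining n % 4 iterations.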
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
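///
/// For example (illustrative only): the accesses A[3*i] and A[3*i+2] form an
/// interleave group of factor 3 with a gap at index 1; when the group is
/// vectorized with a single wide load, the elements belonging to the gap must
/// either be masked away or be handled by a scalar epilogue iteration.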
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
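  // As an illustrative example: on typical data layouts an i1 has a size of
  // 1 bit but an allocation size of 8 bits, so for VF == 1 it is considered
  // irregular by the check below.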
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns the exact trip count if it is known.
///   2) Returns the expected trip count according to profile data if any.
///   3) Returns an upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if an upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
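///
/// As a purely illustrative example: with VF = 4 and a trip count of 10, the
/// vector loop executes two iterations covering 8 scalar iterations, and the
/// epilogue (scalar) loop executes the remaining 2 iterations.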
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses.
  /// When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints at how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
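///
/// As a purely illustrative example: if the target reports that a strided
/// load must be scalarized while the surrounding arithmetic vectorizes
/// cheaply, the per-VF cost comparison below may still conclude that VF = 1
/// (no vectorization) is the most profitable choice.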
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// its form after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This map of decisions is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for a scalarized memory instruction.
1357 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1358
1359 /// The cost computation for an interleaving group of memory instructions.
1360 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1361
1362 /// The cost computation for a Gather/Scatter instruction.
1363 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1364
1365 /// The cost computation for widening instruction \p I with consecutive
1366 /// memory access.
1367 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1368
1369 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1370 /// Load: scalar load + broadcast.
1371 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1372 /// element)
1373 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1374
1375 /// Estimate the overhead of scalarizing an instruction. This is a
1376 /// convenience wrapper for the type-based getScalarizationOverhead API.
1377 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1378
1379 /// Returns whether the instruction is a load or store and will be emitted
1380 /// as a vector operation.
1381 bool isConsecutiveLoadOrStore(Instruction *I);
1382
1383 /// Returns true if an artificially high cost for emulated masked memrefs
1384 /// should be used.
1385 bool useEmulatedMaskMemRefHack(Instruction *I);
1386
1387 /// Map of scalar integer values to the smallest bitwidth they can be legally
1388 /// represented as. The vector equivalents of these values should be truncated
1389 /// to this type.
1390 MapVector<Instruction *, uint64_t> MinBWs;
1391
1392 /// A type representing the costs for instructions if they were to be
1393 /// scalarized rather than vectorized. The entries are Instruction-Cost
1394 /// pairs.
1395 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1396
1397 /// A set containing all BasicBlocks that are known to be present after
1398 /// vectorization as predicated blocks.
1399 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1400
1401 /// Records whether it is allowed to have the original scalar loop execute at
1402 /// least once. This may be needed as a fallback loop in case runtime
1403 /// aliasing/dependence checks fail, or to handle the tail/remainder
1404 /// iterations when the trip count is unknown or doesn't divide by the VF,
1405 /// or as a peel-loop to handle gaps in interleave-groups.
1406 /// Under optsize and when the trip count is very small we don't allow any
1407 /// iterations to execute in the scalar loop.
1408 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1409
1410 /// All blocks of the loop are to be masked to fold the tail of the scalar
1411 /// iterations.
bool FoldTailByMasking = false;
1412
1413 /// A map holding scalar costs for different vectorization factors. The
1414 /// presence of a cost for an instruction in the mapping indicates that the
1415 /// instruction will be scalarized when vectorizing with the associated
1416 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1417 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1418
1419 /// Holds the instructions known to be uniform after vectorization.
1420 /// The data is collected per VF.
1421 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1422
1423 /// Holds the instructions known to be scalar after vectorization.
1424 /// The data is collected per VF.
1425 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1426 1427 /// Holds the instructions (address computations) that are forced to be 1428 /// scalarized. 1429 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1430 1431 /// Returns the expected difference in cost from scalarizing the expression 1432 /// feeding a predicated instruction \p PredInst. The instructions to 1433 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1434 /// non-negative return value implies the expression will be scalarized. 1435 /// Currently, only single-use chains are considered for scalarization. 1436 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1437 unsigned VF); 1438 1439 /// Collect the instructions that are uniform after vectorization. An 1440 /// instruction is uniform if we represent it with a single scalar value in 1441 /// the vectorized loop corresponding to each vector iteration. Examples of 1442 /// uniform instructions include pointer operands of consecutive or 1443 /// interleaved memory accesses. Note that although uniformity implies an 1444 /// instruction will be scalar, the reverse is not true. In general, a 1445 /// scalarized instruction will be represented by VF scalar values in the 1446 /// vectorized loop, each corresponding to an iteration of the original 1447 /// scalar loop. 1448 void collectLoopUniforms(unsigned VF); 1449 1450 /// Collect the instructions that are scalar after vectorization. An 1451 /// instruction is scalar if it is known to be uniform or will be scalarized 1452 /// during vectorization. Non-uniform scalarized instructions will be 1453 /// represented by VF values in the vectorized loop, each corresponding to an 1454 /// iteration of the original scalar loop. 1455 void collectLoopScalars(unsigned VF); 1456 1457 /// Keeps cost model vectorization decision and cost for instructions. 1458 /// Right now it is used for memory instructions only. 1459 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1460 std::pair<InstWidening, unsigned>>; 1461 1462 DecisionList WideningDecisions; 1463 1464 /// Returns true if \p V is expected to be vectorized and it needs to be 1465 /// extracted. 1466 bool needsExtract(Value *V, unsigned VF) const { 1467 Instruction *I = dyn_cast<Instruction>(V); 1468 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1469 return false; 1470 1471 // Assume we can vectorize V (and hence we need extraction) if the 1472 // scalars are not computed yet. This can happen, because it is called 1473 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1474 // the scalars are collected. That should be a safe assumption in most 1475 // cases, because we check if the operands have vectorizable types 1476 // beforehand in LoopVectorizationLegality. 1477 return Scalars.find(VF) == Scalars.end() || 1478 !isScalarAfterVectorization(I, VF); 1479 }; 1480 1481 /// Returns a range containing only operands needing to be extracted. 1482 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1483 unsigned VF) { 1484 return SmallVector<Value *, 4>(make_filter_range( 1485 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1486 } 1487 1488 public: 1489 /// The loop that we evaluate. 1490 Loop *TheLoop; 1491 1492 /// Predicated scalar evolution analysis. 1493 PredicatedScalarEvolution &PSE; 1494 1495 /// Loop Info analysis. 1496 LoopInfo *LI; 1497 1498 /// Vectorization legality. 
1499 LoopVectorizationLegality *Legal; 1500 1501 /// Vector target information. 1502 const TargetTransformInfo &TTI; 1503 1504 /// Target Library Info. 1505 const TargetLibraryInfo *TLI; 1506 1507 /// Demanded bits analysis. 1508 DemandedBits *DB; 1509 1510 /// Assumption cache. 1511 AssumptionCache *AC; 1512 1513 /// Interface to emit optimization remarks. 1514 OptimizationRemarkEmitter *ORE; 1515 1516 const Function *TheFunction; 1517 1518 /// Loop Vectorize Hint. 1519 const LoopVectorizeHints *Hints; 1520 1521 /// The interleave access information contains groups of interleaved accesses 1522 /// with the same stride and close to each other. 1523 InterleavedAccessInfo &InterleaveInfo; 1524 1525 /// Values to ignore in the cost model. 1526 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1527 1528 /// Values to ignore in the cost model when VF > 1. 1529 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1530 }; 1531 1532 } // end namespace llvm 1533 1534 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1535 // vectorization. The loop needs to be annotated with #pragma omp simd 1536 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1537 // vector length information is not provided, vectorization is not considered 1538 // explicit. Interleave hints are not allowed either. These limitations will be 1539 // relaxed in the future. 1540 // Please, note that we are currently forced to abuse the pragma 'clang 1541 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1542 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1543 // provides *explicit vectorization hints* (LV can bypass legal checks and 1544 // assume that vectorization is legal). However, both hints are implemented 1545 // using the same metadata (llvm.loop.vectorize, processed by 1546 // LoopVectorizeHints). This will be fixed in the future when the native IR 1547 // representation for pragma 'omp simd' is introduced. 1548 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1549 OptimizationRemarkEmitter *ORE) { 1550 assert(!OuterLp->empty() && "This is not an outer loop"); 1551 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1552 1553 // Only outer loops with an explicit vectorization hint are supported. 1554 // Unannotated outer loops are ignored. 1555 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1556 return false; 1557 1558 Function *Fn = OuterLp->getHeader()->getParent(); 1559 if (!Hints.allowVectorization(Fn, OuterLp, 1560 true /*VectorizeOnlyWhenForced*/)) { 1561 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1562 return false; 1563 } 1564 1565 if (Hints.getInterleave() > 1) { 1566 // TODO: Interleave support is future work. 1567 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1568 "outer loops.\n"); 1569 Hints.emitRemarkWithHints(); 1570 return false; 1571 } 1572 1573 return true; 1574 } 1575 1576 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1577 OptimizationRemarkEmitter *ORE, 1578 SmallVectorImpl<Loop *> &V) { 1579 // Collect inner loops and outer loops without irreducible control flow. For 1580 // now, only collect outer loops that have explicit vectorization hints. If we 1581 // are stress testing the VPlan H-CFG construction, we collect the outermost 1582 // loop of every loop nest. 
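// For example (illustrative), in a nest 'for (i) { for (j) { ... } }' only the
// innermost j-loop is collected, unless the outer i-loop carries an explicit
// vectorization hint (or stress testing of the H-CFG builder is enabled).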
1583 if (L.empty() || VPlanBuildStressTest || 1584 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1585 LoopBlocksRPO RPOT(&L); 1586 RPOT.perform(LI); 1587 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1588 V.push_back(&L); 1589 // TODO: Collect inner loops inside marked outer loops in case 1590 // vectorization fails for the outer loop. Do not invoke 1591 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1592 // already known to be reducible. We can use an inherited attribute for 1593 // that. 1594 return; 1595 } 1596 } 1597 for (Loop *InnerL : L) 1598 collectSupportedLoops(*InnerL, LI, ORE, V); 1599 } 1600 1601 namespace { 1602 1603 /// The LoopVectorize Pass. 1604 struct LoopVectorize : public FunctionPass { 1605 /// Pass identification, replacement for typeid 1606 static char ID; 1607 1608 LoopVectorizePass Impl; 1609 1610 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1611 bool VectorizeOnlyWhenForced = false) 1612 : FunctionPass(ID), 1613 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1614 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1615 } 1616 1617 bool runOnFunction(Function &F) override { 1618 if (skipFunction(F)) 1619 return false; 1620 1621 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1622 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1623 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1624 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1625 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1626 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1627 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1628 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1629 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1630 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1631 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1632 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1633 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1634 1635 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1636 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1637 1638 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1639 GetLAA, *ORE, PSI).MadeAnyChange; 1640 } 1641 1642 void getAnalysisUsage(AnalysisUsage &AU) const override { 1643 AU.addRequired<AssumptionCacheTracker>(); 1644 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1645 AU.addRequired<DominatorTreeWrapperPass>(); 1646 AU.addRequired<LoopInfoWrapperPass>(); 1647 AU.addRequired<ScalarEvolutionWrapperPass>(); 1648 AU.addRequired<TargetTransformInfoWrapperPass>(); 1649 AU.addRequired<AAResultsWrapperPass>(); 1650 AU.addRequired<LoopAccessLegacyAnalysis>(); 1651 AU.addRequired<DemandedBitsWrapperPass>(); 1652 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1653 AU.addRequired<InjectTLIMappingsLegacy>(); 1654 1655 // We currently do not preserve loopinfo/dominator analyses with outer loop 1656 // vectorization. Until this is addressed, mark these analyses as preserved 1657 // only for non-VPlan-native path. 1658 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1659 if (!EnableVPlanNativePath) { 1660 AU.addPreserved<LoopInfoWrapperPass>(); 1661 AU.addPreserved<DominatorTreeWrapperPass>(); 1662 } 1663 1664 AU.addPreserved<BasicAAWrapperPass>(); 1665 AU.addPreserved<GlobalsAAWrapperPass>(); 1666 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1667 } 1668 }; 1669 1670 } // end anonymous namespace 1671 1672 //===----------------------------------------------------------------------===// 1673 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1674 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1675 //===----------------------------------------------------------------------===// 1676 1677 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1678 // We need to place the broadcast of invariant variables outside the loop, 1679 // but only if it's proven safe to do so. Else, broadcast will be inside 1680 // vector loop body. 1681 Instruction *Instr = dyn_cast<Instruction>(V); 1682 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1683 (!Instr || 1684 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1685 // Place the code for broadcasting invariant variables in the new preheader. 1686 IRBuilder<>::InsertPointGuard Guard(Builder); 1687 if (SafeToHoist) 1688 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1689 1690 // Broadcast the scalar into all locations in the vector. 1691 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1692 1693 return Shuf; 1694 } 1695 1696 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1697 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1698 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1699 "Expected either an induction phi-node or a truncate of it!"); 1700 Value *Start = II.getStartValue(); 1701 1702 // Construct the initial value of the vector IV in the vector loop preheader 1703 auto CurrIP = Builder.saveIP(); 1704 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1705 if (isa<TruncInst>(EntryVal)) { 1706 assert(Start->getType()->isIntegerTy() && 1707 "Truncation requires an integer type"); 1708 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1709 Step = Builder.CreateTrunc(Step, TruncType); 1710 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1711 } 1712 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1713 Value *SteppedStart = 1714 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1715 1716 // We create vector phi nodes for both integer and floating-point induction 1717 // variables. Here, we determine the kind of arithmetic we will perform. 1718 Instruction::BinaryOps AddOp; 1719 Instruction::BinaryOps MulOp; 1720 if (Step->getType()->isIntegerTy()) { 1721 AddOp = Instruction::Add; 1722 MulOp = Instruction::Mul; 1723 } else { 1724 AddOp = II.getInductionOpcode(); 1725 MulOp = Instruction::FMul; 1726 } 1727 1728 // Multiply the vectorization factor by the step using integer or 1729 // floating-point arithmetic as appropriate. 1730 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1731 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1732 1733 // Create a vector splat to use in the induction update. 1734 // 1735 // FIXME: If the step is non-constant, we create the vector splat with 1736 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1737 // handle a constant vector splat. 1738 Value *SplatVF = 1739 isa<Constant>(Mul) 1740 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1741 : Builder.CreateVectorSplat(VF, Mul);
1742 Builder.restoreIP(CurrIP);
1743
1744 // We may need to add the step a number of times, depending on the unroll
1745 // factor. The last of those goes into the PHI.
1746 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1747 &*LoopVectorBody->getFirstInsertionPt());
1748 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1749 Instruction *LastInduction = VecInd;
1750 for (unsigned Part = 0; Part < UF; ++Part) {
1751 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1752
1753 if (isa<TruncInst>(EntryVal))
1754 addMetadata(LastInduction, EntryVal);
1755 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1756
1757 LastInduction = cast<Instruction>(addFastMathFlag(
1758 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1759 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1760 }
1761
1762 // Move the last step to the end of the latch block. This ensures consistent
1763 // placement of all induction updates.
1764 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1765 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1766 auto *ICmp = cast<Instruction>(Br->getCondition());
1767 LastInduction->moveBefore(ICmp);
1768 LastInduction->setName("vec.ind.next");
1769
1770 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1771 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1772 }
1773
1774 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1775 return Cost->isScalarAfterVectorization(I, VF) ||
1776 Cost->isProfitableToScalarize(I, VF);
1777 }
1778
1779 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1780 if (shouldScalarizeInstruction(IV))
1781 return true;
1782 auto isScalarInst = [&](User *U) -> bool {
1783 auto *I = cast<Instruction>(U);
1784 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1785 };
1786 return llvm::any_of(IV->users(), isScalarInst);
1787 }
1788
1789 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1790 const InductionDescriptor &ID, const Instruction *EntryVal,
1791 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1792 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1793 "Expected either an induction phi-node or a truncate of it!");
1794
1795 // This induction variable is not the phi from the original loop but the
1796 // newly-created IV, based on the proof that the casted Phi is equal to the
1797 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1798 // reuses the same InductionDescriptor as the original IV, but we don't have
1799 // to do any recording in this case - that is done when the original IV is
1800 // processed.
1801 if (isa<TruncInst>(EntryVal))
1802 return;
1803
1804 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1805 if (Casts.empty())
1806 return;
1807 // Only the first Cast instruction in the Casts vector is of interest.
1808 // The rest of the Casts (if they exist) have no uses outside the
1809 // induction update chain itself.
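// Illustrative example: a narrow i32 induction phi that is sign-extended to
// i64 for address computation can be recorded as a cast of the induction when
// SCEV proves (possibly under a runtime predicate) that the extended value
// tracks the induction; that cast is the instruction mapped below.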
1810 Instruction *CastInst = *Casts.begin(); 1811 if (Lane < UINT_MAX) 1812 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1813 else 1814 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1815 } 1816 1817 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1818 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1819 "Primary induction variable must have an integer type"); 1820 1821 auto II = Legal->getInductionVars().find(IV); 1822 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1823 1824 auto ID = II->second; 1825 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1826 1827 // The value from the original loop to which we are mapping the new induction 1828 // variable. 1829 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1830 1831 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1832 1833 // Generate code for the induction step. Note that induction steps are 1834 // required to be loop-invariant 1835 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1836 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1837 "Induction step should be loop invariant"); 1838 if (PSE.getSE()->isSCEVable(IV->getType())) { 1839 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1840 return Exp.expandCodeFor(Step, Step->getType(), 1841 LoopVectorPreHeader->getTerminator()); 1842 } 1843 return cast<SCEVUnknown>(Step)->getValue(); 1844 }; 1845 1846 // The scalar value to broadcast. This is derived from the canonical 1847 // induction variable. If a truncation type is given, truncate the canonical 1848 // induction variable and step. Otherwise, derive these values from the 1849 // induction descriptor. 1850 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1851 Value *ScalarIV = Induction; 1852 if (IV != OldInduction) { 1853 ScalarIV = IV->getType()->isIntegerTy() 1854 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1855 : Builder.CreateCast(Instruction::SIToFP, Induction, 1856 IV->getType()); 1857 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1858 ScalarIV->setName("offset.idx"); 1859 } 1860 if (Trunc) { 1861 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1862 assert(Step->getType()->isIntegerTy() && 1863 "Truncation requires an integer step"); 1864 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1865 Step = Builder.CreateTrunc(Step, TruncType); 1866 } 1867 return ScalarIV; 1868 }; 1869 1870 // Create the vector values from the scalar IV, in the absence of creating a 1871 // vector IV. 1872 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1873 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1874 for (unsigned Part = 0; Part < UF; ++Part) { 1875 Value *EntryPart = 1876 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1877 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1878 if (Trunc) 1879 addMetadata(EntryPart, Trunc); 1880 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1881 } 1882 }; 1883 1884 // Now do the actual transformations, and start with creating the step value. 1885 Value *Step = CreateStepValue(ID.getStep()); 1886 if (VF <= 1) { 1887 Value *ScalarIV = CreateScalarIV(Step); 1888 CreateSplatIV(ScalarIV, Step); 1889 return; 1890 } 1891 1892 // Determine if we want a scalar version of the induction variable. 
This is 1893 // true if the induction variable itself is not widened, or if it has at 1894 // least one user in the loop that is not widened. 1895 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1896 if (!NeedsScalarIV) { 1897 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1898 return; 1899 } 1900 1901 // Try to create a new independent vector induction variable. If we can't 1902 // create the phi node, we will splat the scalar induction variable in each 1903 // loop iteration. 1904 if (!shouldScalarizeInstruction(EntryVal)) { 1905 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1906 Value *ScalarIV = CreateScalarIV(Step); 1907 // Create scalar steps that can be used by instructions we will later 1908 // scalarize. Note that the addition of the scalar steps will not increase 1909 // the number of instructions in the loop in the common case prior to 1910 // InstCombine. We will be trading one vector extract for each scalar step. 1911 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1912 return; 1913 } 1914 1915 // All IV users are scalar instructions, so only emit a scalar IV, not a 1916 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 1917 // predicate used by the masked loads/stores. 1918 Value *ScalarIV = CreateScalarIV(Step); 1919 if (!Cost->isScalarEpilogueAllowed()) 1920 CreateSplatIV(ScalarIV, Step); 1921 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1922 } 1923 1924 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1925 Instruction::BinaryOps BinOp) { 1926 // Create and check the types. 1927 auto *ValVTy = cast<VectorType>(Val->getType()); 1928 int VLen = ValVTy->getNumElements(); 1929 1930 Type *STy = Val->getType()->getScalarType(); 1931 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1932 "Induction Step must be an integer or FP"); 1933 assert(Step->getType() == STy && "Step has wrong type"); 1934 1935 SmallVector<Constant *, 8> Indices; 1936 1937 if (STy->isIntegerTy()) { 1938 // Create a vector of consecutive numbers from zero to VF. 1939 for (int i = 0; i < VLen; ++i) 1940 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1941 1942 // Add the consecutive indices to the vector value. 1943 Constant *Cv = ConstantVector::get(Indices); 1944 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1945 Step = Builder.CreateVectorSplat(VLen, Step); 1946 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1947 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1948 // which can be found from the original scalar operations. 1949 Step = Builder.CreateMul(Cv, Step); 1950 return Builder.CreateAdd(Val, Step, "induction"); 1951 } 1952 1953 // Floating point induction. 1954 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1955 "Binary Opcode should be specified for FP induction"); 1956 // Create a vector of consecutive numbers from zero to VF. 1957 for (int i = 0; i < VLen; ++i) 1958 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1959 1960 // Add the consecutive indices to the vector value. 1961 Constant *Cv = ConstantVector::get(Indices); 1962 1963 Step = Builder.CreateVectorSplat(VLen, Step); 1964 1965 // Floating point operations had to be 'fast' to enable the induction. 
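// For illustration, with VLen = 4, StartIdx = 0 and an FAdd induction with
// step %s, the code below forms <0.0, 1.0, 2.0, 3.0> * %s and FAdds it to the
// broadcast value in Val, with fast-math flags set on the new instructions.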
1966 FastMathFlags Flags; 1967 Flags.setFast(); 1968 1969 Value *MulOp = Builder.CreateFMul(Cv, Step); 1970 if (isa<Instruction>(MulOp)) 1971 // Have to check, MulOp may be a constant 1972 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1973 1974 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1975 if (isa<Instruction>(BOp)) 1976 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1977 return BOp; 1978 } 1979 1980 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1981 Instruction *EntryVal, 1982 const InductionDescriptor &ID) { 1983 // We shouldn't have to build scalar steps if we aren't vectorizing. 1984 assert(VF > 1 && "VF should be greater than one"); 1985 1986 // Get the value type and ensure it and the step have the same integer type. 1987 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1988 assert(ScalarIVTy == Step->getType() && 1989 "Val and Step should have the same type"); 1990 1991 // We build scalar steps for both integer and floating-point induction 1992 // variables. Here, we determine the kind of arithmetic we will perform. 1993 Instruction::BinaryOps AddOp; 1994 Instruction::BinaryOps MulOp; 1995 if (ScalarIVTy->isIntegerTy()) { 1996 AddOp = Instruction::Add; 1997 MulOp = Instruction::Mul; 1998 } else { 1999 AddOp = ID.getInductionOpcode(); 2000 MulOp = Instruction::FMul; 2001 } 2002 2003 // Determine the number of scalars we need to generate for each unroll 2004 // iteration. If EntryVal is uniform, we only need to generate the first 2005 // lane. Otherwise, we generate all VF values. 2006 unsigned Lanes = 2007 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 2008 : VF; 2009 // Compute the scalar steps and save the results in VectorLoopValueMap. 2010 for (unsigned Part = 0; Part < UF; ++Part) { 2011 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2012 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 2013 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2014 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2015 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2016 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2017 } 2018 } 2019 } 2020 2021 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2022 assert(V != Induction && "The new induction variable should not be used."); 2023 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2024 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2025 2026 // If we have a stride that is replaced by one, do it here. Defer this for 2027 // the VPlan-native path until we start running Legal checks in that path. 2028 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2029 V = ConstantInt::get(V->getType(), 1); 2030 2031 // If we have a vector mapped to this value, return it. 2032 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2033 return VectorLoopValueMap.getVectorValue(V, Part); 2034 2035 // If the value has not been vectorized, check if it has been scalarized 2036 // instead. If it has been scalarized, and we actually need the value in 2037 // vector form, we will construct the vector values on demand. 2038 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2039 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2040 2041 // If we've scalarized a value, that value should be an instruction. 
2042 auto *I = cast<Instruction>(V); 2043 2044 // If we aren't vectorizing, we can just copy the scalar map values over to 2045 // the vector map. 2046 if (VF == 1) { 2047 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2048 return ScalarValue; 2049 } 2050 2051 // Get the last scalar instruction we generated for V and Part. If the value 2052 // is known to be uniform after vectorization, this corresponds to lane zero 2053 // of the Part unroll iteration. Otherwise, the last instruction is the one 2054 // we created for the last vector lane of the Part unroll iteration. 2055 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2056 auto *LastInst = cast<Instruction>( 2057 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2058 2059 // Set the insert point after the last scalarized instruction. This ensures 2060 // the insertelement sequence will directly follow the scalar definitions. 2061 auto OldIP = Builder.saveIP(); 2062 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2063 Builder.SetInsertPoint(&*NewIP); 2064 2065 // However, if we are vectorizing, we need to construct the vector values. 2066 // If the value is known to be uniform after vectorization, we can just 2067 // broadcast the scalar value corresponding to lane zero for each unroll 2068 // iteration. Otherwise, we construct the vector values using insertelement 2069 // instructions. Since the resulting vectors are stored in 2070 // VectorLoopValueMap, we will only generate the insertelements once. 2071 Value *VectorValue = nullptr; 2072 if (Cost->isUniformAfterVectorization(I, VF)) { 2073 VectorValue = getBroadcastInstrs(ScalarValue); 2074 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2075 } else { 2076 // Initialize packing with insertelements to start from undef. 2077 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2078 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2079 for (unsigned Lane = 0; Lane < VF; ++Lane) 2080 packScalarIntoVectorValue(V, {Part, Lane}); 2081 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2082 } 2083 Builder.restoreIP(OldIP); 2084 return VectorValue; 2085 } 2086 2087 // If this scalar is unknown, assume that it is a constant or that it is 2088 // loop invariant. Broadcast V and save the value for future uses. 2089 Value *B = getBroadcastInstrs(V); 2090 VectorLoopValueMap.setVectorValue(V, Part, B); 2091 return B; 2092 } 2093 2094 Value * 2095 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2096 const VPIteration &Instance) { 2097 // If the value is not an instruction contained in the loop, it should 2098 // already be scalar. 2099 if (OrigLoop->isLoopInvariant(V)) 2100 return V; 2101 2102 assert(Instance.Lane > 0 2103 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2104 : true && "Uniform values only have lane zero"); 2105 2106 // If the value from the original loop has not been vectorized, it is 2107 // represented by UF x VF scalar values in the new loop. Return the requested 2108 // scalar value. 2109 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2110 return VectorLoopValueMap.getScalarValue(V, Instance); 2111 2112 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2113 // for the given unroll part. If this entry is not a vector type (i.e., the 2114 // vectorization factor is one), there is no need to generate an 2115 // extractelement instruction. 
2116 auto *U = getOrCreateVectorValue(V, Instance.Part); 2117 if (!U->getType()->isVectorTy()) { 2118 assert(VF == 1 && "Value not scalarized has non-vector type"); 2119 return U; 2120 } 2121 2122 // Otherwise, the value from the original loop has been vectorized and is 2123 // represented by UF vector values. Extract and return the requested scalar 2124 // value from the appropriate vector lane. 2125 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2126 } 2127 2128 void InnerLoopVectorizer::packScalarIntoVectorValue( 2129 Value *V, const VPIteration &Instance) { 2130 assert(V != Induction && "The new induction variable should not be used."); 2131 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2132 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2133 2134 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2135 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2136 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2137 Builder.getInt32(Instance.Lane)); 2138 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2139 } 2140 2141 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2142 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2143 SmallVector<int, 8> ShuffleMask; 2144 for (unsigned i = 0; i < VF; ++i) 2145 ShuffleMask.push_back(VF - i - 1); 2146 2147 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2148 ShuffleMask, "reverse"); 2149 } 2150 2151 // Return whether we allow using masked interleave-groups (for dealing with 2152 // strided loads/stores that reside in predicated blocks, or for dealing 2153 // with gaps). 2154 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2155 // If an override option has been passed in for interleaved accesses, use it. 2156 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2157 return EnableMaskedInterleavedMemAccesses; 2158 2159 return TTI.enableMaskedInterleavedAccessVectorization(); 2160 } 2161 2162 // Try to vectorize the interleave group that \p Instr belongs to. 2163 // 2164 // E.g. Translate following interleaved load group (factor = 3): 2165 // for (i = 0; i < N; i+=3) { 2166 // R = Pic[i]; // Member of index 0 2167 // G = Pic[i+1]; // Member of index 1 2168 // B = Pic[i+2]; // Member of index 2 2169 // ... // do something to R, G, B 2170 // } 2171 // To: 2172 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2173 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2174 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2175 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2176 // 2177 // Or translate following interleaved store group (factor = 3): 2178 // for (i = 0; i < N; i+=3) { 2179 // ... 
do something to R, G, B
2180 // Pic[i] = R; // Member of index 0
2181 // Pic[i+1] = G; // Member of index 1
2182 // Pic[i+2] = B; // Member of index 2
2183 // }
2184 // To:
2185 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2186 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2187 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2188 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2189 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2190 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2191 const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2192 VPValue *Addr, VPValue *BlockInMask) {
2193 Instruction *Instr = Group->getInsertPos();
2194 const DataLayout &DL = Instr->getModule()->getDataLayout();
2195
2196 // Prepare for the vector type of the interleaved load/store.
2197 Type *ScalarTy = getMemInstValueType(Instr);
2198 unsigned InterleaveFactor = Group->getFactor();
2199 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2200
2201 // Prepare for the new pointers.
2202 SmallVector<Value *, 2> AddrParts;
2203 unsigned Index = Group->getIndex(Instr);
2204
2205 // TODO: extend the masked interleaved-group support to reversed access.
2206 assert((!BlockInMask || !Group->isReverse()) &&
2207 "Reversed masked interleave-group not supported.");
2208
2209 // If the group is reversed, adjust the index to refer to the last vector lane
2210 // instead of the first. We adjust the index from the first vector lane,
2211 // rather than directly getting the pointer for lane VF - 1, because the
2212 // pointer operand of the interleaved access is supposed to be uniform. For
2213 // uniform instructions, we're only required to generate a value for the
2214 // first vector lane in each unroll iteration.
2215 if (Group->isReverse())
2216 Index += (VF - 1) * Group->getFactor();
2217
2218 for (unsigned Part = 0; Part < UF; Part++) {
2219 Value *AddrPart = State.get(Addr, {Part, 0});
2220 setDebugLocFromInst(Builder, AddrPart);
2221
2222 // Note that the current instruction could be at any member index. We need
2223 // to adjust the address to the member of index 0.
2224 //
2225 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2226 // b = A[i]; // Member of index 0
2227 // The current pointer points to A[i+1]; adjust it to A[i].
2228 //
2229 // E.g. A[i+1] = a; // Member of index 1
2230 // A[i] = b; // Member of index 0
2231 // A[i+2] = c; // Member of index 2 (Current instruction)
2232 // The current pointer points to A[i+2]; adjust it to A[i].
2233
2234 bool InBounds = false;
2235 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2236 InBounds = gep->isInBounds();
2237 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2238 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2239
2240 // Cast to the vector pointer type.
2241 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2242 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2243 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2244 }
2245
2246 setDebugLocFromInst(Builder, Instr);
2247 Value *UndefVec = UndefValue::get(VecTy);
2248
2249 Value *MaskForGaps = nullptr;
2250 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2251 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2252 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2253 }
2254
2255 // Vectorize the interleaved load group.
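// For example (illustrative), with VF = 4 and an interleave factor of 3, a
// block mask <m0, m1, m2, m3> is replicated below to
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, if the group has
// gaps, combined with MaskForGaps before issuing the masked wide load.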
2256 if (isa<LoadInst>(Instr)) { 2257 // For each unroll part, create a wide load for the group. 2258 SmallVector<Value *, 2> NewLoads; 2259 for (unsigned Part = 0; Part < UF; Part++) { 2260 Instruction *NewLoad; 2261 if (BlockInMask || MaskForGaps) { 2262 assert(useMaskedInterleavedAccesses(*TTI) && 2263 "masked interleaved groups are not allowed."); 2264 Value *GroupMask = MaskForGaps; 2265 if (BlockInMask) { 2266 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2267 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2268 Value *ShuffledMask = Builder.CreateShuffleVector( 2269 BlockInMaskPart, Undefs, 2270 createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); 2271 GroupMask = MaskForGaps 2272 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2273 MaskForGaps) 2274 : ShuffledMask; 2275 } 2276 NewLoad = 2277 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2278 GroupMask, UndefVec, "wide.masked.vec"); 2279 } 2280 else 2281 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2282 Group->getAlign(), "wide.vec"); 2283 Group->addMetadata(NewLoad); 2284 NewLoads.push_back(NewLoad); 2285 } 2286 2287 // For each member in the group, shuffle out the appropriate data from the 2288 // wide loads. 2289 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2290 Instruction *Member = Group->getMember(I); 2291 2292 // Skip the gaps in the group. 2293 if (!Member) 2294 continue; 2295 2296 auto StrideMask = createStrideMask(I, InterleaveFactor, VF); 2297 for (unsigned Part = 0; Part < UF; Part++) { 2298 Value *StridedVec = Builder.CreateShuffleVector( 2299 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2300 2301 // If this member has different type, cast the result type. 2302 if (Member->getType() != ScalarTy) { 2303 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2304 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2305 } 2306 2307 if (Group->isReverse()) 2308 StridedVec = reverseVector(StridedVec); 2309 2310 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2311 } 2312 } 2313 return; 2314 } 2315 2316 // The sub vector type for current instruction. 2317 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2318 2319 // Vectorize the interleaved store group. 2320 for (unsigned Part = 0; Part < UF; Part++) { 2321 // Collect the stored vector from each member. 2322 SmallVector<Value *, 4> StoredVecs; 2323 for (unsigned i = 0; i < InterleaveFactor; i++) { 2324 // Interleaved store group doesn't allow a gap, so each index has a member 2325 Instruction *Member = Group->getMember(i); 2326 assert(Member && "Fail to get a member from an interleaved store group"); 2327 2328 Value *StoredVec = getOrCreateVectorValue( 2329 cast<StoreInst>(Member)->getValueOperand(), Part); 2330 if (Group->isReverse()) 2331 StoredVec = reverseVector(StoredVec); 2332 2333 // If this member has different type, cast it to a unified type. 2334 2335 if (StoredVec->getType() != SubVT) 2336 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2337 2338 StoredVecs.push_back(StoredVec); 2339 } 2340 2341 // Concatenate all vectors into a wide vector. 2342 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2343 2344 // Interleave the elements in the wide vector. 
2345 Value *IVec = Builder.CreateShuffleVector( 2346 WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), 2347 "interleaved.vec"); 2348 2349 Instruction *NewStoreInstr; 2350 if (BlockInMask) { 2351 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2352 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2353 Value *ShuffledMask = Builder.CreateShuffleVector( 2354 BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), 2355 "interleaved.mask"); 2356 NewStoreInstr = Builder.CreateMaskedStore( 2357 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2358 } 2359 else 2360 NewStoreInstr = 2361 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2362 2363 Group->addMetadata(NewStoreInstr); 2364 } 2365 } 2366 2367 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2368 VPTransformState &State, 2369 VPValue *Addr, 2370 VPValue *StoredValue, 2371 VPValue *BlockInMask) { 2372 // Attempt to issue a wide load. 2373 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2374 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2375 2376 assert((LI || SI) && "Invalid Load/Store instruction"); 2377 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2378 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2379 2380 LoopVectorizationCostModel::InstWidening Decision = 2381 Cost->getWideningDecision(Instr, VF); 2382 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2383 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2384 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2385 "CM decision is not to widen the memory instruction"); 2386 2387 Type *ScalarDataTy = getMemInstValueType(Instr); 2388 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2389 const Align Alignment = getLoadStoreAlignment(Instr); 2390 2391 // Determine if the pointer operand of the access is either consecutive or 2392 // reverse consecutive. 2393 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2394 bool ConsecutiveStride = 2395 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2396 bool CreateGatherScatter = 2397 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2398 2399 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2400 // gather/scatter. Otherwise Decision should have been to Scalarize. 2401 assert((ConsecutiveStride || CreateGatherScatter) && 2402 "The instruction should be scalarized"); 2403 (void)ConsecutiveStride; 2404 2405 VectorParts BlockInMaskParts(UF); 2406 bool isMaskRequired = BlockInMask; 2407 if (isMaskRequired) 2408 for (unsigned Part = 0; Part < UF; ++Part) 2409 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2410 2411 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2412 // Calculate the pointer for the specific unroll-part. 2413 GetElementPtrInst *PartPtr = nullptr; 2414 2415 bool InBounds = false; 2416 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2417 InBounds = gep->isInBounds(); 2418 2419 if (Reverse) { 2420 // If the address is consecutive but reversed, then the 2421 // wide store needs to start at the last vector element. 
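// E.g. (illustrative), with VF = 4: for Part 0 the two GEPs below apply
// offsets 0 and -3, so the wide access spans elements i-3..i; for Part 1 the
// offsets are -4 and -3, spanning i-7..i-4. The loaded or stored values (and
// any mask) are reversed to match.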
2422 PartPtr = cast<GetElementPtrInst>( 2423 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2424 PartPtr->setIsInBounds(InBounds); 2425 PartPtr = cast<GetElementPtrInst>( 2426 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2427 PartPtr->setIsInBounds(InBounds); 2428 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2429 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2430 } else { 2431 PartPtr = cast<GetElementPtrInst>( 2432 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2433 PartPtr->setIsInBounds(InBounds); 2434 } 2435 2436 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2437 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2438 }; 2439 2440 // Handle Stores: 2441 if (SI) { 2442 setDebugLocFromInst(Builder, SI); 2443 2444 for (unsigned Part = 0; Part < UF; ++Part) { 2445 Instruction *NewSI = nullptr; 2446 Value *StoredVal = State.get(StoredValue, Part); 2447 if (CreateGatherScatter) { 2448 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2449 Value *VectorGep = State.get(Addr, Part); 2450 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2451 MaskPart); 2452 } else { 2453 if (Reverse) { 2454 // If we store to reverse consecutive memory locations, then we need 2455 // to reverse the order of elements in the stored value. 2456 StoredVal = reverseVector(StoredVal); 2457 // We don't want to update the value in the map as it might be used in 2458 // another expression. So don't call resetVectorValue(StoredVal). 2459 } 2460 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2461 if (isMaskRequired) 2462 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2463 BlockInMaskParts[Part]); 2464 else 2465 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2466 } 2467 addMetadata(NewSI, SI); 2468 } 2469 return; 2470 } 2471 2472 // Handle loads. 2473 assert(LI && "Must have a load instruction"); 2474 setDebugLocFromInst(Builder, LI); 2475 for (unsigned Part = 0; Part < UF; ++Part) { 2476 Value *NewLI; 2477 if (CreateGatherScatter) { 2478 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2479 Value *VectorGep = State.get(Addr, Part); 2480 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2481 nullptr, "wide.masked.gather"); 2482 addMetadata(NewLI, LI); 2483 } else { 2484 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2485 if (isMaskRequired) 2486 NewLI = Builder.CreateMaskedLoad( 2487 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2488 "wide.masked.load"); 2489 else 2490 NewLI = 2491 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2492 2493 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2494 addMetadata(NewLI, LI); 2495 if (Reverse) 2496 NewLI = reverseVector(NewLI); 2497 } 2498 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2499 } 2500 } 2501 2502 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2503 const VPIteration &Instance, 2504 bool IfPredicateInstr, 2505 VPTransformState &State) { 2506 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2507 2508 setDebugLocFromInst(Builder, Instr); 2509 2510 // Does this instruction return a value ? 
2511 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2512 2513 Instruction *Cloned = Instr->clone(); 2514 if (!IsVoidRetTy) 2515 Cloned->setName(Instr->getName() + ".cloned"); 2516 2517 // Replace the operands of the cloned instructions with their scalar 2518 // equivalents in the new loop. 2519 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2520 auto *NewOp = State.get(User.getOperand(op), Instance); 2521 Cloned->setOperand(op, NewOp); 2522 } 2523 addNewMetadata(Cloned, Instr); 2524 2525 // Place the cloned scalar in the new loop. 2526 Builder.Insert(Cloned); 2527 2528 // Add the cloned scalar to the scalar map entry. 2529 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2530 2531 // If we just cloned a new assumption, add it the assumption cache. 2532 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2533 if (II->getIntrinsicID() == Intrinsic::assume) 2534 AC->registerAssumption(II); 2535 2536 // End if-block. 2537 if (IfPredicateInstr) 2538 PredicatedInstructions.push_back(Cloned); 2539 } 2540 2541 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2542 Value *End, Value *Step, 2543 Instruction *DL) { 2544 BasicBlock *Header = L->getHeader(); 2545 BasicBlock *Latch = L->getLoopLatch(); 2546 // As we're just creating this loop, it's possible no latch exists 2547 // yet. If so, use the header as this will be a single block loop. 2548 if (!Latch) 2549 Latch = Header; 2550 2551 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2552 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2553 setDebugLocFromInst(Builder, OldInst); 2554 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2555 2556 Builder.SetInsertPoint(Latch->getTerminator()); 2557 setDebugLocFromInst(Builder, OldInst); 2558 2559 // Create i+1 and fill the PHINode. 2560 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2561 Induction->addIncoming(Start, L->getLoopPreheader()); 2562 Induction->addIncoming(Next, Latch); 2563 // Create the compare. 2564 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2565 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2566 2567 // Now we have two terminators. Remove the old one from the block. 2568 Latch->getTerminator()->eraseFromParent(); 2569 2570 return Induction; 2571 } 2572 2573 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2574 if (TripCount) 2575 return TripCount; 2576 2577 assert(L && "Create Trip Count for null loop."); 2578 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2579 // Find the loop boundaries. 2580 ScalarEvolution *SE = PSE.getSE(); 2581 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2582 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2583 "Invalid loop count"); 2584 2585 Type *IdxTy = Legal->getWidestInductionType(); 2586 assert(IdxTy && "No type for induction"); 2587 2588 // The exit count might have the type of i64 while the phi is i32. This can 2589 // happen if we have an induction variable that is sign extended before the 2590 // compare. The only way that we get a backedge taken count is that the 2591 // induction variable was signed and as such will not overflow. In such a case 2592 // truncation is legal. 
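// For instance (illustrative), if the widest induction type IdxTy is i32 but
// the backedge-taken count was computed as an i64 expression, it is truncated
// to i32 below; a narrower count would instead be zero-extended to IdxTy.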
2593 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2594 IdxTy->getPrimitiveSizeInBits()) 2595 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2596 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2597 2598 // Get the total trip count from the count by adding 1. 2599 const SCEV *ExitCount = SE->getAddExpr( 2600 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2601 2602 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2603 2604 // Expand the trip count and place the new instructions in the preheader. 2605 // Notice that the pre-header does not change, only the loop body. 2606 SCEVExpander Exp(*SE, DL, "induction"); 2607 2608 // Count holds the overall loop count (N). 2609 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2610 L->getLoopPreheader()->getTerminator()); 2611 2612 if (TripCount->getType()->isPointerTy()) 2613 TripCount = 2614 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2615 L->getLoopPreheader()->getTerminator()); 2616 2617 return TripCount; 2618 } 2619 2620 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2621 if (VectorTripCount) 2622 return VectorTripCount; 2623 2624 Value *TC = getOrCreateTripCount(L); 2625 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2626 2627 Type *Ty = TC->getType(); 2628 Constant *Step = ConstantInt::get(Ty, VF * UF); 2629 2630 // If the tail is to be folded by masking, round the number of iterations N 2631 // up to a multiple of Step instead of rounding down. This is done by first 2632 // adding Step-1 and then rounding down. Note that it's ok if this addition 2633 // overflows: the vector induction variable will eventually wrap to zero given 2634 // that it starts at zero and its Step is a power of two; the loop will then 2635 // exit, with the last early-exit vector comparison also producing all-true. 2636 if (Cost->foldTailByMasking()) { 2637 assert(isPowerOf2_32(VF * UF) && 2638 "VF*UF must be a power of 2 when folding tail by masking"); 2639 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2640 } 2641 2642 // Now we need to generate the expression for the part of the loop that the 2643 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2644 // iterations are not required for correctness, or N - Step, otherwise. Step 2645 // is equal to the vectorization factor (number of SIMD elements) times the 2646 // unroll factor (number of SIMD instructions). 2647 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2648 2649 // If there is a non-reversed interleaved group that may speculatively access 2650 // memory out-of-bounds, we need to ensure that there will be at least one 2651 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2652 // the trip count, we set the remainder to be equal to the step. If the step 2653 // does not evenly divide the trip count, no adjustment is necessary since 2654 // there will already be scalar iterations. Note that the minimum iterations 2655 // check ensures that N >= Step. 
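// A small worked example (illustrative): with VF * UF = 8 and a trip count of
// 20, R = 20 % 8 = 4 and the vector trip count becomes 16, leaving 4 scalar
// iterations. If the trip count were 24 and a scalar epilogue is required, R
// would be bumped from 0 to 8 below so that 8 scalar iterations remain.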
2656 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2657 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2658 R = Builder.CreateSelect(IsZero, Step, R); 2659 } 2660 2661 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2662 2663 return VectorTripCount; 2664 } 2665 2666 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2667 const DataLayout &DL) { 2668 // Verify that V is a vector type with same number of elements as DstVTy. 2669 unsigned VF = DstVTy->getNumElements(); 2670 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2671 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2672 Type *SrcElemTy = SrcVecTy->getElementType(); 2673 Type *DstElemTy = DstVTy->getElementType(); 2674 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2675 "Vector elements must have same size"); 2676 2677 // Do a direct cast if element types are castable. 2678 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2679 return Builder.CreateBitOrPointerCast(V, DstVTy); 2680 } 2681 // V cannot be directly casted to desired vector type. 2682 // May happen when V is a floating point vector but DstVTy is a vector of 2683 // pointers or vice-versa. Handle this using a two-step bitcast using an 2684 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2685 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2686 "Only one type should be a pointer type"); 2687 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2688 "Only one type should be a floating point type"); 2689 Type *IntTy = 2690 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2691 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2692 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2693 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2694 } 2695 2696 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2697 BasicBlock *Bypass) { 2698 Value *Count = getOrCreateTripCount(L); 2699 // Reuse existing vector loop preheader for TC checks. 2700 // Note that new preheader block is generated for vector loop. 2701 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2702 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2703 2704 // Generate code to check if the loop's trip count is less than VF * UF, or 2705 // equal to it in case a scalar epilogue is required; this implies that the 2706 // vector trip count is zero. This check also covers the case where adding one 2707 // to the backedge-taken count overflowed leading to an incorrect trip count 2708 // of zero. In this case we will also jump to the scalar loop. 2709 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2710 : ICmpInst::ICMP_ULT; 2711 2712 // If tail is to be folded, vector loop takes care of all iterations. 2713 Value *CheckMinIters = Builder.getFalse(); 2714 if (!Cost->foldTailByMasking()) 2715 CheckMinIters = Builder.CreateICmp( 2716 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2717 "min.iters.check"); 2718 2719 // Create new preheader for vector loop. 2720 LoopVectorPreHeader = 2721 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2722 "vector.ph"); 2723 2724 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2725 DT->getNode(Bypass)->getIDom()) && 2726 "TC check is expected to dominate Bypass"); 2727 2728 // Update dominator for Bypass & LoopExit. 
2729 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2730 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2731 2732 ReplaceInstWithInst( 2733 TCCheckBlock->getTerminator(), 2734 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2735 LoopBypassBlocks.push_back(TCCheckBlock); 2736 } 2737 2738 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2739 // Reuse existing vector loop preheader for SCEV checks. 2740 // Note that new preheader block is generated for vector loop. 2741 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2742 2743 // Generate the code to check that the SCEV assumptions that we made. 2744 // We want the new basic block to start at the first instruction in a 2745 // sequence of instructions that form a check. 2746 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2747 "scev.check"); 2748 Value *SCEVCheck = Exp.expandCodeForPredicate( 2749 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2750 2751 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2752 if (C->isZero()) 2753 return; 2754 2755 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2756 "Cannot SCEV check stride or overflow when optimizing for size"); 2757 2758 SCEVCheckBlock->setName("vector.scevcheck"); 2759 // Create new preheader for vector loop. 2760 LoopVectorPreHeader = 2761 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2762 nullptr, "vector.ph"); 2763 2764 // Update dominator only if this is first RT check. 2765 if (LoopBypassBlocks.empty()) { 2766 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2767 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2768 } 2769 2770 ReplaceInstWithInst( 2771 SCEVCheckBlock->getTerminator(), 2772 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2773 LoopBypassBlocks.push_back(SCEVCheckBlock); 2774 AddedSafetyChecks = true; 2775 } 2776 2777 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2778 // VPlan-native path does not do any analysis for runtime checks currently. 2779 if (EnableVPlanNativePath) 2780 return; 2781 2782 // Reuse existing vector loop preheader for runtime memory checks. 2783 // Note that new preheader block is generated for vector loop. 2784 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2785 2786 // Generate the code that checks in runtime if arrays overlap. We put the 2787 // checks into a separate block to make the more common case of few elements 2788 // faster. 
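  // Conceptually, each emitted check compares the accessed ranges of two
  // pointer groups that may alias, roughly (Start0 < End1) && (Start1 < End0),
  // and the per-pair results are OR'ed together; if any pair may overlap, we
  // branch to the scalar loop (Bypass). The exact checks are produced by
  // addRuntimeChecks using LoopAccessAnalysis; this is only a sketch.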
2789 auto *LAI = Legal->getLAI(); 2790 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2791 if (!RtPtrChecking.Need) 2792 return; 2793 Instruction *FirstCheckInst; 2794 Instruction *MemRuntimeCheck; 2795 std::tie(FirstCheckInst, MemRuntimeCheck) = 2796 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2797 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2798 if (!MemRuntimeCheck) 2799 return; 2800 2801 if (MemCheckBlock->getParent()->hasOptSize()) { 2802 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2803 "Cannot emit memory checks when optimizing for size, unless forced " 2804 "to vectorize."); 2805 ORE->emit([&]() { 2806 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2807 L->getStartLoc(), L->getHeader()) 2808 << "Code-size may be reduced by not forcing " 2809 "vectorization, or by source-code modifications " 2810 "eliminating the need for runtime checks " 2811 "(e.g., adding 'restrict')."; 2812 }); 2813 } 2814 2815 MemCheckBlock->setName("vector.memcheck"); 2816 // Create new preheader for vector loop. 2817 LoopVectorPreHeader = 2818 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2819 "vector.ph"); 2820 2821 // Update dominator only if this is first RT check. 2822 if (LoopBypassBlocks.empty()) { 2823 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2824 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2825 } 2826 2827 ReplaceInstWithInst( 2828 MemCheckBlock->getTerminator(), 2829 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2830 LoopBypassBlocks.push_back(MemCheckBlock); 2831 AddedSafetyChecks = true; 2832 2833 // We currently don't use LoopVersioning for the actual loop cloning but we 2834 // still use it to add the noalias metadata. 2835 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2836 PSE.getSE()); 2837 LVer->prepareNoAliasMetadata(); 2838 } 2839 2840 Value *InnerLoopVectorizer::emitTransformedIndex( 2841 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2842 const InductionDescriptor &ID) const { 2843 2844 SCEVExpander Exp(*SE, DL, "induction"); 2845 auto Step = ID.getStep(); 2846 auto StartValue = ID.getStartValue(); 2847 assert(Index->getType() == Step->getType() && 2848 "Index type does not match StepValue type"); 2849 2850 // Note: the IR at this point is broken. We cannot use SE to create any new 2851 // SCEV and then expand it, hoping that SCEV's simplification will give us 2852 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2853 // lead to various SCEV crashes. So all we can do is to use builder and rely 2854 // on InstCombine for future simplifications. Here we handle some trivial 2855 // cases only. 
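  // E.g. the helpers below fold CreateAdd(0, X) to X and CreateMul(1, X) to X
  // directly, instead of asking SCEV to simplify the expression.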
2856 auto CreateAdd = [&B](Value *X, Value *Y) { 2857 assert(X->getType() == Y->getType() && "Types don't match!"); 2858 if (auto *CX = dyn_cast<ConstantInt>(X)) 2859 if (CX->isZero()) 2860 return Y; 2861 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2862 if (CY->isZero()) 2863 return X; 2864 return B.CreateAdd(X, Y); 2865 }; 2866 2867 auto CreateMul = [&B](Value *X, Value *Y) { 2868 assert(X->getType() == Y->getType() && "Types don't match!"); 2869 if (auto *CX = dyn_cast<ConstantInt>(X)) 2870 if (CX->isOne()) 2871 return Y; 2872 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2873 if (CY->isOne()) 2874 return X; 2875 return B.CreateMul(X, Y); 2876 }; 2877 2878 switch (ID.getKind()) { 2879 case InductionDescriptor::IK_IntInduction: { 2880 assert(Index->getType() == StartValue->getType() && 2881 "Index type does not match StartValue type"); 2882 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2883 return B.CreateSub(StartValue, Index); 2884 auto *Offset = CreateMul( 2885 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2886 return CreateAdd(StartValue, Offset); 2887 } 2888 case InductionDescriptor::IK_PtrInduction: { 2889 assert(isa<SCEVConstant>(Step) && 2890 "Expected constant step for pointer induction"); 2891 return B.CreateGEP( 2892 StartValue->getType()->getPointerElementType(), StartValue, 2893 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2894 &*B.GetInsertPoint()))); 2895 } 2896 case InductionDescriptor::IK_FpInduction: { 2897 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2898 auto InductionBinOp = ID.getInductionBinOp(); 2899 assert(InductionBinOp && 2900 (InductionBinOp->getOpcode() == Instruction::FAdd || 2901 InductionBinOp->getOpcode() == Instruction::FSub) && 2902 "Original bin op should be defined for FP induction"); 2903 2904 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2905 2906 // Floating point operations had to be 'fast' to enable the induction. 2907 FastMathFlags Flags; 2908 Flags.setFast(); 2909 2910 Value *MulExp = B.CreateFMul(StepValue, Index); 2911 if (isa<Instruction>(MulExp)) 2912 // We have to check, the MulExp may be a constant. 2913 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2914 2915 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2916 "induction"); 2917 if (isa<Instruction>(BOp)) 2918 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2919 2920 return BOp; 2921 } 2922 case InductionDescriptor::IK_NoInduction: 2923 return nullptr; 2924 } 2925 llvm_unreachable("invalid enum"); 2926 } 2927 2928 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2929 /* 2930 In this function we generate a new loop. The new loop will contain 2931 the vectorized instructions while the old loop will continue to run the 2932 scalar remainder. 2933 2934 [ ] <-- loop iteration number check. 2935 / | 2936 / v 2937 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2938 | / | 2939 | / v 2940 || [ ] <-- vector pre header. 2941 |/ | 2942 | v 2943 | [ ] \ 2944 | [ ]_| <-- vector loop. 2945 | | 2946 | v 2947 | -[ ] <--- middle-block. 2948 | / | 2949 | / v 2950 -|- >[ ] <--- new preheader. 2951 | | 2952 | v 2953 | [ ] \ 2954 | [ ]_| <-- old scalar loop to handle remainder. 2955 \ | 2956 \ v 2957 >[ ] <-- exit block. 2958 ... 2959 */ 2960 2961 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2962 2963 // Some loops have a single integer induction variable, while other loops 2964 // don't. 
One example is c++ iterators that often have multiple pointer 2965 // induction variables. In the code below we also support a case where we 2966 // don't have a single induction variable. 2967 // 2968 // We try to obtain an induction variable from the original loop as hard 2969 // as possible. However if we don't find one that: 2970 // - is an integer 2971 // - counts from zero, stepping by one 2972 // - is the size of the widest induction variable type 2973 // then we create a new one. 2974 OldInduction = Legal->getPrimaryInduction(); 2975 Type *IdxTy = Legal->getWidestInductionType(); 2976 2977 // Split the single block loop into the two loop structure described above. 2978 LoopScalarBody = OrigLoop->getHeader(); 2979 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2980 LoopExitBlock = OrigLoop->getExitBlock(); 2981 assert(LoopExitBlock && "Must have an exit block"); 2982 assert(LoopVectorPreHeader && "Invalid loop structure"); 2983 2984 LoopMiddleBlock = 2985 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2986 LI, nullptr, "middle.block"); 2987 LoopScalarPreHeader = 2988 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2989 nullptr, "scalar.ph"); 2990 // We intentionally don't let SplitBlock to update LoopInfo since 2991 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2992 // LoopVectorBody is explicitly added to the correct place few lines later. 2993 LoopVectorBody = 2994 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2995 nullptr, nullptr, "vector.body"); 2996 2997 // Update dominator for loop exit. 2998 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2999 3000 // Create and register the new vector loop. 3001 Loop *Lp = LI->AllocateLoop(); 3002 Loop *ParentLoop = OrigLoop->getParentLoop(); 3003 3004 // Insert the new loop into the loop nest and register the new basic blocks 3005 // before calling any utilities such as SCEV that require valid LoopInfo. 3006 if (ParentLoop) { 3007 ParentLoop->addChildLoop(Lp); 3008 } else { 3009 LI->addTopLevelLoop(Lp); 3010 } 3011 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3012 3013 // Find the loop boundaries. 3014 Value *Count = getOrCreateTripCount(Lp); 3015 3016 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3017 3018 // Now, compare the new count to zero. If it is zero skip the vector loop and 3019 // jump to the scalar loop. This check also covers the case where the 3020 // backedge-taken count is uint##_max: adding one to it will overflow leading 3021 // to an incorrect trip count of zero. In this (rare) case we will also jump 3022 // to the scalar loop. 3023 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3024 3025 // Generate the code to check any assumptions that we've made for SCEV 3026 // expressions. 3027 emitSCEVChecks(Lp, LoopScalarPreHeader); 3028 3029 // Generate the code that checks in runtime if arrays overlap. We put the 3030 // checks into a separate block to make the more common case of few elements 3031 // faster. 3032 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3033 3034 // Generate the induction variable. 3035 // The loop step is equal to the vectorization factor (num of SIMD elements) 3036 // times the unroll factor (num of SIMD instructions). 
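  // For example, with VF = 4 and UF = 2 the new canonical induction variable
  // advances by 8 on every iteration of the vector loop.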
3037 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3038 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3039 Induction = 3040 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3041 getDebugLocFromInstOrOperands(OldInduction)); 3042 3043 // We are going to resume the execution of the scalar loop. 3044 // Go over all of the induction variables that we found and fix the 3045 // PHIs that are left in the scalar version of the loop. 3046 // The starting values of PHI nodes depend on the counter of the last 3047 // iteration in the vectorized loop. 3048 // If we come from a bypass edge then we need to start from the original 3049 // start value. 3050 3051 // This variable saves the new starting index for the scalar loop. It is used 3052 // to test if there are any tail iterations left once the vector loop has 3053 // completed. 3054 for (auto &InductionEntry : Legal->getInductionVars()) { 3055 PHINode *OrigPhi = InductionEntry.first; 3056 InductionDescriptor II = InductionEntry.second; 3057 3058 // Create phi nodes to merge from the backedge-taken check block. 3059 PHINode *BCResumeVal = 3060 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3061 LoopScalarPreHeader->getTerminator()); 3062 // Copy original phi DL over to the new one. 3063 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3064 Value *&EndValue = IVEndValues[OrigPhi]; 3065 if (OrigPhi == OldInduction) { 3066 // We know what the end value is. 3067 EndValue = CountRoundDown; 3068 } else { 3069 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3070 Type *StepType = II.getStep()->getType(); 3071 Instruction::CastOps CastOp = 3072 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3073 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3074 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3075 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3076 EndValue->setName("ind.end"); 3077 } 3078 3079 // The new PHI merges the original incoming value, in case of a bypass, 3080 // or the value at the end of the vectorized loop. 3081 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3082 3083 // Fix the scalar body counter (PHI node). 3084 // The old induction's phi node in the scalar body needs the truncated 3085 // value. 3086 for (BasicBlock *BB : LoopBypassBlocks) 3087 BCResumeVal->addIncoming(II.getStartValue(), BB); 3088 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3089 } 3090 3091 // We need the OrigLoop (scalar loop part) latch terminator to help 3092 // produce correct debug info for the middle block BB instructions. 3093 // The legality check stage guarantees that the loop will have a single 3094 // latch. 3095 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3096 "Scalar loop latch terminator isn't a branch"); 3097 BranchInst *ScalarLatchBr = 3098 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3099 3100 // Add a check in the middle block to see if we have completed 3101 // all of the iterations in the first vector loop. 3102 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3103 // If tail is to be folded, we know we don't need to run the remainder. 
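  // E.g. with N = 16 and a step of VF * UF = 8, CountRoundDown == 16 == N, so
  // the middle block branches directly to the exit block; with N = 19,
  // CountRoundDown == 16 != 19 and the scalar loop runs the 3 remaining
  // iterations.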
3104   Value *CmpN = Builder.getTrue();
3105   if (!Cost->foldTailByMasking()) {
3106     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3107                            CountRoundDown, "cmp.n",
3108                            LoopMiddleBlock->getTerminator());
3109 
3110     // Here we use the same DebugLoc as the scalar loop latch branch instead
3111     // of the corresponding compare because they may have ended up with
3112     // different line numbers and we want to avoid awkward line stepping while
3113     // debugging. E.g. if the compare has a line number inside the loop.
3114     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3115   }
3116 
3117   BranchInst *BrInst =
3118       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3119   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3120   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3121 
3122   // Get ready to start creating new instructions into the vectorized body.
3123   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3124          "Inconsistent vector loop preheader");
3125   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3126 
3127   Optional<MDNode *> VectorizedLoopID =
3128       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3129                                       LLVMLoopVectorizeFollowupVectorized});
3130   if (VectorizedLoopID.hasValue()) {
3131     Lp->setLoopID(VectorizedLoopID.getValue());
3132 
3133     // Do not setAlreadyVectorized if loop attributes have been defined
3134     // explicitly.
3135     return LoopVectorPreHeader;
3136   }
3137 
3138   // Keep all loop hints from the original loop on the vector loop (we'll
3139   // replace the vectorizer-specific hints below).
3140   if (MDNode *LID = OrigLoop->getLoopID())
3141     Lp->setLoopID(LID);
3142 
3143   LoopVectorizeHints Hints(Lp, true, *ORE);
3144   Hints.setAlreadyVectorized();
3145 
3146 #ifdef EXPENSIVE_CHECKS
3147   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3148   LI->verify(*DT);
3149 #endif
3150 
3151   return LoopVectorPreHeader;
3152 }
3153 
3154 // Fix up external users of the induction variable. At this point, we are
3155 // in LCSSA form, with all external PHIs that use the IV having one input value,
3156 // coming from the remainder loop. We need those PHIs to also have a correct
3157 // value for the IV when arriving directly from the middle block.
3158 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3159                                        const InductionDescriptor &II,
3160                                        Value *CountRoundDown, Value *EndValue,
3161                                        BasicBlock *MiddleBlock) {
3162   // There are two kinds of external IV usages - those that use the value
3163   // computed in the last iteration (the PHI) and those that use the penultimate
3164   // value (the value that feeds into the phi from the loop latch).
3165   // We allow both, but they obviously have different values.
3166 
3167   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3168 
3169   DenseMap<Value *, Value *> MissingVals;
3170 
3171   // An external user of the last iteration's value should see the value that
3172   // the remainder loop uses to initialize its own IV.
3173   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3174   for (User *U : PostInc->users()) {
3175     Instruction *UI = cast<Instruction>(U);
3176     if (!OrigLoop->contains(UI)) {
3177       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3178       MissingVals[UI] = EndValue;
3179     }
3180   }
3181 
3182   // An external user of the penultimate value needs to see EndValue - Step.
3183   // The simplest way to get this is to recompute it from the constituent SCEVs,
3184   // that is Start + (Step * (CRD - 1)).
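  // For example, for an IV starting at 0 with step 1 and vector trip count
  // CRD, an external user of the latch value sees CRD (EndValue), while an
  // external user of the phi itself sees CRD - 1 = Start + Step * (CRD - 1).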
3185 for (User *U : OrigPhi->users()) { 3186 auto *UI = cast<Instruction>(U); 3187 if (!OrigLoop->contains(UI)) { 3188 const DataLayout &DL = 3189 OrigLoop->getHeader()->getModule()->getDataLayout(); 3190 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3191 3192 IRBuilder<> B(MiddleBlock->getTerminator()); 3193 Value *CountMinusOne = B.CreateSub( 3194 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3195 Value *CMO = 3196 !II.getStep()->getType()->isIntegerTy() 3197 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3198 II.getStep()->getType()) 3199 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3200 CMO->setName("cast.cmo"); 3201 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3202 Escape->setName("ind.escape"); 3203 MissingVals[UI] = Escape; 3204 } 3205 } 3206 3207 for (auto &I : MissingVals) { 3208 PHINode *PHI = cast<PHINode>(I.first); 3209 // One corner case we have to handle is two IVs "chasing" each-other, 3210 // that is %IV2 = phi [...], [ %IV1, %latch ] 3211 // In this case, if IV1 has an external use, we need to avoid adding both 3212 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3213 // don't already have an incoming value for the middle block. 3214 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3215 PHI->addIncoming(I.second, MiddleBlock); 3216 } 3217 } 3218 3219 namespace { 3220 3221 struct CSEDenseMapInfo { 3222 static bool canHandle(const Instruction *I) { 3223 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3224 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3225 } 3226 3227 static inline Instruction *getEmptyKey() { 3228 return DenseMapInfo<Instruction *>::getEmptyKey(); 3229 } 3230 3231 static inline Instruction *getTombstoneKey() { 3232 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3233 } 3234 3235 static unsigned getHashValue(const Instruction *I) { 3236 assert(canHandle(I) && "Unknown instruction!"); 3237 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3238 I->value_op_end())); 3239 } 3240 3241 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3242 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3243 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3244 return LHS == RHS; 3245 return LHS->isIdenticalTo(RHS); 3246 } 3247 }; 3248 3249 } // end anonymous namespace 3250 3251 ///Perform cse of induction variable instructions. 3252 static void cse(BasicBlock *BB) { 3253 // Perform simple cse. 3254 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3255 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3256 Instruction *In = &*I++; 3257 3258 if (!CSEDenseMapInfo::canHandle(In)) 3259 continue; 3260 3261 // Check if we can replace this instruction with any of the 3262 // visited instructions. 3263 if (Instruction *V = CSEMap.lookup(In)) { 3264 In->replaceAllUsesWith(V); 3265 In->eraseFromParent(); 3266 continue; 3267 } 3268 3269 CSEMap[In] = In; 3270 } 3271 } 3272 3273 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3274 unsigned VF, 3275 bool &NeedToScalarize) { 3276 Function *F = CI->getCalledFunction(); 3277 Type *ScalarRetTy = CI->getType(); 3278 SmallVector<Type *, 4> Tys, ScalarTys; 3279 for (auto &ArgOp : CI->arg_operands()) 3280 ScalarTys.push_back(ArgOp->getType()); 3281 3282 // Estimate cost of scalarized vector call. 
The source operands are assumed 3283 // to be vectors, so we need to extract individual elements from there, 3284 // execute VF scalar calls, and then gather the result into the vector return 3285 // value. 3286 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3287 TTI::TCK_RecipThroughput); 3288 if (VF == 1) 3289 return ScalarCallCost; 3290 3291 // Compute corresponding vector type for return value and arguments. 3292 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3293 for (Type *ScalarTy : ScalarTys) 3294 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3295 3296 // Compute costs of unpacking argument values for the scalar calls and 3297 // packing the return values to a vector. 3298 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3299 3300 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3301 3302 // If we can't emit a vector call for this function, then the currently found 3303 // cost is the cost we need to return. 3304 NeedToScalarize = true; 3305 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3306 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3307 3308 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3309 return Cost; 3310 3311 // If the corresponding vector cost is cheaper, return its cost. 3312 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3313 TTI::TCK_RecipThroughput); 3314 if (VectorCallCost < Cost) { 3315 NeedToScalarize = false; 3316 return VectorCallCost; 3317 } 3318 return Cost; 3319 } 3320 3321 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3322 unsigned VF) { 3323 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3324 assert(ID && "Expected intrinsic call!"); 3325 3326 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3327 return TTI.getIntrinsicInstrCost(CostAttrs, 3328 TargetTransformInfo::TCK_RecipThroughput); 3329 } 3330 3331 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3332 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3333 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3334 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3335 } 3336 3337 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3338 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3339 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3340 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3341 } 3342 3343 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3344 // For every instruction `I` in MinBWs, truncate the operands, create a 3345 // truncated version of `I` and reextend its result. InstCombine runs 3346 // later and will remove any ext/trunc pairs. 3347 SmallPtrSet<Value *, 4> Erased; 3348 for (const auto &KV : Cost->getMinimalBitwidths()) { 3349 // If the value wasn't vectorized, we must maintain the original scalar 3350 // type. The absence of the value from VectorLoopValueMap indicates that it 3351 // wasn't vectorized. 
3352 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3353 continue; 3354 for (unsigned Part = 0; Part < UF; ++Part) { 3355 Value *I = getOrCreateVectorValue(KV.first, Part); 3356 if (Erased.find(I) != Erased.end() || I->use_empty() || 3357 !isa<Instruction>(I)) 3358 continue; 3359 Type *OriginalTy = I->getType(); 3360 Type *ScalarTruncatedTy = 3361 IntegerType::get(OriginalTy->getContext(), KV.second); 3362 Type *TruncatedTy = VectorType::get( 3363 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements()); 3364 if (TruncatedTy == OriginalTy) 3365 continue; 3366 3367 IRBuilder<> B(cast<Instruction>(I)); 3368 auto ShrinkOperand = [&](Value *V) -> Value * { 3369 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3370 if (ZI->getSrcTy() == TruncatedTy) 3371 return ZI->getOperand(0); 3372 return B.CreateZExtOrTrunc(V, TruncatedTy); 3373 }; 3374 3375 // The actual instruction modification depends on the instruction type, 3376 // unfortunately. 3377 Value *NewI = nullptr; 3378 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3379 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3380 ShrinkOperand(BO->getOperand(1))); 3381 3382 // Any wrapping introduced by shrinking this operation shouldn't be 3383 // considered undefined behavior. So, we can't unconditionally copy 3384 // arithmetic wrapping flags to NewI. 3385 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3386 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3387 NewI = 3388 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3389 ShrinkOperand(CI->getOperand(1))); 3390 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3391 NewI = B.CreateSelect(SI->getCondition(), 3392 ShrinkOperand(SI->getTrueValue()), 3393 ShrinkOperand(SI->getFalseValue())); 3394 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3395 switch (CI->getOpcode()) { 3396 default: 3397 llvm_unreachable("Unhandled cast!"); 3398 case Instruction::Trunc: 3399 NewI = ShrinkOperand(CI->getOperand(0)); 3400 break; 3401 case Instruction::SExt: 3402 NewI = B.CreateSExtOrTrunc( 3403 CI->getOperand(0), 3404 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3405 break; 3406 case Instruction::ZExt: 3407 NewI = B.CreateZExtOrTrunc( 3408 CI->getOperand(0), 3409 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3410 break; 3411 } 3412 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3413 auto Elements0 = 3414 cast<VectorType>(SI->getOperand(0)->getType())->getNumElements(); 3415 auto *O0 = B.CreateZExtOrTrunc( 3416 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3417 auto Elements1 = 3418 cast<VectorType>(SI->getOperand(1)->getType())->getNumElements(); 3419 auto *O1 = B.CreateZExtOrTrunc( 3420 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3421 3422 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3423 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3424 // Don't do anything with the operands, just extend the result. 
3425 continue; 3426 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3427 auto Elements = 3428 cast<VectorType>(IE->getOperand(0)->getType())->getNumElements(); 3429 auto *O0 = B.CreateZExtOrTrunc( 3430 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3431 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3432 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3433 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3434 auto Elements = 3435 cast<VectorType>(EE->getOperand(0)->getType())->getNumElements(); 3436 auto *O0 = B.CreateZExtOrTrunc( 3437 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3438 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3439 } else { 3440 // If we don't know what to do, be conservative and don't do anything. 3441 continue; 3442 } 3443 3444 // Lastly, extend the result. 3445 NewI->takeName(cast<Instruction>(I)); 3446 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3447 I->replaceAllUsesWith(Res); 3448 cast<Instruction>(I)->eraseFromParent(); 3449 Erased.insert(I); 3450 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3451 } 3452 } 3453 3454 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3455 for (const auto &KV : Cost->getMinimalBitwidths()) { 3456 // If the value wasn't vectorized, we must maintain the original scalar 3457 // type. The absence of the value from VectorLoopValueMap indicates that it 3458 // wasn't vectorized. 3459 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3460 continue; 3461 for (unsigned Part = 0; Part < UF; ++Part) { 3462 Value *I = getOrCreateVectorValue(KV.first, Part); 3463 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3464 if (Inst && Inst->use_empty()) { 3465 Value *NewI = Inst->getOperand(0); 3466 Inst->eraseFromParent(); 3467 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3468 } 3469 } 3470 } 3471 } 3472 3473 void InnerLoopVectorizer::fixVectorizedLoop() { 3474 // Insert truncates and extends for any truncated instructions as hints to 3475 // InstCombine. 3476 if (VF > 1) 3477 truncateToMinimalBitwidths(); 3478 3479 // Fix widened non-induction PHIs by setting up the PHI operands. 3480 if (OrigPHIsToFix.size()) { 3481 assert(EnableVPlanNativePath && 3482 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3483 fixNonInductionPHIs(); 3484 } 3485 3486 // At this point every instruction in the original loop is widened to a 3487 // vector form. Now we need to fix the recurrences in the loop. These PHI 3488 // nodes are currently empty because we did not want to introduce cycles. 3489 // This is the second stage of vectorizing recurrences. 3490 fixCrossIterationPHIs(); 3491 3492 // Forget the original basic block. 3493 PSE.getSE()->forgetLoop(OrigLoop); 3494 3495 // Fix-up external users of the induction variables. 3496 for (auto &Entry : Legal->getInductionVars()) 3497 fixupIVUsers(Entry.first, Entry.second, 3498 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3499 IVEndValues[Entry.first], LoopMiddleBlock); 3500 3501 fixLCSSAPHIs(); 3502 for (Instruction *PI : PredicatedInstructions) 3503 sinkScalarOperands(&*PI); 3504 3505 // Remove redundant induction instructions. 3506 cse(LoopVectorBody); 3507 3508 // Set/update profile weights for the vector and remainder loops as original 3509 // loop iterations are now distributed among them. Note that original loop 3510 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3511   //
3512   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3513   // end up with a slightly less accurate result, but that should be OK since
3514   // profile info is not inherently precise anyway. Note also that a possible
3515   // bypass of the vector code caused by legality checks is ignored,
3516   // optimistically assigning all the weight to the vector loop.
3517   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3518                                LI->getLoopFor(LoopVectorBody),
3519                                LI->getLoopFor(LoopScalarBody), VF * UF);
3520 }
3521 
3522 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3523   // In order to support recurrences we need to be able to vectorize Phi nodes.
3524   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3525   // stage #2: We now need to fix the recurrences by adding incoming edges to
3526   // the currently empty PHI nodes. At this point every instruction in the
3527   // original loop is widened to a vector form so we can use them to construct
3528   // the incoming edges.
3529   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3530     // Handle first-order recurrences and reductions that need to be fixed.
3531     if (Legal->isFirstOrderRecurrence(&Phi))
3532       fixFirstOrderRecurrence(&Phi);
3533     else if (Legal->isReductionVariable(&Phi))
3534       fixReduction(&Phi);
3535   }
3536 }
3537 
3538 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3539   // This is the second phase of vectorizing first-order recurrences. An
3540   // overview of the transformation is described below. Suppose we have the
3541   // following loop.
3542   //
3543   //   for (int i = 0; i < n; ++i)
3544   //     b[i] = a[i] - a[i - 1];
3545   //
3546   // There is a first-order recurrence on "a". For this loop, the shorthand
3547   // scalar IR looks like:
3548   //
3549   //   scalar.ph:
3550   //     s_init = a[-1]
3551   //     br scalar.body
3552   //
3553   //   scalar.body:
3554   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3555   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3556   //     s2 = a[i]
3557   //     b[i] = s2 - s1
3558   //     br cond, scalar.body, ...
3559   //
3560   // In this example, s1 is a recurrence because its value depends on the
3561   // previous iteration. In the first phase of vectorization, we created a
3562   // temporary value for s1. We now complete the vectorization and produce the
3563   // shorthand vector IR shown below (for VF = 4, UF = 1).
3564   //
3565   //   vector.ph:
3566   //     v_init = vector(..., ..., ..., a[-1])
3567   //     br vector.body
3568   //
3569   //   vector.body
3570   //     i = phi [0, vector.ph], [i+4, vector.body]
3571   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3572   //     v2 = a[i, i+1, i+2, i+3];
3573   //     v3 = vector(v1(3), v2(0, 1, 2))
3574   //     b[i, i+1, i+2, i+3] = v2 - v3
3575   //     br cond, vector.body, middle.block
3576   //
3577   //   middle.block:
3578   //     x = v2(3)
3579   //     br scalar.ph
3580   //
3581   //   scalar.ph:
3582   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3583   //     br scalar.body
3584   //
3585   // After the vector loop finishes executing, we extract the next value of
3586   // the recurrence (x) to use as the initial value in the scalar loop.
3587 
3588   // Get the original loop preheader and single loop latch.
3589   auto *Preheader = OrigLoop->getLoopPreheader();
3590   auto *Latch = OrigLoop->getLoopLatch();
3591 
3592   // Get the initial and previous values of the scalar recurrence.
3593   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3594   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3595 
3596   // Create a vector from the initial value.
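  // E.g. for VF = 4 this builds <undef, undef, undef, s_init>, placing the
  // scalar initial value in the last lane (VF - 1).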
3597 auto *VectorInit = ScalarInit; 3598 if (VF > 1) { 3599 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3600 VectorInit = Builder.CreateInsertElement( 3601 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3602 Builder.getInt32(VF - 1), "vector.recur.init"); 3603 } 3604 3605 // We constructed a temporary phi node in the first phase of vectorization. 3606 // This phi node will eventually be deleted. 3607 Builder.SetInsertPoint( 3608 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3609 3610 // Create a phi node for the new recurrence. The current value will either be 3611 // the initial value inserted into a vector or loop-varying vector value. 3612 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3613 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3614 3615 // Get the vectorized previous value of the last part UF - 1. It appears last 3616 // among all unrolled iterations, due to the order of their construction. 3617 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3618 3619 // Find and set the insertion point after the previous value if it is an 3620 // instruction. 3621 BasicBlock::iterator InsertPt; 3622 // Note that the previous value may have been constant-folded so it is not 3623 // guaranteed to be an instruction in the vector loop. 3624 // FIXME: Loop invariant values do not form recurrences. We should deal with 3625 // them earlier. 3626 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3627 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3628 else { 3629 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3630 if (isa<PHINode>(PreviousLastPart)) 3631 // If the previous value is a phi node, we should insert after all the phi 3632 // nodes in the block containing the PHI to avoid breaking basic block 3633 // verification. Note that the basic block may be different to 3634 // LoopVectorBody, in case we predicate the loop. 3635 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3636 else 3637 InsertPt = ++PreviousInst->getIterator(); 3638 } 3639 Builder.SetInsertPoint(&*InsertPt); 3640 3641 // We will construct a vector for the recurrence by combining the values for 3642 // the current and previous iterations. This is the required shuffle mask. 3643 SmallVector<int, 8> ShuffleMask(VF); 3644 ShuffleMask[0] = VF - 1; 3645 for (unsigned I = 1; I < VF; ++I) 3646 ShuffleMask[I] = I + VF - 1; 3647 3648 // The vector from which to take the initial value for the current iteration 3649 // (actual or unrolled). Initially, this is the vector phi node. 3650 Value *Incoming = VecPhi; 3651 3652 // Shuffle the current and previous vector and update the vector parts. 3653 for (unsigned Part = 0; Part < UF; ++Part) { 3654 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3655 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3656 auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3657 ShuffleMask) 3658 : Incoming; 3659 PhiPart->replaceAllUsesWith(Shuffle); 3660 cast<Instruction>(PhiPart)->eraseFromParent(); 3661 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3662 Incoming = PreviousPart; 3663 } 3664 3665 // Fix the latch value of the new recurrence in the vector loop. 3666 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3667 3668 // Extract the last vector element in the middle block. 
This will be the
3669   // initial value for the recurrence when jumping to the scalar loop.
3670   auto *ExtractForScalar = Incoming;
3671   if (VF > 1) {
3672     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3673     ExtractForScalar = Builder.CreateExtractElement(
3674         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3675   }
3676   // Extract the second-to-last element in the middle block if the
3677   // Phi is used outside the loop. We need to extract the phi itself
3678   // and not the last element (the phi update in the current iteration). This
3679   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3680   // when the scalar loop is not run at all.
3681   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3682   if (VF > 1)
3683     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3684         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3685   // When the loop is unrolled without vectorizing, initialize
3686   // ExtractForPhiUsedOutsideLoop with the unrolled value of `Incoming` just
3687   // prior to the last one. This is analogous to the vectorized case above:
3688   // extracting the second-to-last element when VF > 1.
3689   else if (UF > 1)
3690     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3691 
3692   // Fix the initial value of the original recurrence in the scalar loop.
3693   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3694   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3695   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3696     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3697     Start->addIncoming(Incoming, BB);
3698   }
3699 
3700   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3701   Phi->setName("scalar.recur");
3702 
3703   // Finally, fix users of the recurrence outside the loop. The users will need
3704   // either the last value of the scalar recurrence or the last value of the
3705   // vector recurrence we extracted in the middle block. Since the loop is in
3706   // LCSSA form, we just need to find all the phi nodes for the original scalar
3707   // recurrence in the exit block, and then add an edge for the middle block.
3708   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3709     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3710       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3711     }
3712   }
3713 }
3714 
3715 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3716   Constant *Zero = Builder.getInt32(0);
3717 
3718   // Get its reduction variable descriptor.
3719   assert(Legal->isReductionVariable(Phi) &&
3720          "Unable to find the reduction variable");
3721   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3722 
3723   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3724   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3725   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3726   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3727       RdxDesc.getMinMaxRecurrenceKind();
3728   setDebugLocFromInst(Builder, ReductionStartValue);
3729 
3730   // We need to generate a reduction vector from the incoming scalar.
3731   // To do so, we need to generate the 'identity' vector and override
3732   // one of the elements with the incoming scalar reduction. We need
3733   // to do it in the vector-loop preheader.
3734   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3735 
3736   // This is the vector-clone of the value that leaves the loop.
3737   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3738 
3739   // Find the reduction identity variable. Zero for addition, or and xor;
3740   // one for multiplication; -1 (all ones) for and.
3741   Value *Identity;
3742   Value *VectorStart;
3743   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3744       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3745     // MinMax reductions have the start value as their identity.
3746     if (VF == 1) {
3747       VectorStart = Identity = ReductionStartValue;
3748     } else {
3749       VectorStart = Identity =
3750           Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3751     }
3752   } else {
3753     // Handle other reduction kinds:
3754     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3755         RK, VecTy->getScalarType());
3756     if (VF == 1) {
3757       Identity = Iden;
3758       // This vector is the Identity vector where the first element is the
3759       // incoming scalar reduction.
3760       VectorStart = ReductionStartValue;
3761     } else {
3762       Identity = ConstantVector::getSplat({VF, false}, Iden);
3763 
3764       // This vector is the Identity vector where the first element is the
3765       // incoming scalar reduction.
3766       VectorStart =
3767           Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3768     }
3769   }
3770 
3771   // Wrap flags are in general invalid after vectorization, clear them.
3772   clearReductionWrapFlags(RdxDesc);
3773 
3774   // Fix the vector-loop phi.
3775 
3776   // Reductions do not have to start at zero. They can start with
3777   // any loop invariant values.
3778   BasicBlock *Latch = OrigLoop->getLoopLatch();
3779   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3780 
3781   for (unsigned Part = 0; Part < UF; ++Part) {
3782     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3783     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3784     // Make sure to add the reduction start value only to the
3785     // first unroll part.
3786     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3787     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3788     cast<PHINode>(VecRdxPhi)
3789         ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3790   }
3791 
3792   // Before each round, move the insertion point right between
3793   // the PHIs and the values we are going to write.
3794   // This allows us to write both PHINodes and the extractelement
3795   // instructions.
3796   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3797 
3798   setDebugLocFromInst(Builder, LoopExitInst);
3799 
3800   // If the tail is folded by masking, the vector value to leave the loop should
3801   // be a Select choosing between the vectorized LoopExitInst and the vectorized
3802   // Phi, instead of the former.
3803   if (Cost->foldTailByMasking()) {
3804     for (unsigned Part = 0; Part < UF; ++Part) {
3805       Value *VecLoopExitInst =
3806           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3807       Value *Sel = nullptr;
3808       for (User *U : VecLoopExitInst->users()) {
3809         if (isa<SelectInst>(U)) {
3810           assert(!Sel && "Reduction exit feeding two selects");
3811           Sel = U;
3812         } else
3813           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3814       }
3815       assert(Sel && "Reduction exit feeds no select");
3816       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3817     }
3818   }
3819 
3820   // If the vector reduction can be performed in a smaller type, we truncate
3821   // then extend the loop exit value to enable InstCombine to evaluate the
3822   // entire expression in the smaller type.
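  // For example, an i32 add reduction whose inputs are known to fit in i8 is
  // truncated to <VF x i8> here, and the reduced scalar is extended back to
  // i32 after the final horizontal reduction below.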
3823 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3824 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3825 Builder.SetInsertPoint( 3826 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3827 VectorParts RdxParts(UF); 3828 for (unsigned Part = 0; Part < UF; ++Part) { 3829 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3830 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3831 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3832 : Builder.CreateZExt(Trunc, VecTy); 3833 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3834 UI != RdxParts[Part]->user_end();) 3835 if (*UI != Trunc) { 3836 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3837 RdxParts[Part] = Extnd; 3838 } else { 3839 ++UI; 3840 } 3841 } 3842 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3843 for (unsigned Part = 0; Part < UF; ++Part) { 3844 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3845 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3846 } 3847 } 3848 3849 // Reduce all of the unrolled parts into a single vector. 3850 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3851 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3852 3853 // The middle block terminator has already been assigned a DebugLoc here (the 3854 // OrigLoop's single latch terminator). We want the whole middle block to 3855 // appear to execute on this line because: (a) it is all compiler generated, 3856 // (b) these instructions are always executed after evaluating the latch 3857 // conditional branch, and (c) other passes may add new predecessors which 3858 // terminate on this line. This is the easiest way to ensure we don't 3859 // accidentally cause an extra step back into the loop while debugging. 3860 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3861 for (unsigned Part = 1; Part < UF; ++Part) { 3862 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3863 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3864 // Floating point operations had to be 'fast' to enable the reduction. 3865 ReducedPartRdx = addFastMathFlag( 3866 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3867 ReducedPartRdx, "bin.rdx"), 3868 RdxDesc.getFastMathFlags()); 3869 else 3870 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3871 RdxPart); 3872 } 3873 3874 if (VF > 1) { 3875 bool NoNaN = Legal->hasFunNoNaNAttr(); 3876 ReducedPartRdx = 3877 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3878 // If the reduction can be performed in a smaller type, we need to extend 3879 // the reduction to the wider type before we branch to the original loop. 3880 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3881 ReducedPartRdx = 3882 RdxDesc.isSigned() 3883 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3884 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3885 } 3886 3887 // Create a phi node that merges control-flow from the backedge-taken check 3888 // block and the middle block. 
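  // On every bypass edge the phi takes the original start value (the vector
  // code was skipped entirely); from the middle block it takes the reduced
  // vector result computed above.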
3889 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3890 LoopScalarPreHeader->getTerminator()); 3891 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3892 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3893 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3894 3895 // Now, we need to fix the users of the reduction variable 3896 // inside and outside of the scalar remainder loop. 3897 // We know that the loop is in LCSSA form. We need to update the 3898 // PHI nodes in the exit blocks. 3899 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3900 // All PHINodes need to have a single entry edge, or two if 3901 // we already fixed them. 3902 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3903 3904 // We found a reduction value exit-PHI. Update it with the 3905 // incoming bypass edge. 3906 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3907 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3908 } // end of the LCSSA phi scan. 3909 3910 // Fix the scalar loop reduction variable with the incoming reduction sum 3911 // from the vector body and from the backedge value. 3912 int IncomingEdgeBlockIdx = 3913 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3914 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3915 // Pick the other block. 3916 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3917 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3918 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3919 } 3920 3921 void InnerLoopVectorizer::clearReductionWrapFlags( 3922 RecurrenceDescriptor &RdxDesc) { 3923 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3924 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3925 RK != RecurrenceDescriptor::RK_IntegerMult) 3926 return; 3927 3928 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3929 assert(LoopExitInstr && "null loop exit instruction"); 3930 SmallVector<Instruction *, 8> Worklist; 3931 SmallPtrSet<Instruction *, 8> Visited; 3932 Worklist.push_back(LoopExitInstr); 3933 Visited.insert(LoopExitInstr); 3934 3935 while (!Worklist.empty()) { 3936 Instruction *Cur = Worklist.pop_back_val(); 3937 if (isa<OverflowingBinaryOperator>(Cur)) 3938 for (unsigned Part = 0; Part < UF; ++Part) { 3939 Value *V = getOrCreateVectorValue(Cur, Part); 3940 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3941 } 3942 3943 for (User *U : Cur->users()) { 3944 Instruction *UI = cast<Instruction>(U); 3945 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3946 Visited.insert(UI).second) 3947 Worklist.push_back(UI); 3948 } 3949 } 3950 } 3951 3952 void InnerLoopVectorizer::fixLCSSAPHIs() { 3953 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3954 if (LCSSAPhi.getNumIncomingValues() == 1) { 3955 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3956 // Non-instruction incoming values will have only one value. 3957 unsigned LastLane = 0; 3958 if (isa<Instruction>(IncomingValue)) 3959 LastLane = Cost->isUniformAfterVectorization( 3960 cast<Instruction>(IncomingValue), VF) 3961 ? 0 3962 : VF - 1; 3963 // Can be a loop invariant incoming value or the last scalar value to be 3964 // extracted from the vectorized loop. 
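  // E.g. with UF = 2 and VF = 4, a value that is not uniform after
  // vectorization takes its final scalar copy from part UF - 1 = 1,
  // lane VF - 1 = 3.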
3965       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3966       Value *lastIncomingValue =
3967           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3968       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3969     }
3970   }
3971 }
3972 
3973 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3974   // The basic block and loop containing the predicated instruction.
3975   auto *PredBB = PredInst->getParent();
3976   auto *VectorLoop = LI->getLoopFor(PredBB);
3977 
3978   // Initialize a worklist with the operands of the predicated instruction.
3979   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3980 
3981   // Holds instructions that we need to analyze again. An instruction may be
3982   // reanalyzed if we don't yet know if we can sink it or not.
3983   SmallVector<Instruction *, 8> InstsToReanalyze;
3984 
3985   // Returns true if a given use occurs in the predicated block. Phi nodes use
3986   // their operands in their corresponding predecessor blocks.
3987   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3988     auto *I = cast<Instruction>(U.getUser());
3989     BasicBlock *BB = I->getParent();
3990     if (auto *Phi = dyn_cast<PHINode>(I))
3991       BB = Phi->getIncomingBlock(
3992           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3993     return BB == PredBB;
3994   };
3995 
3996   // Iteratively sink the scalarized operands of the predicated instruction
3997   // into the block we created for it. When an instruction is sunk, its
3998   // operands are then added to the worklist. The algorithm ends when one pass
3999   // through the worklist doesn't sink a single instruction.
4000   bool Changed;
4001   do {
4002     // Add the instructions that need to be reanalyzed to the worklist, and
4003     // reset the changed indicator.
4004     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4005     InstsToReanalyze.clear();
4006     Changed = false;
4007 
4008     while (!Worklist.empty()) {
4009       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4010 
4011       // We can't sink an instruction if it is a phi node, is already in the
4012       // predicated block, is not in the loop, or may have side effects.
4013       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4014           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4015         continue;
4016 
4017       // It's legal to sink the instruction if all its uses occur in the
4018       // predicated block. Otherwise, there's nothing to do yet, and we may
4019       // need to reanalyze the instruction.
4020       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4021         InstsToReanalyze.push_back(I);
4022         continue;
4023       }
4024 
4025       // Move the instruction to the beginning of the predicated block, and add
4026       // its operands to the worklist.
4027       I->moveBefore(&*PredBB->getFirstInsertionPt());
4028       Worklist.insert(I->op_begin(), I->op_end());
4029 
4030       // The sinking may have enabled other instructions to be sunk, so we will
4031       // need to iterate.
4032 Changed = true; 4033 } 4034 } while (Changed); 4035 } 4036 4037 void InnerLoopVectorizer::fixNonInductionPHIs() { 4038 for (PHINode *OrigPhi : OrigPHIsToFix) { 4039 PHINode *NewPhi = 4040 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4041 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4042 4043 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4044 predecessors(OrigPhi->getParent())); 4045 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4046 predecessors(NewPhi->getParent())); 4047 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4048 "Scalar and Vector BB should have the same number of predecessors"); 4049 4050 // The insertion point in Builder may be invalidated by the time we get 4051 // here. Force the Builder insertion point to something valid so that we do 4052 // not run into issues during insertion point restore in 4053 // getOrCreateVectorValue calls below. 4054 Builder.SetInsertPoint(NewPhi); 4055 4056 // The predecessor order is preserved and we can rely on mapping between 4057 // scalar and vector block predecessors. 4058 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4059 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4060 4061 // When looking up the new scalar/vector values to fix up, use incoming 4062 // values from original phi. 4063 Value *ScIncV = 4064 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4065 4066 // Scalar incoming value may need a broadcast 4067 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4068 NewPhi->addIncoming(NewIncV, NewPredBB); 4069 } 4070 } 4071 } 4072 4073 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4074 unsigned VF, bool IsPtrLoopInvariant, 4075 SmallBitVector &IsIndexLoopInvariant) { 4076 // Construct a vector GEP by widening the operands of the scalar GEP as 4077 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4078 // results in a vector of pointers when at least one operand of the GEP 4079 // is vector-typed. Thus, to keep the representation compact, we only use 4080 // vector-typed operands for loop-varying values. 4081 4082 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4083 // If we are vectorizing, but the GEP has only loop-invariant operands, 4084 // the GEP we build (by only using vector-typed operands for 4085 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4086 // produce a vector of pointers, we need to either arbitrarily pick an 4087 // operand to broadcast, or broadcast a clone of the original GEP. 4088 // Here, we broadcast a clone of the original. 4089 // 4090 // TODO: If at some point we decide to scalarize instructions having 4091 // loop-invariant operands, this special case will no longer be 4092 // required. We would add the scalarization decision to 4093 // collectLoopScalars() and teach getVectorValue() to broadcast 4094 // the lane-zero scalar value. 4095 auto *Clone = Builder.Insert(GEP->clone()); 4096 for (unsigned Part = 0; Part < UF; ++Part) { 4097 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4098 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4099 addMetadata(EntryPart, GEP); 4100 } 4101 } else { 4102 // If the GEP has at least one loop-varying operand, we are sure to 4103 // produce a vector of pointers. But if we are only unrolling, we want 4104 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4105 // produce with the code below will be scalar (if VF == 1) or vector 4106 // (otherwise). 
Note that for the unroll-only case, we still maintain 4107 // values in the vector mapping with initVector, as we do for other 4108 // instructions. 4109 for (unsigned Part = 0; Part < UF; ++Part) { 4110 // The pointer operand of the new GEP. If it's loop-invariant, we 4111 // won't broadcast it. 4112 auto *Ptr = IsPtrLoopInvariant 4113 ? GEP->getPointerOperand() 4114 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4115 4116 // Collect all the indices for the new GEP. If any index is 4117 // loop-invariant, we won't broadcast it. 4118 SmallVector<Value *, 4> Indices; 4119 for (auto Index : enumerate(GEP->indices())) { 4120 Value *User = Index.value().get(); 4121 if (IsIndexLoopInvariant[Index.index()]) 4122 Indices.push_back(User); 4123 else 4124 Indices.push_back(getOrCreateVectorValue(User, Part)); 4125 } 4126 4127 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4128 // but it should be a vector, otherwise. 4129 auto *NewGEP = 4130 GEP->isInBounds() 4131 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4132 Indices) 4133 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4134 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4135 "NewGEP is not a pointer vector"); 4136 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4137 addMetadata(NewGEP, GEP); 4138 } 4139 } 4140 } 4141 4142 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4143 unsigned VF) { 4144 PHINode *P = cast<PHINode>(PN); 4145 if (EnableVPlanNativePath) { 4146 // Currently we enter here in the VPlan-native path for non-induction 4147 // PHIs where all control flow is uniform. We simply widen these PHIs. 4148 // Create a vector phi with no operands - the vector phi operands will be 4149 // set at the end of vector code generation. 4150 Type *VecTy = 4151 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4152 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4153 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4154 OrigPHIsToFix.push_back(P); 4155 4156 return; 4157 } 4158 4159 assert(PN->getParent() == OrigLoop->getHeader() && 4160 "Non-header phis should have been handled elsewhere"); 4161 4162 // In order to support recurrences we need to be able to vectorize Phi nodes. 4163 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4164 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4165 // this value when we vectorize all of the instructions that use the PHI. 4166 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4167 for (unsigned Part = 0; Part < UF; ++Part) { 4168 // This is phase one of vectorizing PHIs. 4169 Type *VecTy = 4170 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4171 Value *EntryPart = PHINode::Create( 4172 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4173 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4174 } 4175 return; 4176 } 4177 4178 setDebugLocFromInst(Builder, P); 4179 4180 // This PHINode must be an induction variable. 4181 // Make sure that we know about it. 4182 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4183 4184 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4185 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4186 4187 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4188 // which can be found from the original scalar operations. 
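  // Illustrative sketch only (assumed example, not taken from a test): for a
  // pointer that advances by one element per iteration, e.g.
  //   for (i = 0; i < n; ++i) sum += *p++;
  // the IK_PtrInduction case below emits scalar "next.gep" values rather than
  // a vector of pointers. With VF = 4, UF = 1 and a unit step, lane L of part
  // 0 conceptually receives the address p + (iv + L) for L = 0..3, where iv
  // is the normalized induction ('PtrInd').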
4189 switch (II.getKind()) { 4190 case InductionDescriptor::IK_NoInduction: 4191 llvm_unreachable("Unknown induction"); 4192 case InductionDescriptor::IK_IntInduction: 4193 case InductionDescriptor::IK_FpInduction: 4194 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4195 case InductionDescriptor::IK_PtrInduction: { 4196 // Handle the pointer induction variable case. 4197 assert(P->getType()->isPointerTy() && "Unexpected type."); 4198 // This is the normalized GEP that starts counting at zero. 4199 Value *PtrInd = Induction; 4200 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4201 // Determine the number of scalars we need to generate for each unroll 4202 // iteration. If the instruction is uniform, we only need to generate the 4203 // first lane. Otherwise, we generate all VF values. 4204 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4205 // These are the scalar results. Notice that we don't generate vector GEPs 4206 // because scalar GEPs result in better code. 4207 for (unsigned Part = 0; Part < UF; ++Part) { 4208 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4209 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4210 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4211 Value *SclrGep = 4212 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4213 SclrGep->setName("next.gep"); 4214 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4215 } 4216 } 4217 return; 4218 } 4219 } 4220 } 4221 4222 /// A helper function for checking whether an integer division-related 4223 /// instruction may divide by zero (in which case it must be predicated if 4224 /// executed conditionally in the scalar code). 4225 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4226 /// Non-zero divisors that are non compile-time constants will not be 4227 /// converted into multiplication, so we will still end up scalarizing 4228 /// the division, but can do so w/o predication. 4229 static bool mayDivideByZero(Instruction &I) { 4230 assert((I.getOpcode() == Instruction::UDiv || 4231 I.getOpcode() == Instruction::SDiv || 4232 I.getOpcode() == Instruction::URem || 4233 I.getOpcode() == Instruction::SRem) && 4234 "Unexpected instruction"); 4235 Value *Divisor = I.getOperand(1); 4236 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4237 return !CInt || CInt->isZero(); 4238 } 4239 4240 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4241 VPTransformState &State) { 4242 switch (I.getOpcode()) { 4243 case Instruction::Call: 4244 case Instruction::Br: 4245 case Instruction::PHI: 4246 case Instruction::GetElementPtr: 4247 case Instruction::Select: 4248 llvm_unreachable("This instruction is handled by a different recipe."); 4249 case Instruction::UDiv: 4250 case Instruction::SDiv: 4251 case Instruction::SRem: 4252 case Instruction::URem: 4253 case Instruction::Add: 4254 case Instruction::FAdd: 4255 case Instruction::Sub: 4256 case Instruction::FSub: 4257 case Instruction::FNeg: 4258 case Instruction::Mul: 4259 case Instruction::FMul: 4260 case Instruction::FDiv: 4261 case Instruction::FRem: 4262 case Instruction::Shl: 4263 case Instruction::LShr: 4264 case Instruction::AShr: 4265 case Instruction::And: 4266 case Instruction::Or: 4267 case Instruction::Xor: { 4268 // Just widen unops and binops. 
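    // Illustrative example of the widening performed here: with VF = 4, a
    // scalar
    //   %r = add nsw i32 %a, %b
    // becomes one wide operation per unroll part, acting on the vector
    // operands taken from State, e.g.
    //   %r.vec = add nsw <4 x i32> %a.vec, %b.vec
    // The IR flags (nsw above) are copied from the original instruction by
    // copyIRFlags() below.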
4269 setDebugLocFromInst(Builder, &I); 4270 4271 for (unsigned Part = 0; Part < UF; ++Part) { 4272 SmallVector<Value *, 2> Ops; 4273 for (VPValue *VPOp : User.operands()) 4274 Ops.push_back(State.get(VPOp, Part)); 4275 4276 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4277 4278 if (auto *VecOp = dyn_cast<Instruction>(V)) 4279 VecOp->copyIRFlags(&I); 4280 4281 // Use this vector value for all users of the original instruction. 4282 VectorLoopValueMap.setVectorValue(&I, Part, V); 4283 addMetadata(V, &I); 4284 } 4285 4286 break; 4287 } 4288 case Instruction::ICmp: 4289 case Instruction::FCmp: { 4290 // Widen compares. Generate vector compares. 4291 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4292 auto *Cmp = cast<CmpInst>(&I); 4293 setDebugLocFromInst(Builder, Cmp); 4294 for (unsigned Part = 0; Part < UF; ++Part) { 4295 Value *A = State.get(User.getOperand(0), Part); 4296 Value *B = State.get(User.getOperand(1), Part); 4297 Value *C = nullptr; 4298 if (FCmp) { 4299 // Propagate fast math flags. 4300 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4301 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4302 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4303 } else { 4304 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4305 } 4306 VectorLoopValueMap.setVectorValue(&I, Part, C); 4307 addMetadata(C, &I); 4308 } 4309 4310 break; 4311 } 4312 4313 case Instruction::ZExt: 4314 case Instruction::SExt: 4315 case Instruction::FPToUI: 4316 case Instruction::FPToSI: 4317 case Instruction::FPExt: 4318 case Instruction::PtrToInt: 4319 case Instruction::IntToPtr: 4320 case Instruction::SIToFP: 4321 case Instruction::UIToFP: 4322 case Instruction::Trunc: 4323 case Instruction::FPTrunc: 4324 case Instruction::BitCast: { 4325 auto *CI = cast<CastInst>(&I); 4326 setDebugLocFromInst(Builder, CI); 4327 4328 /// Vectorize casts. 4329 Type *DestTy = 4330 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4331 4332 for (unsigned Part = 0; Part < UF; ++Part) { 4333 Value *A = State.get(User.getOperand(0), Part); 4334 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4335 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4336 addMetadata(Cast, &I); 4337 } 4338 break; 4339 } 4340 default: 4341 // This instruction is not vectorized by simple widening. 4342 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4343 llvm_unreachable("Unhandled instruction!"); 4344 } // end of switch. 4345 } 4346 4347 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4348 VPTransformState &State) { 4349 assert(!isa<DbgInfoIntrinsic>(I) && 4350 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4351 setDebugLocFromInst(Builder, &I); 4352 4353 Module *M = I.getParent()->getParent()->getParent(); 4354 auto *CI = cast<CallInst>(&I); 4355 4356 SmallVector<Type *, 4> Tys; 4357 for (Value *ArgOperand : CI->arg_operands()) 4358 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4359 4360 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4361 4362 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4363 // version of the instruction. 4364 // Is it beneficial to perform intrinsic call compared to lib call? 
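  // Illustrative sketch of the decision below, with assumed (not
  // target-accurate) costs: for a call to sinf at VF = 4, the cost of the
  // vector intrinsic llvm.sin.v4f32 is compared against the cost of a
  // vectorized library call provided through the VFDatabase mappings, and
  // the cheaper alternative is emitted. Full scalarization is expected to
  // have been chosen by a different recipe and is only asserted on here.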
4365 bool NeedToScalarize = false; 4366 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4367 bool UseVectorIntrinsic = 4368 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4369 assert((UseVectorIntrinsic || !NeedToScalarize) && 4370 "Instruction should be scalarized elsewhere."); 4371 4372 for (unsigned Part = 0; Part < UF; ++Part) { 4373 SmallVector<Value *, 4> Args; 4374 for (auto &I : enumerate(ArgOperands.operands())) { 4375 // Some intrinsics have a scalar argument - don't replace it with a 4376 // vector. 4377 Value *Arg; 4378 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4379 Arg = State.get(I.value(), Part); 4380 else 4381 Arg = State.get(I.value(), {0, 0}); 4382 Args.push_back(Arg); 4383 } 4384 4385 Function *VectorF; 4386 if (UseVectorIntrinsic) { 4387 // Use vector version of the intrinsic. 4388 Type *TysForDecl[] = {CI->getType()}; 4389 if (VF > 1) 4390 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4391 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4392 assert(VectorF && "Can't retrieve vector intrinsic."); 4393 } else { 4394 // Use vector version of the function call. 4395 const VFShape Shape = 4396 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4397 #ifndef NDEBUG 4398 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4399 "Can't create vector function."); 4400 #endif 4401 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4402 } 4403 SmallVector<OperandBundleDef, 1> OpBundles; 4404 CI->getOperandBundlesAsDefs(OpBundles); 4405 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4406 4407 if (isa<FPMathOperator>(V)) 4408 V->copyFastMathFlags(CI); 4409 4410 VectorLoopValueMap.setVectorValue(&I, Part, V); 4411 addMetadata(V, &I); 4412 } 4413 } 4414 4415 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4416 VPUser &Operands, 4417 bool InvariantCond, 4418 VPTransformState &State) { 4419 setDebugLocFromInst(Builder, &I); 4420 4421 // The condition can be loop invariant but still defined inside the 4422 // loop. This means that we can't just use the original 'cond' value. 4423 // We have to take the 'vectorized' value and pick the first lane. 4424 // Instcombine will make this a no-op. 4425 auto *InvarCond = 4426 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4427 4428 for (unsigned Part = 0; Part < UF; ++Part) { 4429 Value *Cond = 4430 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4431 Value *Op0 = State.get(Operands.getOperand(1), Part); 4432 Value *Op1 = State.get(Operands.getOperand(2), Part); 4433 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4434 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4435 addMetadata(Sel, &I); 4436 } 4437 } 4438 4439 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4440 // We should not collect Scalars more than once per VF. Right now, this 4441 // function is called from collectUniformsAndScalars(), which already does 4442 // this check. Collecting Scalars for VF=1 does not make any sense. 4443 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4444 "This function should not be visited twice for the same VF"); 4445 4446 SmallSetVector<Instruction *, 8> Worklist; 4447 4448 // These sets are used to seed the analysis with pointers used by memory 4449 // accesses that will remain scalar. 
4450 SmallSetVector<Instruction *, 8> ScalarPtrs; 4451 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4452 4453 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4454 // The pointer operands of loads and stores will be scalar as long as the 4455 // memory access is not a gather or scatter operation. The value operand of a 4456 // store will remain scalar if the store is scalarized. 4457 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4458 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4459 assert(WideningDecision != CM_Unknown && 4460 "Widening decision should be ready at this moment"); 4461 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4462 if (Ptr == Store->getValueOperand()) 4463 return WideningDecision == CM_Scalarize; 4464 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4465 "Ptr is neither a value or pointer operand"); 4466 return WideningDecision != CM_GatherScatter; 4467 }; 4468 4469 // A helper that returns true if the given value is a bitcast or 4470 // getelementptr instruction contained in the loop. 4471 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4472 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4473 isa<GetElementPtrInst>(V)) && 4474 !TheLoop->isLoopInvariant(V); 4475 }; 4476 4477 // A helper that evaluates a memory access's use of a pointer. If the use 4478 // will be a scalar use, and the pointer is only used by memory accesses, we 4479 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4480 // PossibleNonScalarPtrs. 4481 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4482 // We only care about bitcast and getelementptr instructions contained in 4483 // the loop. 4484 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4485 return; 4486 4487 // If the pointer has already been identified as scalar (e.g., if it was 4488 // also identified as uniform), there's nothing to do. 4489 auto *I = cast<Instruction>(Ptr); 4490 if (Worklist.count(I)) 4491 return; 4492 4493 // If the use of the pointer will be a scalar use, and all users of the 4494 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4495 // place the pointer in PossibleNonScalarPtrs. 4496 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4497 return isa<LoadInst>(U) || isa<StoreInst>(U); 4498 })) 4499 ScalarPtrs.insert(I); 4500 else 4501 PossibleNonScalarPtrs.insert(I); 4502 }; 4503 4504 // We seed the scalars analysis with three classes of instructions: (1) 4505 // instructions marked uniform-after-vectorization, (2) bitcast and 4506 // getelementptr instructions used by memory accesses requiring a scalar use, 4507 // and (3) pointer induction variables and their update instructions (we 4508 // currently only scalarize these). 4509 // 4510 // (1) Add to the worklist all instructions that have been identified as 4511 // uniform-after-vectorization. 4512 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4513 4514 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4515 // memory accesses requiring a scalar use. The pointer operands of loads and 4516 // stores will be scalar as long as the memory accesses is not a gather or 4517 // scatter operation. The value operand of a store will remain scalar if the 4518 // store is scalarized. 
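  // Illustrative example (assumed source, not derived from the code above):
  // in
  //   for (i = 0; i < n; ++i)
  //     if (c[i]) a[i] = 0;
  // the store to a[i] may be scalarized (e.g. when masked stores are not
  // legal), in which case its getelementptr is a scalar use; if that GEP is
  // used only by memory accesses it lands in ScalarPtrs, otherwise it goes
  // to PossibleNonScalarPtrs.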
4519 for (auto *BB : TheLoop->blocks()) 4520 for (auto &I : *BB) { 4521 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4522 evaluatePtrUse(Load, Load->getPointerOperand()); 4523 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4524 evaluatePtrUse(Store, Store->getPointerOperand()); 4525 evaluatePtrUse(Store, Store->getValueOperand()); 4526 } 4527 } 4528 for (auto *I : ScalarPtrs) 4529 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4530 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4531 Worklist.insert(I); 4532 } 4533 4534 // (3) Add to the worklist all pointer induction variables and their update 4535 // instructions. 4536 // 4537 // TODO: Once we are able to vectorize pointer induction variables we should 4538 // no longer insert them into the worklist here. 4539 auto *Latch = TheLoop->getLoopLatch(); 4540 for (auto &Induction : Legal->getInductionVars()) { 4541 auto *Ind = Induction.first; 4542 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4543 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4544 continue; 4545 Worklist.insert(Ind); 4546 Worklist.insert(IndUpdate); 4547 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4548 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4549 << "\n"); 4550 } 4551 4552 // Insert the forced scalars. 4553 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4554 // induction variable when the PHI user is scalarized. 4555 auto ForcedScalar = ForcedScalars.find(VF); 4556 if (ForcedScalar != ForcedScalars.end()) 4557 for (auto *I : ForcedScalar->second) 4558 Worklist.insert(I); 4559 4560 // Expand the worklist by looking through any bitcasts and getelementptr 4561 // instructions we've already identified as scalar. This is similar to the 4562 // expansion step in collectLoopUniforms(); however, here we're only 4563 // expanding to include additional bitcasts and getelementptr instructions. 4564 unsigned Idx = 0; 4565 while (Idx != Worklist.size()) { 4566 Instruction *Dst = Worklist[Idx++]; 4567 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4568 continue; 4569 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4570 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4571 auto *J = cast<Instruction>(U); 4572 return !TheLoop->contains(J) || Worklist.count(J) || 4573 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4574 isScalarUse(J, Src)); 4575 })) { 4576 Worklist.insert(Src); 4577 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4578 } 4579 } 4580 4581 // An induction variable will remain scalar if all users of the induction 4582 // variable and induction variable update remain scalar. 4583 for (auto &Induction : Legal->getInductionVars()) { 4584 auto *Ind = Induction.first; 4585 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4586 4587 // We already considered pointer induction variables, so there's no reason 4588 // to look at their users again. 4589 // 4590 // TODO: Once we are able to vectorize pointer induction variables we 4591 // should no longer skip over them here. 4592 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4593 continue; 4594 4595 // If tail-folding is applied, the primary induction variable will be used 4596 // to feed a vector compare. 
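    // For example (sketch), with VF = 4 the folded tail keeps lanes alive
    // with a mask roughly of the form
    //   %mask = icmp ule <4 x i64> %vec.iv, %broadcast.btc
    // where %broadcast.btc is the splat of the backedge-taken count, so the
    // widened primary induction is still required and the phi is skipped
    // here rather than added to the scalar worklist.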
4597 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4598 continue; 4599 4600 // Determine if all users of the induction variable are scalar after 4601 // vectorization. 4602 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4603 auto *I = cast<Instruction>(U); 4604 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4605 }); 4606 if (!ScalarInd) 4607 continue; 4608 4609 // Determine if all users of the induction variable update instruction are 4610 // scalar after vectorization. 4611 auto ScalarIndUpdate = 4612 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4613 auto *I = cast<Instruction>(U); 4614 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4615 }); 4616 if (!ScalarIndUpdate) 4617 continue; 4618 4619 // The induction variable and its update instruction will remain scalar. 4620 Worklist.insert(Ind); 4621 Worklist.insert(IndUpdate); 4622 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4623 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4624 << "\n"); 4625 } 4626 4627 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4628 } 4629 4630 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4631 if (!blockNeedsPredication(I->getParent())) 4632 return false; 4633 switch(I->getOpcode()) { 4634 default: 4635 break; 4636 case Instruction::Load: 4637 case Instruction::Store: { 4638 if (!Legal->isMaskRequired(I)) 4639 return false; 4640 auto *Ptr = getLoadStorePointerOperand(I); 4641 auto *Ty = getMemInstValueType(I); 4642 // We have already decided how to vectorize this instruction, get that 4643 // result. 4644 if (VF > 1) { 4645 InstWidening WideningDecision = getWideningDecision(I, VF); 4646 assert(WideningDecision != CM_Unknown && 4647 "Widening decision should be ready at this moment"); 4648 return WideningDecision == CM_Scalarize; 4649 } 4650 const Align Alignment = getLoadStoreAlignment(I); 4651 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4652 isLegalMaskedGather(Ty, Alignment)) 4653 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4654 isLegalMaskedScatter(Ty, Alignment)); 4655 } 4656 case Instruction::UDiv: 4657 case Instruction::SDiv: 4658 case Instruction::SRem: 4659 case Instruction::URem: 4660 return mayDivideByZero(*I); 4661 } 4662 return false; 4663 } 4664 4665 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4666 unsigned VF) { 4667 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4668 assert(getWideningDecision(I, VF) == CM_Unknown && 4669 "Decision should not be set yet."); 4670 auto *Group = getInterleavedAccessGroup(I); 4671 assert(Group && "Must have a group."); 4672 4673 // If the instruction's allocated size doesn't equal it's type size, it 4674 // requires padding and will be scalarized. 4675 auto &DL = I->getModule()->getDataLayout(); 4676 auto *ScalarTy = getMemInstValueType(I); 4677 if (hasIrregularType(ScalarTy, DL, VF)) 4678 return false; 4679 4680 // Check if masking is required. 4681 // A Group may need masking for one of two reasons: it resides in a block that 4682 // needs predication, or it was decided to use masking to deal with gaps. 
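  // Gap example (illustrative): an interleave group that loads a[3*i] and
  // a[3*i+1] but not a[3*i+2] has a gap, so the wide load of the last vector
  // iteration could touch memory the scalar loop never accesses. Normally a
  // scalar epilogue hides this; when no such epilogue is allowed, the group
  // can only be kept if the gap lanes are masked off.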
4683 bool PredicatedAccessRequiresMasking = 4684 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4685 bool AccessWithGapsRequiresMasking = 4686 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4687 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4688 return true; 4689 4690 // If masked interleaving is required, we expect that the user/target had 4691 // enabled it, because otherwise it either wouldn't have been created or 4692 // it should have been invalidated by the CostModel. 4693 assert(useMaskedInterleavedAccesses(TTI) && 4694 "Masked interleave-groups for predicated accesses are not enabled."); 4695 4696 auto *Ty = getMemInstValueType(I); 4697 const Align Alignment = getLoadStoreAlignment(I); 4698 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4699 : TTI.isLegalMaskedStore(Ty, Alignment); 4700 } 4701 4702 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4703 unsigned VF) { 4704 // Get and ensure we have a valid memory instruction. 4705 LoadInst *LI = dyn_cast<LoadInst>(I); 4706 StoreInst *SI = dyn_cast<StoreInst>(I); 4707 assert((LI || SI) && "Invalid memory instruction"); 4708 4709 auto *Ptr = getLoadStorePointerOperand(I); 4710 4711 // In order to be widened, the pointer should be consecutive, first of all. 4712 if (!Legal->isConsecutivePtr(Ptr)) 4713 return false; 4714 4715 // If the instruction is a store located in a predicated block, it will be 4716 // scalarized. 4717 if (isScalarWithPredication(I)) 4718 return false; 4719 4720 // If the instruction's allocated size doesn't equal it's type size, it 4721 // requires padding and will be scalarized. 4722 auto &DL = I->getModule()->getDataLayout(); 4723 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4724 if (hasIrregularType(ScalarTy, DL, VF)) 4725 return false; 4726 4727 return true; 4728 } 4729 4730 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4731 // We should not collect Uniforms more than once per VF. Right now, 4732 // this function is called from collectUniformsAndScalars(), which 4733 // already does this check. Collecting Uniforms for VF=1 does not make any 4734 // sense. 4735 4736 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4737 "This function should not be visited twice for the same VF"); 4738 4739 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4740 // not analyze again. Uniforms.count(VF) will return 1. 4741 Uniforms[VF].clear(); 4742 4743 // We now know that the loop is vectorizable! 4744 // Collect instructions inside the loop that will remain uniform after 4745 // vectorization. 4746 4747 // Global values, params and instructions outside of current loop are out of 4748 // scope. 4749 auto isOutOfScope = [&](Value *V) -> bool { 4750 Instruction *I = dyn_cast<Instruction>(V); 4751 return (!I || !TheLoop->contains(I)); 4752 }; 4753 4754 SetVector<Instruction *> Worklist; 4755 BasicBlock *Latch = TheLoop->getLoopLatch(); 4756 4757 // Instructions that are scalar with predication must not be considered 4758 // uniform after vectorization, because that would create an erroneous 4759 // replicating region where only a single instance out of VF should be formed. 4760 // TODO: optimize such seldom cases if found important, see PR40816. 
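  // Sketch of the problem being avoided (assumed scenario): if a scalarized,
  // predicated instruction were also treated as uniform, the replicating
  // region built for it would produce a single instance for all VF lanes
  // even though its predicate is evaluated per lane, so lanes whose
  // predicate differs from lane 0 would be handled incorrectly.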
4761 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4762 if (isScalarWithPredication(I, VF)) { 4763 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4764 << *I << "\n"); 4765 return; 4766 } 4767 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4768 Worklist.insert(I); 4769 }; 4770 4771 // Start with the conditional branch. If the branch condition is an 4772 // instruction contained in the loop that is only used by the branch, it is 4773 // uniform. 4774 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4775 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4776 addToWorklistIfAllowed(Cmp); 4777 4778 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4779 // are pointers that are treated like consecutive pointers during 4780 // vectorization. The pointer operands of interleaved accesses are an 4781 // example. 4782 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4783 4784 // Holds pointer operands of instructions that are possibly non-uniform. 4785 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4786 4787 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4788 InstWidening WideningDecision = getWideningDecision(I, VF); 4789 assert(WideningDecision != CM_Unknown && 4790 "Widening decision should be ready at this moment"); 4791 4792 return (WideningDecision == CM_Widen || 4793 WideningDecision == CM_Widen_Reverse || 4794 WideningDecision == CM_Interleave); 4795 }; 4796 // Iterate over the instructions in the loop, and collect all 4797 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4798 // that a consecutive-like pointer operand will be scalarized, we collect it 4799 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4800 // getelementptr instruction can be used by both vectorized and scalarized 4801 // memory instructions. For example, if a loop loads and stores from the same 4802 // location, but the store is conditional, the store will be scalarized, and 4803 // the getelementptr won't remain uniform. 4804 for (auto *BB : TheLoop->blocks()) 4805 for (auto &I : *BB) { 4806 // If there's no pointer operand, there's nothing to do. 4807 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4808 if (!Ptr) 4809 continue; 4810 4811 // True if all users of Ptr are memory accesses that have Ptr as their 4812 // pointer operand. 4813 auto UsersAreMemAccesses = 4814 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4815 return getLoadStorePointerOperand(U) == Ptr; 4816 }); 4817 4818 // Ensure the memory instruction will not be scalarized or used by 4819 // gather/scatter, making its pointer operand non-uniform. If the pointer 4820 // operand is used by any instruction other than a memory access, we 4821 // conservatively assume the pointer operand may be non-uniform. 4822 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4823 PossibleNonUniformPtrs.insert(Ptr); 4824 4825 // If the memory instruction will be vectorized and its pointer operand 4826 // is consecutive-like, or interleaving - the pointer operand should 4827 // remain uniform. 4828 else 4829 ConsecutiveLikePtrs.insert(Ptr); 4830 } 4831 4832 // Add to the Worklist all consecutive and consecutive-like pointers that 4833 // aren't also identified as possibly non-uniform. 
4834 for (auto *V : ConsecutiveLikePtrs) 4835 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4836 addToWorklistIfAllowed(V); 4837 4838 // Expand Worklist in topological order: whenever a new instruction 4839 // is added , its users should be already inside Worklist. It ensures 4840 // a uniform instruction will only be used by uniform instructions. 4841 unsigned idx = 0; 4842 while (idx != Worklist.size()) { 4843 Instruction *I = Worklist[idx++]; 4844 4845 for (auto OV : I->operand_values()) { 4846 // isOutOfScope operands cannot be uniform instructions. 4847 if (isOutOfScope(OV)) 4848 continue; 4849 // First order recurrence Phi's should typically be considered 4850 // non-uniform. 4851 auto *OP = dyn_cast<PHINode>(OV); 4852 if (OP && Legal->isFirstOrderRecurrence(OP)) 4853 continue; 4854 // If all the users of the operand are uniform, then add the 4855 // operand into the uniform worklist. 4856 auto *OI = cast<Instruction>(OV); 4857 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4858 auto *J = cast<Instruction>(U); 4859 return Worklist.count(J) || 4860 (OI == getLoadStorePointerOperand(J) && 4861 isUniformDecision(J, VF)); 4862 })) 4863 addToWorklistIfAllowed(OI); 4864 } 4865 } 4866 4867 // Returns true if Ptr is the pointer operand of a memory access instruction 4868 // I, and I is known to not require scalarization. 4869 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4870 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4871 }; 4872 4873 // For an instruction to be added into Worklist above, all its users inside 4874 // the loop should also be in Worklist. However, this condition cannot be 4875 // true for phi nodes that form a cyclic dependence. We must process phi 4876 // nodes separately. An induction variable will remain uniform if all users 4877 // of the induction variable and induction variable update remain uniform. 4878 // The code below handles both pointer and non-pointer induction variables. 4879 for (auto &Induction : Legal->getInductionVars()) { 4880 auto *Ind = Induction.first; 4881 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4882 4883 // Determine if all users of the induction variable are uniform after 4884 // vectorization. 4885 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4886 auto *I = cast<Instruction>(U); 4887 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4888 isVectorizedMemAccessUse(I, Ind); 4889 }); 4890 if (!UniformInd) 4891 continue; 4892 4893 // Determine if all users of the induction variable update instruction are 4894 // uniform after vectorization. 4895 auto UniformIndUpdate = 4896 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4897 auto *I = cast<Instruction>(U); 4898 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4899 isVectorizedMemAccessUse(I, IndUpdate); 4900 }); 4901 if (!UniformIndUpdate) 4902 continue; 4903 4904 // The induction variable and its update instruction will remain uniform. 4905 addToWorklistIfAllowed(Ind); 4906 addToWorklistIfAllowed(IndUpdate); 4907 } 4908 4909 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4910 } 4911 4912 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4913 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4914 4915 if (Legal->getRuntimePointerChecking()->Need) { 4916 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4917 "runtime pointer checks needed. 
Enable vectorization of this " 4918 "loop with '#pragma clang loop vectorize(enable)' when " 4919 "compiling with -Os/-Oz", 4920 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4921 return true; 4922 } 4923 4924 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4925 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4926 "runtime SCEV checks needed. Enable vectorization of this " 4927 "loop with '#pragma clang loop vectorize(enable)' when " 4928 "compiling with -Os/-Oz", 4929 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4930 return true; 4931 } 4932 4933 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4934 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4935 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4936 "runtime stride == 1 checks needed. Enable vectorization of " 4937 "this loop with '#pragma clang loop vectorize(enable)' when " 4938 "compiling with -Os/-Oz", 4939 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4940 return true; 4941 } 4942 4943 return false; 4944 } 4945 4946 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 4947 unsigned UserIC) { 4948 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4949 // TODO: It may by useful to do since it's still likely to be dynamically 4950 // uniform if the target can skip. 4951 reportVectorizationFailure( 4952 "Not inserting runtime ptr check for divergent target", 4953 "runtime pointer checks needed. Not enabled for divergent target", 4954 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4955 return None; 4956 } 4957 4958 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4959 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4960 if (TC == 1) { 4961 reportVectorizationFailure("Single iteration (non) loop", 4962 "loop trip count is one, irrelevant for vectorization", 4963 "SingleIterationLoop", ORE, TheLoop); 4964 return None; 4965 } 4966 4967 switch (ScalarEpilogueStatus) { 4968 case CM_ScalarEpilogueAllowed: 4969 return UserVF ? UserVF : computeFeasibleMaxVF(TC); 4970 case CM_ScalarEpilogueNotNeededUsePredicate: 4971 LLVM_DEBUG( 4972 dbgs() << "LV: vector predicate hint/switch found.\n" 4973 << "LV: Not allowing scalar epilogue, creating predicated " 4974 << "vector loop.\n"); 4975 break; 4976 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4977 // fallthrough as a special case of OptForSize 4978 case CM_ScalarEpilogueNotAllowedOptSize: 4979 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4980 LLVM_DEBUG( 4981 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4982 else 4983 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4984 << "count.\n"); 4985 4986 // Bail if runtime checks are required, which are not good when optimising 4987 // for size. 4988 if (runtimeChecksRequired()) 4989 return None; 4990 break; 4991 } 4992 4993 // Now try the tail folding 4994 4995 // Invalidate interleave groups that require an epilogue if we can't mask 4996 // the interleave-group. 4997 if (!useMaskedInterleavedAccesses(TTI)) { 4998 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4999 "No decisions should have been taken at this point"); 5000 // Note: There is no need to invalidate any cost modeling decisions here, as 5001 // non where taken so far. 5002 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5003 } 5004 5005 unsigned MaxVF = UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5006 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5007 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5008 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5009 // Accept MaxVF if we do not have a tail. 5010 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5011 return MaxVF; 5012 } 5013 5014 // If we don't know the precise trip count, or if the trip count that we 5015 // found modulo the vectorization factor is not zero, try to fold the tail 5016 // by masking. 5017 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5018 if (Legal->prepareToFoldTailByMasking()) { 5019 FoldTailByMasking = true; 5020 return MaxVF; 5021 } 5022 5023 if (TC == 0) { 5024 reportVectorizationFailure( 5025 "Unable to calculate the loop count due to complex control flow", 5026 "unable to calculate the loop count due to complex control flow", 5027 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5028 return None; 5029 } 5030 5031 reportVectorizationFailure( 5032 "Cannot optimize for size and vectorize at the same time.", 5033 "cannot optimize for size and vectorize at the same time. " 5034 "Enable vectorization of this loop with '#pragma clang loop " 5035 "vectorize(enable)' when compiling with -Os/-Oz", 5036 "NoTailLoopWithOptForSize", ORE, TheLoop); 5037 return None; 5038 } 5039 5040 unsigned 5041 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5042 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5043 unsigned SmallestType, WidestType; 5044 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5045 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5046 5047 // Get the maximum safe dependence distance in bits computed by LAA. 5048 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5049 // the memory accesses that is most restrictive (involved in the smallest 5050 // dependence distance). 5051 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5052 5053 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5054 5055 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5056 WidestRegister = PowerOf2Floor(WidestRegister); 5057 5058 unsigned MaxVectorSize = WidestRegister / WidestType; 5059 5060 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5061 << " / " << WidestType << " bits.\n"); 5062 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5063 << WidestRegister << " bits.\n"); 5064 5065 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5066 " into one vector!"); 5067 if (MaxVectorSize == 0) { 5068 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5069 MaxVectorSize = 1; 5070 return MaxVectorSize; 5071 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5072 isPowerOf2_32(ConstTripCount)) { 5073 // We need to clamp the VF to be the ConstTripCount. There is no point in 5074 // choosing a higher viable VF as done in the loop below. 5075 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5076 << ConstTripCount << "\n"); 5077 MaxVectorSize = ConstTripCount; 5078 return MaxVectorSize; 5079 } 5080 5081 unsigned MaxVF = MaxVectorSize; 5082 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5083 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5084 // Collect all viable vectorization factors larger than the default MaxVF 5085 // (i.e. MaxVectorSize). 
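    // Worked example with made-up numbers: on a 128-bit register target with
    // WidestType = 32 and SmallestType = 8, MaxVectorSize is 128/32 = 4 and
    // WidestRegister/SmallestType is 128/8 = 16, so the loop below collects
    // the candidate VFs 8 and 16; the register-usage check then keeps the
    // largest candidate that does not exceed the available registers.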
5086 SmallVector<unsigned, 8> VFs; 5087 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5088 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5089 VFs.push_back(VS); 5090 5091 // For each VF calculate its register usage. 5092 auto RUs = calculateRegisterUsage(VFs); 5093 5094 // Select the largest VF which doesn't require more registers than existing 5095 // ones. 5096 for (int i = RUs.size() - 1; i >= 0; --i) { 5097 bool Selected = true; 5098 for (auto& pair : RUs[i].MaxLocalUsers) { 5099 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5100 if (pair.second > TargetNumRegisters) 5101 Selected = false; 5102 } 5103 if (Selected) { 5104 MaxVF = VFs[i]; 5105 break; 5106 } 5107 } 5108 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5109 if (MaxVF < MinVF) { 5110 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5111 << ") with target's minimum: " << MinVF << '\n'); 5112 MaxVF = MinVF; 5113 } 5114 } 5115 } 5116 return MaxVF; 5117 } 5118 5119 VectorizationFactor 5120 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5121 float Cost = expectedCost(1).first; 5122 const float ScalarCost = Cost; 5123 unsigned Width = 1; 5124 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5125 5126 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5127 if (ForceVectorization && MaxVF > 1) { 5128 // Ignore scalar width, because the user explicitly wants vectorization. 5129 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5130 // evaluation. 5131 Cost = std::numeric_limits<float>::max(); 5132 } 5133 5134 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5135 // Notice that the vector loop needs to be executed less times, so 5136 // we need to divide the cost of the vector loops by the width of 5137 // the vector elements. 5138 VectorizationCostTy C = expectedCost(i); 5139 float VectorCost = C.first / (float)i; 5140 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5141 << " costs: " << (int)VectorCost << ".\n"); 5142 if (!C.second && !ForceVectorization) { 5143 LLVM_DEBUG( 5144 dbgs() << "LV: Not considering vector loop of width " << i 5145 << " because it will not generate any vector instructions.\n"); 5146 continue; 5147 } 5148 if (VectorCost < Cost) { 5149 Cost = VectorCost; 5150 Width = i; 5151 } 5152 } 5153 5154 if (!EnableCondStoresVectorization && NumPredStores) { 5155 reportVectorizationFailure("There are conditional stores.", 5156 "store that is conditionally executed prevents vectorization", 5157 "ConditionalStore", ORE, TheLoop); 5158 Width = 1; 5159 Cost = ScalarCost; 5160 } 5161 5162 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5163 << "LV: Vectorization seems to be not beneficial, " 5164 << "but was forced by a user.\n"); 5165 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5166 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5167 return Factor; 5168 } 5169 5170 std::pair<unsigned, unsigned> 5171 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5172 unsigned MinWidth = -1U; 5173 unsigned MaxWidth = 8; 5174 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5175 5176 // For each block. 5177 for (BasicBlock *BB : TheLoop->blocks()) { 5178 // For each instruction in the loop. 5179 for (Instruction &I : BB->instructionsWithoutDebug()) { 5180 Type *T = I.getType(); 5181 5182 // Skip ignored values. 
5183 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5184 continue; 5185 5186 // Only examine Loads, Stores and PHINodes. 5187 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5188 continue; 5189 5190 // Examine PHI nodes that are reduction variables. Update the type to 5191 // account for the recurrence type. 5192 if (auto *PN = dyn_cast<PHINode>(&I)) { 5193 if (!Legal->isReductionVariable(PN)) 5194 continue; 5195 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5196 T = RdxDesc.getRecurrenceType(); 5197 } 5198 5199 // Examine the stored values. 5200 if (auto *ST = dyn_cast<StoreInst>(&I)) 5201 T = ST->getValueOperand()->getType(); 5202 5203 // Ignore loaded pointer types and stored pointer types that are not 5204 // vectorizable. 5205 // 5206 // FIXME: The check here attempts to predict whether a load or store will 5207 // be vectorized. We only know this for certain after a VF has 5208 // been selected. Here, we assume that if an access can be 5209 // vectorized, it will be. We should also look at extending this 5210 // optimization to non-pointer types. 5211 // 5212 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5213 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5214 continue; 5215 5216 MinWidth = std::min(MinWidth, 5217 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5218 MaxWidth = std::max(MaxWidth, 5219 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5220 } 5221 } 5222 5223 return {MinWidth, MaxWidth}; 5224 } 5225 5226 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5227 unsigned LoopCost) { 5228 // -- The interleave heuristics -- 5229 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5230 // There are many micro-architectural considerations that we can't predict 5231 // at this level. For example, frontend pressure (on decode or fetch) due to 5232 // code size, or the number and capabilities of the execution ports. 5233 // 5234 // We use the following heuristics to select the interleave count: 5235 // 1. If the code has reductions, then we interleave to break the cross 5236 // iteration dependency. 5237 // 2. If the loop is really small, then we interleave to reduce the loop 5238 // overhead. 5239 // 3. We don't interleave if we think that we will spill registers to memory 5240 // due to the increased register pressure. 5241 5242 if (!isScalarEpilogueAllowed()) 5243 return 1; 5244 5245 // We used the distance for the interleave count. 5246 if (Legal->getMaxSafeDepDistBytes() != -1U) 5247 return 1; 5248 5249 // Do not interleave loops with a relatively small known or estimated trip 5250 // count. 5251 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5252 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5253 return 1; 5254 5255 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5256 // We divide by these constants so assume that we have at least one 5257 // instruction that uses at least one register. 5258 for (auto& pair : R.MaxLocalUsers) { 5259 pair.second = std::max(pair.second, 1U); 5260 } 5261 5262 // We calculate the interleave count using the following formula. 5263 // Subtract the number of loop invariants from the number of available 5264 // registers. These registers are used by all of the interleaved instances. 5265 // Next, divide the remaining registers by the number of registers that is 5266 // required by the loop, in order to estimate how many parallel instances 5267 // fit without causing spills. 
All of this is rounded down if necessary to be 5268 // a power of two. We want power of two interleave count to simplify any 5269 // addressing operations or alignment considerations. 5270 // We also want power of two interleave counts to ensure that the induction 5271 // variable of the vector loop wraps to zero, when tail is folded by masking; 5272 // this currently happens when OptForSize, in which case IC is set to 1 above. 5273 unsigned IC = UINT_MAX; 5274 5275 for (auto& pair : R.MaxLocalUsers) { 5276 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5277 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5278 << " registers of " 5279 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5280 if (VF == 1) { 5281 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5282 TargetNumRegisters = ForceTargetNumScalarRegs; 5283 } else { 5284 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5285 TargetNumRegisters = ForceTargetNumVectorRegs; 5286 } 5287 unsigned MaxLocalUsers = pair.second; 5288 unsigned LoopInvariantRegs = 0; 5289 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5290 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5291 5292 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5293 // Don't count the induction variable as interleaved. 5294 if (EnableIndVarRegisterHeur) { 5295 TmpIC = 5296 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5297 std::max(1U, (MaxLocalUsers - 1))); 5298 } 5299 5300 IC = std::min(IC, TmpIC); 5301 } 5302 5303 // Clamp the interleave ranges to reasonable counts. 5304 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5305 5306 // Check if the user has overridden the max. 5307 if (VF == 1) { 5308 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5309 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5310 } else { 5311 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5312 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5313 } 5314 5315 // If trip count is known or estimated compile time constant, limit the 5316 // interleave count to be less than the trip count divided by VF. 5317 if (BestKnownTC) { 5318 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5319 } 5320 5321 // If we did not calculate the cost for VF (because the user selected the VF) 5322 // then we calculate the cost of VF here. 5323 if (LoopCost == 0) 5324 LoopCost = expectedCost(VF).first; 5325 5326 assert(LoopCost && "Non-zero loop cost expected"); 5327 5328 // Clamp the calculated IC to be between the 1 and the max interleave count 5329 // that the target and trip count allows. 5330 if (IC > MaxInterleaveCount) 5331 IC = MaxInterleaveCount; 5332 else if (IC < 1) 5333 IC = 1; 5334 5335 // Interleave if we vectorized this loop and there is a reduction that could 5336 // benefit from interleaving. 5337 if (VF > 1 && !Legal->getReductionVars().empty()) { 5338 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5339 return IC; 5340 } 5341 5342 // Note that if we've already vectorized the loop we will have done the 5343 // runtime check and so interleaving won't require further checks. 5344 bool InterleavingRequiresRuntimePointerCheck = 5345 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5346 5347 // We want to interleave small loops in order to reduce the loop overhead and 5348 // potentially expose ILP opportunities. 
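  // Worked example with made-up costs: if the register-pressure bound above
  // gave IC = 8, LoopCost is 4 and SmallLoopCost is at its usual default of
  // 20, the small-loop path below would allow at most
  //   PowerOf2Floor(20 / 4) = 4
  // interleaved copies, possibly overridden by the store/load port
  // saturation heuristic further down.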
5349 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5350 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5351 // We assume that the cost overhead is 1 and we use the cost model 5352 // to estimate the cost of the loop and interleave until the cost of the 5353 // loop overhead is about 5% of the cost of the loop. 5354 unsigned SmallIC = 5355 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5356 5357 // Interleave until store/load ports (estimated by max interleave count) are 5358 // saturated. 5359 unsigned NumStores = Legal->getNumStores(); 5360 unsigned NumLoads = Legal->getNumLoads(); 5361 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5362 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5363 5364 // If we have a scalar reduction (vector reductions are already dealt with 5365 // by this point), we can increase the critical path length if the loop 5366 // we're interleaving is inside another loop. Limit, by default to 2, so the 5367 // critical path only gets increased by one reduction operation. 5368 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5369 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5370 SmallIC = std::min(SmallIC, F); 5371 StoresIC = std::min(StoresIC, F); 5372 LoadsIC = std::min(LoadsIC, F); 5373 } 5374 5375 if (EnableLoadStoreRuntimeInterleave && 5376 std::max(StoresIC, LoadsIC) > SmallIC) { 5377 LLVM_DEBUG( 5378 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5379 return std::max(StoresIC, LoadsIC); 5380 } 5381 5382 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5383 return SmallIC; 5384 } 5385 5386 // Interleave if this is a large loop (small loops are already dealt with by 5387 // this point) that could benefit from interleaving. 5388 bool HasReductions = !Legal->getReductionVars().empty(); 5389 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5390 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5391 return IC; 5392 } 5393 5394 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5395 return 1; 5396 } 5397 5398 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5399 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5400 // This function calculates the register usage by measuring the highest number 5401 // of values that are alive at a single location. Obviously, this is a very 5402 // rough estimation. We scan the loop in a topological order in order and 5403 // assign a number to each instruction. We use RPO to ensure that defs are 5404 // met before their users. We assume that each instruction that has in-loop 5405 // users starts an interval. We record every time that an in-loop value is 5406 // used, so we have a list of the first and last occurrences of each 5407 // instruction. Next, we transpose this data structure into a multi map that 5408 // holds the list of intervals that *end* at a specific location. This multi 5409 // map allows us to perform a linear search. We scan the instructions linearly 5410 // and record each time that a new interval starts, by placing it in a set. 5411 // If we find this value in the multi-map then we remove it from the set. 5412 // The max register usage is the maximum size of the set. 5413 // We also search for instructions that are defined outside the loop, but are 5414 // used inside the loop. 
We need this number separately from the max-interval 5415 // usage number because when we unroll, loop-invariant values do not take 5416 // more register. 5417 LoopBlocksDFS DFS(TheLoop); 5418 DFS.perform(LI); 5419 5420 RegisterUsage RU; 5421 5422 // Each 'key' in the map opens a new interval. The values 5423 // of the map are the index of the 'last seen' usage of the 5424 // instruction that is the key. 5425 using IntervalMap = DenseMap<Instruction *, unsigned>; 5426 5427 // Maps instruction to its index. 5428 SmallVector<Instruction *, 64> IdxToInstr; 5429 // Marks the end of each interval. 5430 IntervalMap EndPoint; 5431 // Saves the list of instruction indices that are used in the loop. 5432 SmallPtrSet<Instruction *, 8> Ends; 5433 // Saves the list of values that are used in the loop but are 5434 // defined outside the loop, such as arguments and constants. 5435 SmallPtrSet<Value *, 8> LoopInvariants; 5436 5437 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5438 for (Instruction &I : BB->instructionsWithoutDebug()) { 5439 IdxToInstr.push_back(&I); 5440 5441 // Save the end location of each USE. 5442 for (Value *U : I.operands()) { 5443 auto *Instr = dyn_cast<Instruction>(U); 5444 5445 // Ignore non-instruction values such as arguments, constants, etc. 5446 if (!Instr) 5447 continue; 5448 5449 // If this instruction is outside the loop then record it and continue. 5450 if (!TheLoop->contains(Instr)) { 5451 LoopInvariants.insert(Instr); 5452 continue; 5453 } 5454 5455 // Overwrite previous end points. 5456 EndPoint[Instr] = IdxToInstr.size(); 5457 Ends.insert(Instr); 5458 } 5459 } 5460 } 5461 5462 // Saves the list of intervals that end with the index in 'key'. 5463 using InstrList = SmallVector<Instruction *, 2>; 5464 DenseMap<unsigned, InstrList> TransposeEnds; 5465 5466 // Transpose the EndPoints to a list of values that end at each index. 5467 for (auto &Interval : EndPoint) 5468 TransposeEnds[Interval.second].push_back(Interval.first); 5469 5470 SmallPtrSet<Instruction *, 8> OpenIntervals; 5471 5472 // Get the size of the widest register. 5473 unsigned MaxSafeDepDist = -1U; 5474 if (Legal->getMaxSafeDepDistBytes() != -1U) 5475 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5476 unsigned WidestRegister = 5477 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5478 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5479 5480 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5481 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5482 5483 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5484 5485 // A lambda that gets the register usage for the given type and VF. 5486 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5487 if (Ty->isTokenTy()) 5488 return 0U; 5489 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5490 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5491 }; 5492 5493 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5494 Instruction *I = IdxToInstr[i]; 5495 5496 // Remove all of the instructions that end at this location. 5497 InstrList &List = TransposeEnds[i]; 5498 for (Instruction *ToRemove : List) 5499 OpenIntervals.erase(ToRemove); 5500 5501 // Ignore instructions that are never used within the loop. 5502 if (Ends.find(I) == Ends.end()) 5503 continue; 5504 5505 // Skip ignored values. 5506 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5507 continue; 5508 5509 // For each VF find the maximum usage of registers. 
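    // Illustrative numbers for GetRegUsage above: with a 128-bit widest
    // register, an i64 value at VF = 4 counts as max(1, 4 * 64 / 128) = 2
    // registers, while scalar values (VF = 1) and values narrower than a
    // register count as 1.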
5510 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5511 // Count the number of live intervals. 5512 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5513 5514 if (VFs[j] == 1) { 5515 for (auto Inst : OpenIntervals) { 5516 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5517 if (RegUsage.find(ClassID) == RegUsage.end()) 5518 RegUsage[ClassID] = 1; 5519 else 5520 RegUsage[ClassID] += 1; 5521 } 5522 } else { 5523 collectUniformsAndScalars(VFs[j]); 5524 for (auto Inst : OpenIntervals) { 5525 // Skip ignored values for VF > 1. 5526 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5527 continue; 5528 if (isScalarAfterVectorization(Inst, VFs[j])) { 5529 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5530 if (RegUsage.find(ClassID) == RegUsage.end()) 5531 RegUsage[ClassID] = 1; 5532 else 5533 RegUsage[ClassID] += 1; 5534 } else { 5535 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5536 if (RegUsage.find(ClassID) == RegUsage.end()) 5537 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5538 else 5539 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5540 } 5541 } 5542 } 5543 5544 for (auto& pair : RegUsage) { 5545 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5546 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5547 else 5548 MaxUsages[j][pair.first] = pair.second; 5549 } 5550 } 5551 5552 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5553 << OpenIntervals.size() << '\n'); 5554 5555 // Add the current instruction to the list of open intervals. 5556 OpenIntervals.insert(I); 5557 } 5558 5559 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5560 SmallMapVector<unsigned, unsigned, 4> Invariant; 5561 5562 for (auto Inst : LoopInvariants) { 5563 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5564 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5565 if (Invariant.find(ClassID) == Invariant.end()) 5566 Invariant[ClassID] = Usage; 5567 else 5568 Invariant[ClassID] += Usage; 5569 } 5570 5571 LLVM_DEBUG({ 5572 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5573 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5574 << " item\n"; 5575 for (const auto &pair : MaxUsages[i]) { 5576 dbgs() << "LV(REG): RegisterClass: " 5577 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5578 << " registers\n"; 5579 } 5580 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5581 << " item\n"; 5582 for (const auto &pair : Invariant) { 5583 dbgs() << "LV(REG): RegisterClass: " 5584 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5585 << " registers\n"; 5586 } 5587 }); 5588 5589 RU.LoopInvariantRegs = Invariant; 5590 RU.MaxLocalUsers = MaxUsages[i]; 5591 RUs[i] = RU; 5592 } 5593 5594 return RUs; 5595 } 5596 5597 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5598 // TODO: Cost model for emulated masked load/store is completely 5599 // broken. This hack guides the cost model to use an artificially 5600 // high enough value to practically disable vectorization with such 5601 // operations, except where previously deployed legality hack allowed 5602 // using very low cost values. This is to avoid regressions coming simply 5603 // from moving "masked load/store" check from legality to cost model. 5604 // Masked Load/Gather emulation was previously never allowed. 5605 // Limited number of Masked Store/Scatter emulation was allowed. 
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5679 // 5680 // We assume we will only emit a value for lane zero of an instruction 5681 // marked uniform after vectorization, rather than VF identical values. 5682 // Thus, if we scalarize an instruction that uses a uniform, we would 5683 // create uses of values corresponding to the lanes we aren't emitting code 5684 // for. This behavior can be changed by allowing getScalarValue to clone 5685 // the lane zero values for uniforms rather than asserting. 5686 for (Use &U : I->operands()) 5687 if (auto *J = dyn_cast<Instruction>(U.get())) 5688 if (isUniformAfterVectorization(J, VF)) 5689 return false; 5690 5691 // Otherwise, we can scalarize the instruction. 5692 return true; 5693 }; 5694 5695 // Compute the expected cost discount from scalarizing the entire expression 5696 // feeding the predicated instruction. We currently only consider expressions 5697 // that are single-use instruction chains. 5698 Worklist.push_back(PredInst); 5699 while (!Worklist.empty()) { 5700 Instruction *I = Worklist.pop_back_val(); 5701 5702 // If we've already analyzed the instruction, there's nothing to do. 5703 if (ScalarCosts.find(I) != ScalarCosts.end()) 5704 continue; 5705 5706 // Compute the cost of the vector instruction. Note that this cost already 5707 // includes the scalarization overhead of the predicated instruction. 5708 unsigned VectorCost = getInstructionCost(I, VF).first; 5709 5710 // Compute the cost of the scalarized instruction. This cost is the cost of 5711 // the instruction as if it wasn't if-converted and instead remained in the 5712 // predicated block. We will scale this cost by block probability after 5713 // computing the scalarization overhead. 5714 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5715 5716 // Compute the scalarization overhead of needed insertelement instructions 5717 // and phi nodes. 5718 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5719 ScalarCost += TTI.getScalarizationOverhead( 5720 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5721 APInt::getAllOnesValue(VF), true, false); 5722 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5723 } 5724 5725 // Compute the scalarization overhead of needed extractelement 5726 // instructions. For each of the instruction's operands, if the operand can 5727 // be scalarized, add it to the worklist; otherwise, account for the 5728 // overhead. 5729 for (Use &U : I->operands()) 5730 if (auto *J = dyn_cast<Instruction>(U.get())) { 5731 assert(VectorType::isValidElementType(J->getType()) && 5732 "Instruction has non-scalar type"); 5733 if (canBeScalarized(J)) 5734 Worklist.push_back(J); 5735 else if (needsExtract(J, VF)) 5736 ScalarCost += TTI.getScalarizationOverhead( 5737 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5738 APInt::getAllOnesValue(VF), false, true); 5739 } 5740 5741 // Scale the total scalar cost by block probability. 5742 ScalarCost /= getReciprocalPredBlockProb(); 5743 5744 // Compute the discount. A non-negative discount means the vector version 5745 // of the instruction costs more, and scalarizing would be beneficial. 5746 Discount += VectorCost - ScalarCost; 5747 ScalarCosts[I] = ScalarCost; 5748 } 5749 5750 return Discount; 5751 } 5752 5753 LoopVectorizationCostModel::VectorizationCostTy 5754 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5755 VectorizationCostTy Cost; 5756 5757 // For each block. 5758 for (BasicBlock *BB : TheLoop->blocks()) { 5759 VectorizationCostTy BlockCost; 5760 5761 // For each instruction in the old loop. 
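    // Illustrative example (hypothetical costs, added for exposition): if the
    // instructions of a predicated block have scalar costs {2, 1, 4}, the
    // block cost at VF == 1 starts out as 7 and is then divided by the
    // reciprocal block probability below (assumed here to be the default of
    // 2, i.e. the block executes roughly half the time), giving 3.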
5762 for (Instruction &I : BB->instructionsWithoutDebug()) { 5763 // Skip ignored values. 5764 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5765 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5766 continue; 5767 5768 VectorizationCostTy C = getInstructionCost(&I, VF); 5769 5770 // Check if we should override the cost. 5771 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5772 C.first = ForceTargetInstructionCost; 5773 5774 BlockCost.first += C.first; 5775 BlockCost.second |= C.second; 5776 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5777 << " for VF " << VF << " For instruction: " << I 5778 << '\n'); 5779 } 5780 5781 // If we are vectorizing a predicated block, it will have been 5782 // if-converted. This means that the block's instructions (aside from 5783 // stores and instructions that may divide by zero) will now be 5784 // unconditionally executed. For the scalar case, we may not always execute 5785 // the predicated block. Thus, scale the block's cost by the probability of 5786 // executing it. 5787 if (VF == 1 && blockNeedsPredication(BB)) 5788 BlockCost.first /= getReciprocalPredBlockProb(); 5789 5790 Cost.first += BlockCost.first; 5791 Cost.second |= BlockCost.second; 5792 } 5793 5794 return Cost; 5795 } 5796 5797 /// Gets Address Access SCEV after verifying that the access pattern 5798 /// is loop invariant except the induction variable dependence. 5799 /// 5800 /// This SCEV can be sent to the Target in order to estimate the address 5801 /// calculation cost. 5802 static const SCEV *getAddressAccessSCEV( 5803 Value *Ptr, 5804 LoopVectorizationLegality *Legal, 5805 PredicatedScalarEvolution &PSE, 5806 const Loop *TheLoop) { 5807 5808 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5809 if (!Gep) 5810 return nullptr; 5811 5812 // We are looking for a gep with all loop invariant indices except for one 5813 // which should be an induction variable. 5814 auto SE = PSE.getSE(); 5815 unsigned NumOperands = Gep->getNumOperands(); 5816 for (unsigned i = 1; i < NumOperands; ++i) { 5817 Value *Opd = Gep->getOperand(i); 5818 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5819 !Legal->isInductionVariable(Opd)) 5820 return nullptr; 5821 } 5822 5823 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5824 return PSE.getSCEV(Ptr); 5825 } 5826 5827 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5828 return Legal->hasStride(I->getOperand(0)) || 5829 Legal->hasStride(I->getOperand(1)); 5830 } 5831 5832 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5833 unsigned VF) { 5834 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5835 Type *ValTy = getMemInstValueType(I); 5836 auto SE = PSE.getSE(); 5837 5838 unsigned AS = getLoadStoreAddressSpace(I); 5839 Value *Ptr = getLoadStorePointerOperand(I); 5840 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5841 5842 // Figure out whether the access is strided and get the stride value 5843 // if it's known in compile time 5844 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5845 5846 // Get the cost of the scalar memory instruction and address computation. 5847 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5848 5849 // Don't pass *I here, since it is scalar but will actually be part of a 5850 // vectorized loop where the user of it is a vectorized instruction. 
5851 const Align Alignment = getLoadStoreAlignment(I); 5852 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5853 Alignment, AS, 5854 TTI::TCK_RecipThroughput); 5855 5856 // Get the overhead of the extractelement and insertelement instructions 5857 // we might create due to scalarization. 5858 Cost += getScalarizationOverhead(I, VF); 5859 5860 // If we have a predicated store, it may not be executed for each vector 5861 // lane. Scale the cost by the probability of executing the predicated 5862 // block. 5863 if (isPredicatedInst(I)) { 5864 Cost /= getReciprocalPredBlockProb(); 5865 5866 if (useEmulatedMaskMemRefHack(I)) 5867 // Artificially setting to a high enough value to practically disable 5868 // vectorization with such operations. 5869 Cost = 3000000; 5870 } 5871 5872 return Cost; 5873 } 5874 5875 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5876 unsigned VF) { 5877 Type *ValTy = getMemInstValueType(I); 5878 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5879 Value *Ptr = getLoadStorePointerOperand(I); 5880 unsigned AS = getLoadStoreAddressSpace(I); 5881 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5882 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5883 5884 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5885 "Stride should be 1 or -1 for consecutive memory access"); 5886 const Align Alignment = getLoadStoreAlignment(I); 5887 unsigned Cost = 0; 5888 if (Legal->isMaskRequired(I)) 5889 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5890 Alignment.value(), AS, CostKind); 5891 else 5892 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5893 CostKind, I); 5894 5895 bool Reverse = ConsecutiveStride < 0; 5896 if (Reverse) 5897 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5898 return Cost; 5899 } 5900 5901 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5902 unsigned VF) { 5903 Type *ValTy = getMemInstValueType(I); 5904 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5905 const Align Alignment = getLoadStoreAlignment(I); 5906 unsigned AS = getLoadStoreAddressSpace(I); 5907 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5908 if (isa<LoadInst>(I)) { 5909 return TTI.getAddressComputationCost(ValTy) + 5910 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5911 CostKind) + 5912 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5913 } 5914 StoreInst *SI = cast<StoreInst>(I); 5915 5916 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5917 return TTI.getAddressComputationCost(ValTy) + 5918 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5919 CostKind) + 5920 (isLoopInvariantStoreValue 5921 ? 
0 5922 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5923 VF - 1)); 5924 } 5925 5926 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5927 unsigned VF) { 5928 Type *ValTy = getMemInstValueType(I); 5929 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5930 const Align Alignment = getLoadStoreAlignment(I); 5931 Value *Ptr = getLoadStorePointerOperand(I); 5932 5933 return TTI.getAddressComputationCost(VectorTy) + 5934 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5935 Legal->isMaskRequired(I), Alignment.value(), 5936 TargetTransformInfo::TCK_RecipThroughput, 5937 I); 5938 } 5939 5940 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5941 unsigned VF) { 5942 Type *ValTy = getMemInstValueType(I); 5943 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5944 unsigned AS = getLoadStoreAddressSpace(I); 5945 5946 auto Group = getInterleavedAccessGroup(I); 5947 assert(Group && "Fail to get an interleaved access group."); 5948 5949 unsigned InterleaveFactor = Group->getFactor(); 5950 VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5951 5952 // Holds the indices of existing members in an interleaved load group. 5953 // An interleaved store group doesn't need this as it doesn't allow gaps. 5954 SmallVector<unsigned, 4> Indices; 5955 if (isa<LoadInst>(I)) { 5956 for (unsigned i = 0; i < InterleaveFactor; i++) 5957 if (Group->getMember(i)) 5958 Indices.push_back(i); 5959 } 5960 5961 // Calculate the cost of the whole interleaved group. 5962 bool UseMaskForGaps = 5963 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5964 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5965 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5966 Group->getAlign().value(), AS, TTI::TCK_RecipThroughput, 5967 Legal->isMaskRequired(I), UseMaskForGaps); 5968 5969 if (Group->isReverse()) { 5970 // TODO: Add support for reversed masked interleaved access. 5971 assert(!Legal->isMaskRequired(I) && 5972 "Reverse masked interleaved access not supported."); 5973 Cost += Group->getNumMembers() * 5974 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5975 } 5976 return Cost; 5977 } 5978 5979 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5980 unsigned VF) { 5981 // Calculate scalar cost only. Vectorization cost should be ready at this 5982 // moment. 5983 if (VF == 1) { 5984 Type *ValTy = getMemInstValueType(I); 5985 const Align Alignment = getLoadStoreAlignment(I); 5986 unsigned AS = getLoadStoreAddressSpace(I); 5987 5988 return TTI.getAddressComputationCost(ValTy) + 5989 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 5990 TTI::TCK_RecipThroughput, I); 5991 } 5992 return getWideningCost(I, VF); 5993 } 5994 5995 LoopVectorizationCostModel::VectorizationCostTy 5996 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5997 // If we know that this instruction will remain uniform, check the cost of 5998 // the scalar version. 5999 if (isUniformAfterVectorization(I, VF)) 6000 VF = 1; 6001 6002 if (VF > 1 && isProfitableToScalarize(I, VF)) 6003 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6004 6005 // Forced scalars do not have any scalarization overhead. 
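  // Illustrative example (hypothetical cost, added for exposition): at
  // VF == 4, a forced-scalar instruction with scalar cost 2 is costed as
  // 4 * 2 = 8 below, and no insert/extract (scalarization) overhead is added.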
6006 auto ForcedScalar = ForcedScalars.find(VF); 6007 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 6008 auto InstSet = ForcedScalar->second; 6009 if (InstSet.find(I) != InstSet.end()) 6010 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 6011 } 6012 6013 Type *VectorTy; 6014 unsigned C = getInstructionCost(I, VF, VectorTy); 6015 6016 bool TypeNotScalarized = 6017 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 6018 return VectorizationCostTy(C, TypeNotScalarized); 6019 } 6020 6021 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6022 unsigned VF) { 6023 6024 if (VF == 1) 6025 return 0; 6026 6027 unsigned Cost = 0; 6028 Type *RetTy = ToVectorTy(I->getType(), VF); 6029 if (!RetTy->isVoidTy() && 6030 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6031 Cost += TTI.getScalarizationOverhead( 6032 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false); 6033 6034 // Some targets keep addresses scalar. 6035 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6036 return Cost; 6037 6038 // Some targets support efficient element stores. 6039 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6040 return Cost; 6041 6042 // Collect operands to consider. 6043 CallInst *CI = dyn_cast<CallInst>(I); 6044 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6045 6046 // Skip operands that do not require extraction/scalarization and do not incur 6047 // any overhead. 6048 return Cost + TTI.getOperandsScalarizationOverhead( 6049 filterExtractingOperands(Ops, VF), VF); 6050 } 6051 6052 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6053 if (VF == 1) 6054 return; 6055 NumPredStores = 0; 6056 for (BasicBlock *BB : TheLoop->blocks()) { 6057 // For each instruction in the old loop. 6058 for (Instruction &I : *BB) { 6059 Value *Ptr = getLoadStorePointerOperand(&I); 6060 if (!Ptr) 6061 continue; 6062 6063 // TODO: We should generate better code and update the cost model for 6064 // predicated uniform stores. Today they are treated as any other 6065 // predicated store (see added test cases in 6066 // invariant-store-vectorization.ll). 6067 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6068 NumPredStores++; 6069 6070 if (Legal->isUniform(Ptr) && 6071 // Conditional loads and stores should be scalarized and predicated. 6072 // isScalarWithPredication cannot be used here since masked 6073 // gather/scatters are not considered scalar with predication. 6074 !Legal->blockNeedsPredication(I.getParent())) { 6075 // TODO: Avoid replicating loads and stores instead of 6076 // relying on instcombine to remove them. 6077 // Load: Scalar load + broadcast 6078 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6079 unsigned Cost = getUniformMemOpCost(&I, VF); 6080 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6081 continue; 6082 } 6083 6084 // We assume that widening is the best solution when possible. 6085 if (memoryInstructionCanBeWidened(&I, VF)) { 6086 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6087 int ConsecutiveStride = 6088 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6089 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6090 "Expected consecutive stride."); 6091 InstWidening Decision = 6092 ConsecutiveStride == 1 ? 
CM_Widen : CM_Widen_Reverse; 6093 setWideningDecision(&I, VF, Decision, Cost); 6094 continue; 6095 } 6096 6097 // Choose between Interleaving, Gather/Scatter or Scalarization. 6098 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6099 unsigned NumAccesses = 1; 6100 if (isAccessInterleaved(&I)) { 6101 auto Group = getInterleavedAccessGroup(&I); 6102 assert(Group && "Fail to get an interleaved access group."); 6103 6104 // Make one decision for the whole group. 6105 if (getWideningDecision(&I, VF) != CM_Unknown) 6106 continue; 6107 6108 NumAccesses = Group->getNumMembers(); 6109 if (interleavedAccessCanBeWidened(&I, VF)) 6110 InterleaveCost = getInterleaveGroupCost(&I, VF); 6111 } 6112 6113 unsigned GatherScatterCost = 6114 isLegalGatherOrScatter(&I) 6115 ? getGatherScatterCost(&I, VF) * NumAccesses 6116 : std::numeric_limits<unsigned>::max(); 6117 6118 unsigned ScalarizationCost = 6119 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6120 6121 // Choose better solution for the current VF, 6122 // write down this decision and use it during vectorization. 6123 unsigned Cost; 6124 InstWidening Decision; 6125 if (InterleaveCost <= GatherScatterCost && 6126 InterleaveCost < ScalarizationCost) { 6127 Decision = CM_Interleave; 6128 Cost = InterleaveCost; 6129 } else if (GatherScatterCost < ScalarizationCost) { 6130 Decision = CM_GatherScatter; 6131 Cost = GatherScatterCost; 6132 } else { 6133 Decision = CM_Scalarize; 6134 Cost = ScalarizationCost; 6135 } 6136 // If the instructions belongs to an interleave group, the whole group 6137 // receives the same decision. The whole group receives the cost, but 6138 // the cost will actually be assigned to one instruction. 6139 if (auto Group = getInterleavedAccessGroup(&I)) 6140 setWideningDecision(Group, VF, Decision, Cost); 6141 else 6142 setWideningDecision(&I, VF, Decision, Cost); 6143 } 6144 } 6145 6146 // Make sure that any load of address and any other address computation 6147 // remains scalar unless there is gather/scatter support. This avoids 6148 // inevitable extracts into address registers, and also has the benefit of 6149 // activating LSR more, since that pass can't optimize vectorized 6150 // addresses. 6151 if (TTI.prefersVectorizedAddressing()) 6152 return; 6153 6154 // Start with all scalar pointer uses. 6155 SmallPtrSet<Instruction *, 8> AddrDefs; 6156 for (BasicBlock *BB : TheLoop->blocks()) 6157 for (Instruction &I : *BB) { 6158 Instruction *PtrDef = 6159 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6160 if (PtrDef && TheLoop->contains(PtrDef) && 6161 getWideningDecision(&I, VF) != CM_GatherScatter) 6162 AddrDefs.insert(PtrDef); 6163 } 6164 6165 // Add all instructions used to generate the addresses. 6166 SmallVector<Instruction *, 4> Worklist; 6167 for (auto *I : AddrDefs) 6168 Worklist.push_back(I); 6169 while (!Worklist.empty()) { 6170 Instruction *I = Worklist.pop_back_val(); 6171 for (auto &Op : I->operands()) 6172 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6173 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6174 AddrDefs.insert(InstOp).second) 6175 Worklist.push_back(InstOp); 6176 } 6177 6178 for (auto *I : AddrDefs) { 6179 if (isa<LoadInst>(I)) { 6180 // Setting the desired widening decision should ideally be handled in 6181 // by cost functions, but since this involves the task of finding out 6182 // if the loaded register is involved in an address computation, it is 6183 // instead changed here when we know this is the case. 
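      // Illustrative example (hypothetical IR, added for exposition):
      //   %p = load i32*, i32** %q
      //   %v = load i32, i32* %p
      // Here %p feeds an address computation. Widening the load of %p would
      // force an extract per lane to form the scalar addresses of the second
      // load (absent gather support), so the load of %p is scalarized instead.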
6184 InstWidening Decision = getWideningDecision(I, VF); 6185 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6186 // Scalarize a widened load of address. 6187 setWideningDecision(I, VF, CM_Scalarize, 6188 (VF * getMemoryInstructionCost(I, 1))); 6189 else if (auto Group = getInterleavedAccessGroup(I)) { 6190 // Scalarize an interleave group of address loads. 6191 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6192 if (Instruction *Member = Group->getMember(I)) 6193 setWideningDecision(Member, VF, CM_Scalarize, 6194 (VF * getMemoryInstructionCost(Member, 1))); 6195 } 6196 } 6197 } else 6198 // Make sure I gets scalarized and a cost estimate without 6199 // scalarization overhead. 6200 ForcedScalars[VF].insert(I); 6201 } 6202 } 6203 6204 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6205 unsigned VF, 6206 Type *&VectorTy) { 6207 Type *RetTy = I->getType(); 6208 if (canTruncateToMinimalBitwidth(I, VF)) 6209 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6210 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6211 auto SE = PSE.getSE(); 6212 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6213 6214 // TODO: We need to estimate the cost of intrinsic calls. 6215 switch (I->getOpcode()) { 6216 case Instruction::GetElementPtr: 6217 // We mark this instruction as zero-cost because the cost of GEPs in 6218 // vectorized code depends on whether the corresponding memory instruction 6219 // is scalarized or not. Therefore, we handle GEPs with the memory 6220 // instruction cost. 6221 return 0; 6222 case Instruction::Br: { 6223 // In cases of scalarized and predicated instructions, there will be VF 6224 // predicated blocks in the vectorized loop. Each branch around these 6225 // blocks requires also an extract of its vector compare i1 element. 6226 bool ScalarPredicatedBB = false; 6227 BranchInst *BI = cast<BranchInst>(I); 6228 if (VF > 1 && BI->isConditional() && 6229 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6230 PredicatedBBsAfterVectorization.end() || 6231 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6232 PredicatedBBsAfterVectorization.end())) 6233 ScalarPredicatedBB = true; 6234 6235 if (ScalarPredicatedBB) { 6236 // Return cost for branches around scalarized and predicated blocks. 6237 VectorType *Vec_i1Ty = 6238 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6239 return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), 6240 false, true) + 6241 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6242 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6243 // The back-edge branch will remain, as will all scalar branches. 6244 return TTI.getCFInstrCost(Instruction::Br); 6245 else 6246 // This branch will be eliminated by if-conversion. 6247 return 0; 6248 // Note: We currently assume zero cost for an unconditional branch inside 6249 // a predicated block since it will become a fall-through, although we 6250 // may decide in the future to call TTI for all branches. 6251 } 6252 case Instruction::PHI: { 6253 auto *Phi = cast<PHINode>(I); 6254 6255 // First-order recurrences are replaced by vector shuffles inside the loop. 6256 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
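    // Illustrative example (hypothetical source, added for exposition): for
    //   for (i = 1; i < n; ++i) b[i] = a[i] + a[i - 1];
    // the value carried across iterations by this phi is produced in the
    // vector loop by a shuffle with the vector from the previous vector
    // iteration; that shuffle is the cost being modelled here.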
6257 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6258 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6259 cast<VectorType>(VectorTy), VF - 1, 6260 VectorType::get(RetTy, 1)); 6261 6262 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6263 // converted into select instructions. We require N - 1 selects per phi 6264 // node, where N is the number of incoming values. 6265 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6266 return (Phi->getNumIncomingValues() - 1) * 6267 TTI.getCmpSelInstrCost( 6268 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6269 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6270 CostKind); 6271 6272 return TTI.getCFInstrCost(Instruction::PHI); 6273 } 6274 case Instruction::UDiv: 6275 case Instruction::SDiv: 6276 case Instruction::URem: 6277 case Instruction::SRem: 6278 // If we have a predicated instruction, it may not be executed for each 6279 // vector lane. Get the scalarization cost and scale this amount by the 6280 // probability of executing the predicated block. If the instruction is not 6281 // predicated, we fall through to the next case. 6282 if (VF > 1 && isScalarWithPredication(I)) { 6283 unsigned Cost = 0; 6284 6285 // These instructions have a non-void type, so account for the phi nodes 6286 // that we will create. This cost is likely to be zero. The phi node 6287 // cost, if any, should be scaled by the block probability because it 6288 // models a copy at the end of each predicated block. 6289 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6290 6291 // The cost of the non-predicated instruction. 6292 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6293 6294 // The cost of insertelement and extractelement instructions needed for 6295 // scalarization. 6296 Cost += getScalarizationOverhead(I, VF); 6297 6298 // Scale the cost by the probability of executing the predicated blocks. 6299 // This assumes the predicated block for each vector lane is equally 6300 // likely. 6301 return Cost / getReciprocalPredBlockProb(); 6302 } 6303 LLVM_FALLTHROUGH; 6304 case Instruction::Add: 6305 case Instruction::FAdd: 6306 case Instruction::Sub: 6307 case Instruction::FSub: 6308 case Instruction::Mul: 6309 case Instruction::FMul: 6310 case Instruction::FDiv: 6311 case Instruction::FRem: 6312 case Instruction::Shl: 6313 case Instruction::LShr: 6314 case Instruction::AShr: 6315 case Instruction::And: 6316 case Instruction::Or: 6317 case Instruction::Xor: { 6318 // Since we will replace the stride by 1 the multiplication should go away. 6319 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6320 return 0; 6321 // Certain instructions can be cheaper to vectorize if they have a constant 6322 // second vector operand. One example of this are shifts on x86. 6323 Value *Op2 = I->getOperand(1); 6324 TargetTransformInfo::OperandValueProperties Op2VP; 6325 TargetTransformInfo::OperandValueKind Op2VK = 6326 TTI.getOperandInfo(Op2, Op2VP); 6327 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6328 Op2VK = TargetTransformInfo::OK_UniformValue; 6329 6330 SmallVector<const Value *, 4> Operands(I->operand_values()); 6331 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6332 return N * TTI.getArithmeticInstrCost( 6333 I->getOpcode(), VectorTy, CostKind, 6334 TargetTransformInfo::OK_AnyValue, 6335 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6336 } 6337 case Instruction::FNeg: { 6338 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6339 return N * TTI.getArithmeticInstrCost( 6340 I->getOpcode(), VectorTy, CostKind, 6341 TargetTransformInfo::OK_AnyValue, 6342 TargetTransformInfo::OK_AnyValue, 6343 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6344 I->getOperand(0), I); 6345 } 6346 case Instruction::Select: { 6347 SelectInst *SI = cast<SelectInst>(I); 6348 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6349 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6350 Type *CondTy = SI->getCondition()->getType(); 6351 if (!ScalarCond) 6352 CondTy = VectorType::get(CondTy, VF); 6353 6354 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6355 CostKind, I); 6356 } 6357 case Instruction::ICmp: 6358 case Instruction::FCmp: { 6359 Type *ValTy = I->getOperand(0)->getType(); 6360 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6361 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6362 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6363 VectorTy = ToVectorTy(ValTy, VF); 6364 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6365 I); 6366 } 6367 case Instruction::Store: 6368 case Instruction::Load: { 6369 unsigned Width = VF; 6370 if (Width > 1) { 6371 InstWidening Decision = getWideningDecision(I, Width); 6372 assert(Decision != CM_Unknown && 6373 "CM decision should be taken at this point"); 6374 if (Decision == CM_Scalarize) 6375 Width = 1; 6376 } 6377 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6378 return getMemoryInstructionCost(I, VF); 6379 } 6380 case Instruction::ZExt: 6381 case Instruction::SExt: 6382 case Instruction::FPToUI: 6383 case Instruction::FPToSI: 6384 case Instruction::FPExt: 6385 case Instruction::PtrToInt: 6386 case Instruction::IntToPtr: 6387 case Instruction::SIToFP: 6388 case Instruction::UIToFP: 6389 case Instruction::Trunc: 6390 case Instruction::FPTrunc: 6391 case Instruction::BitCast: { 6392 // We optimize the truncation of induction variables having constant 6393 // integer steps. The cost of these truncations is the same as the scalar 6394 // operation. 6395 if (isOptimizableIVTruncate(I, VF)) { 6396 auto *Trunc = cast<TruncInst>(I); 6397 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6398 Trunc->getSrcTy(), CostKind, Trunc); 6399 } 6400 6401 Type *SrcScalarTy = I->getOperand(0)->getType(); 6402 Type *SrcVecTy = 6403 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6404 if (canTruncateToMinimalBitwidth(I, VF)) { 6405 // This cast is going to be shrunk. This may remove the cast or it might 6406 // turn it into slightly different cast. For example, if MinBW == 16, 6407 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6408 // 6409 // Calculate the modified src and dest types. 6410 Type *MinVecTy = VectorTy; 6411 if (I->getOpcode() == Instruction::Trunc) { 6412 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6413 VectorTy = 6414 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6415 } else if (I->getOpcode() == Instruction::ZExt || 6416 I->getOpcode() == Instruction::SExt) { 6417 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6418 VectorTy = 6419 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6420 } 6421 } 6422 6423 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6424 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, 6425 CostKind, I); 6426 } 6427 case Instruction::Call: { 6428 bool NeedToScalarize; 6429 CallInst *CI = cast<CallInst>(I); 6430 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6431 if (getVectorIntrinsicIDForCall(CI, TLI)) 6432 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6433 return CallCost; 6434 } 6435 default: 6436 // The cost of executing VF copies of the scalar instruction. This opcode 6437 // is unknown. Assume that it is the same as 'mul'. 6438 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 6439 CostKind) + 6440 getScalarizationOverhead(I, VF); 6441 } // end of switch. 6442 } 6443 6444 char LoopVectorize::ID = 0; 6445 6446 static const char lv_name[] = "Loop Vectorization"; 6447 6448 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6449 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6450 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6451 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6452 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6453 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6454 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6455 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6456 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6457 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6458 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6459 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6460 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6461 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6462 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6463 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6464 6465 namespace llvm { 6466 6467 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6468 6469 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6470 bool VectorizeOnlyWhenForced) { 6471 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6472 } 6473 6474 } // end namespace llvm 6475 6476 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6477 // Check if the pointer operand of a load or store instruction is 6478 // consecutive. 6479 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6480 return Legal->isConsecutivePtr(Ptr); 6481 return false; 6482 } 6483 6484 void LoopVectorizationCostModel::collectValuesToIgnore() { 6485 // Ignore ephemeral values. 6486 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6487 6488 // Ignore type-promoting instructions we identified during reduction 6489 // detection. 6490 for (auto &Reduction : Legal->getReductionVars()) { 6491 RecurrenceDescriptor &RedDes = Reduction.second; 6492 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6493 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6494 } 6495 // Ignore type-casting instructions we identified during induction 6496 // detection. 6497 for (auto &Induction : Legal->getInductionVars()) { 6498 InductionDescriptor &IndDes = Induction.second; 6499 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6500 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6501 } 6502 } 6503 6504 // TODO: we could return a pair of values that specify the max VF and 6505 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6506 // `buildVPlans(VF, VF)`. 
We cannot do it yet because VPlan
// does not have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
                                                             unsigned UserIC) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all
      // decisions based on them, which includes widening decisions and
      // uniform and scalar values.
      CM.invalidateCostModelingDecisions();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
6582 CM.selectUserVectorizationFactor(UserVF); 6583 buildVPlansWithVPRecipes(UserVF, UserVF); 6584 LLVM_DEBUG(printPlans(dbgs())); 6585 return {{UserVF, 0}}; 6586 } 6587 6588 unsigned MaxVF = MaybeMaxVF.getValue(); 6589 assert(MaxVF != 0 && "MaxVF is zero."); 6590 6591 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6592 // Collect Uniform and Scalar instructions after vectorization with VF. 6593 CM.collectUniformsAndScalars(VF); 6594 6595 // Collect the instructions (and their associated costs) that will be more 6596 // profitable to scalarize. 6597 if (VF > 1) 6598 CM.collectInstsToScalarize(VF); 6599 } 6600 6601 buildVPlansWithVPRecipes(1, MaxVF); 6602 LLVM_DEBUG(printPlans(dbgs())); 6603 if (MaxVF == 1) 6604 return VectorizationFactor::Disabled(); 6605 6606 // Select the optimal vectorization factor. 6607 return CM.selectVectorizationFactor(MaxVF); 6608 } 6609 6610 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6611 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6612 << '\n'); 6613 BestVF = VF; 6614 BestUF = UF; 6615 6616 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6617 return !Plan->hasVF(VF); 6618 }); 6619 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6620 } 6621 6622 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6623 DominatorTree *DT) { 6624 // Perform the actual loop transformation. 6625 6626 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6627 VPCallbackILV CallbackILV(ILV); 6628 6629 VPTransformState State{BestVF, BestUF, LI, 6630 DT, ILV.Builder, ILV.VectorLoopValueMap, 6631 &ILV, CallbackILV}; 6632 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6633 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6634 State.CanonicalIV = ILV.Induction; 6635 6636 //===------------------------------------------------===// 6637 // 6638 // Notice: any optimization or new instruction that go 6639 // into the code below should also be implemented in 6640 // the cost-model. 6641 // 6642 //===------------------------------------------------===// 6643 6644 // 2. Copy and widen instructions from the old loop into the new loop. 6645 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6646 VPlans.front()->execute(&State); 6647 6648 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6649 // predication, updating analyses. 6650 ILV.fixVectorizedLoop(); 6651 } 6652 6653 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6654 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6655 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6656 6657 // We create new control-flow for the vectorized loop, so the original 6658 // condition will be dead after vectorization if it's only used by the 6659 // branch. 6660 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6661 if (Cmp && Cmp->hasOneUse()) 6662 DeadInstructions.insert(Cmp); 6663 6664 // We create new "steps" for induction variable updates to which the original 6665 // induction variables map. An original update instruction will be dead if 6666 // all its users except the induction variable are dead. 
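  // Illustrative example (hypothetical IR, added for exposition): for
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // %iv.next is recorded as dead if its only users are the phi itself and
  // instructions (such as the latch compare) that are already known dead.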
6667 for (auto &Induction : Legal->getInductionVars()) { 6668 PHINode *Ind = Induction.first; 6669 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6670 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6671 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6672 DeadInstructions.end(); 6673 })) 6674 DeadInstructions.insert(IndUpdate); 6675 6676 // We record as "Dead" also the type-casting instructions we had identified 6677 // during induction analysis. We don't need any handling for them in the 6678 // vectorized loop because we have proven that, under a proper runtime 6679 // test guarding the vectorized loop, the value of the phi, and the casted 6680 // value of the phi, are the same. The last instruction in this casting chain 6681 // will get its scalar/vector/widened def from the scalar/vector/widened def 6682 // of the respective phi node. Any other casts in the induction def-use chain 6683 // have no other uses outside the phi update chain, and will be ignored. 6684 InductionDescriptor &IndDes = Induction.second; 6685 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6686 DeadInstructions.insert(Casts.begin(), Casts.end()); 6687 } 6688 } 6689 6690 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6691 6692 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6693 6694 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6695 Instruction::BinaryOps BinOp) { 6696 // When unrolling and the VF is 1, we only need to add a simple scalar. 6697 Type *Ty = Val->getType(); 6698 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6699 6700 if (Ty->isFloatingPointTy()) { 6701 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6702 6703 // Floating point operations had to be 'fast' to enable the unrolling. 6704 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6705 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6706 } 6707 Constant *C = ConstantInt::get(Ty, StartIdx); 6708 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6709 } 6710 6711 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6712 SmallVector<Metadata *, 4> MDs; 6713 // Reserve first location for self reference to the LoopID metadata node. 6714 MDs.push_back(nullptr); 6715 bool IsUnrollMetadata = false; 6716 MDNode *LoopID = L->getLoopID(); 6717 if (LoopID) { 6718 // First find existing loop unrolling disable metadata. 6719 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6720 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6721 if (MD) { 6722 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6723 IsUnrollMetadata = 6724 S && S->getString().startswith("llvm.loop.unroll.disable"); 6725 } 6726 MDs.push_back(LoopID->getOperand(i)); 6727 } 6728 } 6729 6730 if (!IsUnrollMetadata) { 6731 // Add runtime unroll disable metadata. 6732 LLVMContext &Context = L->getHeader()->getContext(); 6733 SmallVector<Metadata *, 1> DisableOperands; 6734 DisableOperands.push_back( 6735 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6736 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6737 MDs.push_back(DisableNode); 6738 MDNode *NewLoopID = MDNode::get(Context, MDs); 6739 // Set operand 0 to refer to the loop id itself. 
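    // After the self-reference below is patched in, the metadata is expected
    // to look roughly like (illustrative):
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}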
6740 NewLoopID->replaceOperandWith(0, NewLoopID); 6741 L->setLoopID(NewLoopID); 6742 } 6743 } 6744 6745 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6746 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6747 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6748 bool PredicateAtRangeStart = Predicate(Range.Start); 6749 6750 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6751 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6752 Range.End = TmpVF; 6753 break; 6754 } 6755 6756 return PredicateAtRangeStart; 6757 } 6758 6759 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6760 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6761 /// of VF's starting at a given VF and extending it as much as possible. Each 6762 /// vectorization decision can potentially shorten this sub-range during 6763 /// buildVPlan(). 6764 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6765 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6766 VFRange SubRange = {VF, MaxVF + 1}; 6767 VPlans.push_back(buildVPlan(SubRange)); 6768 VF = SubRange.End; 6769 } 6770 } 6771 6772 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6773 VPlanPtr &Plan) { 6774 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6775 6776 // Look for cached value. 6777 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6778 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6779 if (ECEntryIt != EdgeMaskCache.end()) 6780 return ECEntryIt->second; 6781 6782 VPValue *SrcMask = createBlockInMask(Src, Plan); 6783 6784 // The terminator has to be a branch inst! 6785 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6786 assert(BI && "Unexpected terminator found"); 6787 6788 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6789 return EdgeMaskCache[Edge] = SrcMask; 6790 6791 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6792 assert(EdgeMask && "No Edge Mask found for condition"); 6793 6794 if (BI->getSuccessor(0) != Dst) 6795 EdgeMask = Builder.createNot(EdgeMask); 6796 6797 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6798 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6799 6800 return EdgeMaskCache[Edge] = EdgeMask; 6801 } 6802 6803 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6804 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6805 6806 // Look for cached value. 6807 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6808 if (BCEntryIt != BlockMaskCache.end()) 6809 return BCEntryIt->second; 6810 6811 // All-one mask is modelled as no-mask following the convention for masked 6812 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6813 VPValue *BlockMask = nullptr; 6814 6815 if (OrigLoop->getHeader() == BB) { 6816 if (!CM.blockNeedsPredication(BB)) 6817 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6818 6819 // Introduce the early-exit compare IV <= BTC to form header block mask. 6820 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6821 // Start by constructing the desired canonical IV. 
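    // Illustrative example (hypothetical trip count, added for exposition):
    // with a trip count of 7 and VF == 4, BTC is 6; the second vector
    // iteration compares IV = {4, 5, 6, 7} against 6 and yields the mask
    // {1, 1, 1, 0}, disabling the single out-of-range lane.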
6822 VPValue *IV = nullptr; 6823 if (Legal->getPrimaryInduction()) 6824 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6825 else { 6826 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 6827 Builder.getInsertBlock()->appendRecipe(IVRecipe); 6828 IV = IVRecipe->getVPValue(); 6829 } 6830 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6831 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6832 return BlockMaskCache[BB] = BlockMask; 6833 } 6834 6835 // This is the block mask. We OR all incoming edges. 6836 for (auto *Predecessor : predecessors(BB)) { 6837 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6838 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6839 return BlockMaskCache[BB] = EdgeMask; 6840 6841 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6842 BlockMask = EdgeMask; 6843 continue; 6844 } 6845 6846 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6847 } 6848 6849 return BlockMaskCache[BB] = BlockMask; 6850 } 6851 6852 VPWidenMemoryInstructionRecipe * 6853 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6854 VPlanPtr &Plan) { 6855 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6856 "Must be called with either a load or store"); 6857 6858 auto willWiden = [&](unsigned VF) -> bool { 6859 if (VF == 1) 6860 return false; 6861 LoopVectorizationCostModel::InstWidening Decision = 6862 CM.getWideningDecision(I, VF); 6863 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6864 "CM decision should be taken at this point."); 6865 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6866 return true; 6867 if (CM.isScalarAfterVectorization(I, VF) || 6868 CM.isProfitableToScalarize(I, VF)) 6869 return false; 6870 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6871 }; 6872 6873 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6874 return nullptr; 6875 6876 VPValue *Mask = nullptr; 6877 if (Legal->isMaskRequired(I)) 6878 Mask = createBlockInMask(I->getParent(), Plan); 6879 6880 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6881 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6882 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6883 6884 StoreInst *Store = cast<StoreInst>(I); 6885 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6886 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6887 } 6888 6889 VPWidenIntOrFpInductionRecipe * 6890 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 6891 // Check if this is an integer or fp induction. If so, build the recipe that 6892 // produces its scalar and vector values. 6893 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6894 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6895 II.getKind() == InductionDescriptor::IK_FpInduction) 6896 return new VPWidenIntOrFpInductionRecipe(Phi); 6897 6898 return nullptr; 6899 } 6900 6901 VPWidenIntOrFpInductionRecipe * 6902 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 6903 VFRange &Range) const { 6904 // Optimize the special case where the source is a constant integer 6905 // induction variable. Notice that we can only optimize the 'trunc' case 6906 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6907 // (c) other casts depend on pointer size. 6908 6909 // Determine whether \p K is a truncation based on an induction variable that 6910 // can be optimized. 
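  // Illustrative example (hypothetical IR, added for exposition): for
  //   %iv = phi i64 ...        ; integer induction with a constant step
  //   %t  = trunc i64 %iv to i32
  // the truncate can be folded into the widened induction, i.e. the vector
  // induction is generated directly in the narrower i32 type.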
6911 auto isOptimizableIVTruncate = 6912 [&](Instruction *K) -> std::function<bool(unsigned)> { 6913 return 6914 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6915 }; 6916 6917 if (LoopVectorizationPlanner::getDecisionAndClampRange( 6918 isOptimizableIVTruncate(I), Range)) 6919 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6920 I); 6921 return nullptr; 6922 } 6923 6924 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 6925 // We know that all PHIs in non-header blocks are converted into selects, so 6926 // we don't have to worry about the insertion order and we can just use the 6927 // builder. At this point we generate the predication tree. There may be 6928 // duplications since this is a simple recursive scan, but future 6929 // optimizations will clean it up. 6930 6931 SmallVector<VPValue *, 2> Operands; 6932 unsigned NumIncoming = Phi->getNumIncomingValues(); 6933 for (unsigned In = 0; In < NumIncoming; In++) { 6934 VPValue *EdgeMask = 6935 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6936 assert((EdgeMask || NumIncoming == 1) && 6937 "Multiple predecessors with one having a full mask"); 6938 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 6939 if (EdgeMask) 6940 Operands.push_back(EdgeMask); 6941 } 6942 return new VPBlendRecipe(Phi, Operands); 6943 } 6944 6945 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 6946 VPlan &Plan) const { 6947 6948 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6949 [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, 6950 Range); 6951 6952 if (IsPredicated) 6953 return nullptr; 6954 6955 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6956 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6957 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6958 return nullptr; 6959 6960 auto willWiden = [&](unsigned VF) -> bool { 6961 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6962 // The following case may be scalarized depending on the VF. 6963 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6964 // version of the instruction. 6965 // Is it beneficial to perform intrinsic call compared to lib call? 6966 bool NeedToScalarize = false; 6967 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6968 bool UseVectorIntrinsic = 6969 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6970 return UseVectorIntrinsic || !NeedToScalarize; 6971 }; 6972 6973 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6974 return nullptr; 6975 6976 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 6977 } 6978 6979 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 6980 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 6981 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 6982 // Instruction should be widened, unless it is scalar after vectorization, 6983 // scalarization is profitable or it is predicated. 
6984 auto WillScalarize = [this, I](unsigned VF) -> bool { 6985 return CM.isScalarAfterVectorization(I, VF) || 6986 CM.isProfitableToScalarize(I, VF) || 6987 CM.isScalarWithPredication(I, VF); 6988 }; 6989 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 6990 Range); 6991 } 6992 6993 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 6994 auto IsVectorizableOpcode = [](unsigned Opcode) { 6995 switch (Opcode) { 6996 case Instruction::Add: 6997 case Instruction::And: 6998 case Instruction::AShr: 6999 case Instruction::BitCast: 7000 case Instruction::FAdd: 7001 case Instruction::FCmp: 7002 case Instruction::FDiv: 7003 case Instruction::FMul: 7004 case Instruction::FNeg: 7005 case Instruction::FPExt: 7006 case Instruction::FPToSI: 7007 case Instruction::FPToUI: 7008 case Instruction::FPTrunc: 7009 case Instruction::FRem: 7010 case Instruction::FSub: 7011 case Instruction::ICmp: 7012 case Instruction::IntToPtr: 7013 case Instruction::LShr: 7014 case Instruction::Mul: 7015 case Instruction::Or: 7016 case Instruction::PtrToInt: 7017 case Instruction::SDiv: 7018 case Instruction::Select: 7019 case Instruction::SExt: 7020 case Instruction::Shl: 7021 case Instruction::SIToFP: 7022 case Instruction::SRem: 7023 case Instruction::Sub: 7024 case Instruction::Trunc: 7025 case Instruction::UDiv: 7026 case Instruction::UIToFP: 7027 case Instruction::URem: 7028 case Instruction::Xor: 7029 case Instruction::ZExt: 7030 return true; 7031 } 7032 return false; 7033 }; 7034 7035 if (!IsVectorizableOpcode(I->getOpcode())) 7036 return nullptr; 7037 7038 // Success: widen this instruction. 7039 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7040 } 7041 7042 VPBasicBlock *VPRecipeBuilder::handleReplication( 7043 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7044 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7045 VPlanPtr &Plan) { 7046 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7047 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7048 Range); 7049 7050 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7051 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7052 7053 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7054 IsUniform, IsPredicated); 7055 setRecipe(I, Recipe); 7056 7057 // Find if I uses a predicated instruction. If so, it will use its scalar 7058 // value. Avoid hoisting the insert-element which packs the scalar value into 7059 // a vector value, as that happens iff all users use the vector value. 7060 for (auto &Op : I->operands()) 7061 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7062 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7063 PredInst2Recipe[PredInst]->setAlsoPack(false); 7064 7065 // Finalize the recipe for Instr, first if it is not predicated. 7066 if (!IsPredicated) { 7067 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7068 VPBB->appendRecipe(Recipe); 7069 return VPBB; 7070 } 7071 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7072 assert(VPBB->getSuccessors().empty() && 7073 "VPBB has successors when handling predicated replication."); 7074 // Record predicated instructions for above packing optimizations. 
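// The predicated instruction is wrapped in its own if-then region below; the
// region is spliced in after VPBB, and a fresh empty VPBasicBlock is returned
// so that recipes for the remaining instructions of the same original basic
// block are placed after the region.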
7075 PredInst2Recipe[I] = Recipe; 7076 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7077 VPBlockUtils::insertBlockAfter(Region, VPBB); 7078 auto *RegSucc = new VPBasicBlock(); 7079 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7080 return RegSucc; 7081 } 7082 7083 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7084 VPRecipeBase *PredRecipe, 7085 VPlanPtr &Plan) { 7086 // Instructions marked for predication are replicated and placed under an 7087 // if-then construct to prevent side-effects. 7088 7089 // Generate recipes to compute the block mask for this region. 7090 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7091 7092 // Build the triangular if-then region. 7093 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7094 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7095 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7096 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7097 auto *PHIRecipe = 7098 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7099 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7100 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7101 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7102 7103 // Note: first set Entry as region entry and then connect successors starting 7104 // from it in order, to propagate the "parent" of each VPBasicBlock. 7105 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7106 VPBlockUtils::connectBlocks(Pred, Exit); 7107 7108 return Region; 7109 } 7110 7111 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7112 VFRange &Range, 7113 VPlanPtr &Plan) { 7114 // First, check for specific widening recipes that deal with calls, memory 7115 // operations, inductions and Phi nodes. 7116 if (auto *CI = dyn_cast<CallInst>(Instr)) 7117 return tryToWidenCall(CI, Range, *Plan); 7118 7119 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7120 return tryToWidenMemory(Instr, Range, Plan); 7121 7122 VPRecipeBase *Recipe; 7123 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7124 if (Phi->getParent() != OrigLoop->getHeader()) 7125 return tryToBlend(Phi, Plan); 7126 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7127 return Recipe; 7128 return new VPWidenPHIRecipe(Phi); 7129 } 7130 7131 if (isa<TruncInst>(Instr) && 7132 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7133 return Recipe; 7134 7135 if (!shouldWiden(Instr, Range)) 7136 return nullptr; 7137 7138 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7139 return new VPWidenGEPRecipe(GEP, OrigLoop); 7140 7141 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7142 bool InvariantCond = 7143 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7144 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7145 InvariantCond); 7146 } 7147 7148 return tryToWiden(Instr, *Plan); 7149 } 7150 7151 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7152 unsigned MaxVF) { 7153 assert(OrigLoop->empty() && "Inner loop expected."); 7154 7155 // Collect conditions feeding internal conditional branches; they need to be 7156 // represented in VPlan for it to model masking. 
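// For example, given an in-loop guard such as 'if (a[i] > 5) b[i] = 0;', the
// compare feeding the conditional branch must get a VPValue so that the edge
// and block masks built by VPRecipeBuilder can refer to it (see
// createEdgeMask above).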
7157 SmallPtrSet<Value *, 1> NeedDef;
7158
7159 auto *Latch = OrigLoop->getLoopLatch();
7160 for (BasicBlock *BB : OrigLoop->blocks()) {
7161 if (BB == Latch)
7162 continue;
7163 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7164 if (Branch && Branch->isConditional())
7165 NeedDef.insert(Branch->getCondition());
7166 }
7167
7168 // If the tail is to be folded by masking, the primary induction variable, if
7169 // it exists, needs to be represented in VPlan for it to model early-exit masking.
7170 // Also, both the Phi and the live-out instruction of each reduction are
7171 // required in order to introduce a select between them in VPlan.
7172 if (CM.foldTailByMasking()) {
7173 if (Legal->getPrimaryInduction())
7174 NeedDef.insert(Legal->getPrimaryInduction());
7175 for (auto &Reduction : Legal->getReductionVars()) {
7176 NeedDef.insert(Reduction.first);
7177 NeedDef.insert(Reduction.second.getLoopExitInstr());
7178 }
7179 }
7180
7181 // Collect instructions from the original loop that will become trivially dead
7182 // in the vectorized loop. We don't need to vectorize these instructions. For
7183 // example, original induction update instructions can become dead because we
7184 // separately emit induction "steps" when generating code for the new loop.
7185 // Similarly, we create a new latch condition when setting up the structure
7186 // of the new loop, so the old one can become dead.
7187 SmallPtrSet<Instruction *, 4> DeadInstructions;
7188 collectTriviallyDeadInstructions(DeadInstructions);
7189
7190 // Add assume instructions we need to drop to DeadInstructions, to prevent
7191 // them from being added to the VPlan.
7192 // TODO: We only need to drop assumes in blocks that get flattened. If the
7193 // control flow is preserved, we should keep them.
7194 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7195 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7196
7197 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7198 // Dead instructions do not need sinking. Remove them from SinkAfter.
7199 for (Instruction *I : DeadInstructions)
7200 SinkAfter.erase(I);
7201
7202 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7203 VFRange SubRange = {VF, MaxVF + 1};
7204 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7205 DeadInstructions, SinkAfter));
7206 VF = SubRange.End;
7207 }
7208 }
7209
7210 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7211 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7212 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7213 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7214
7215 // Hold a mapping from predicated instructions to their recipes, in order to
7216 // fix their AlsoPack behavior if a user is determined to replicate and use a
7217 // scalar instead of a vector value.
7218 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7219
7220 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7221
7222 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7223
7224 // ---------------------------------------------------------------------------
7225 // Pre-construction: record ingredients whose recipes we'll need to further
7226 // process after constructing the initial VPlan.
7227 // ---------------------------------------------------------------------------
7228
7229 // Mark instructions we'll need to sink later and their targets as
7230 // ingredients whose recipe we'll need to record.
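// Recording them now lets the builder look up each ingredient's recipe once
// the initial VPlan is built, so the sink-after transform further down can
// call getRecipe() and moveAfter() on the corresponding recipes.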
7231 for (auto &Entry : SinkAfter) {
7232 RecipeBuilder.recordRecipeOf(Entry.first);
7233 RecipeBuilder.recordRecipeOf(Entry.second);
7234 }
7235
7236 // For each interleave group which is relevant for this (possibly trimmed)
7237 // Range, add it to the set of groups to be later applied to the VPlan and add
7238 // placeholders for its members' Recipes which we'll be replacing with a
7239 // single VPInterleaveRecipe.
7240 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7241 auto applyIG = [IG, this](unsigned VF) -> bool {
7242 return (VF >= 2 && // Query is illegal for VF == 1
7243 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7244 LoopVectorizationCostModel::CM_Interleave);
7245 };
7246 if (!getDecisionAndClampRange(applyIG, Range))
7247 continue;
7248 InterleaveGroups.insert(IG);
7249 for (unsigned i = 0; i < IG->getFactor(); i++)
7250 if (Instruction *Member = IG->getMember(i))
7251 RecipeBuilder.recordRecipeOf(Member);
7252 }
7253
7254 // ---------------------------------------------------------------------------
7255 // Build initial VPlan: Scan the body of the loop in a topological order to
7256 // visit each basic block after having visited its predecessor basic blocks.
7257 // ---------------------------------------------------------------------------
7258
7259 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7260 auto Plan = std::make_unique<VPlan>();
7261 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7262 Plan->setEntry(VPBB);
7263
7264 // Represent values that will have defs inside VPlan.
7265 for (Value *V : NeedDef)
7266 Plan->addVPValue(V);
7267
7268 // Scan the body of the loop in a topological order to visit each basic block
7269 // after having visited its predecessor basic blocks.
7270 LoopBlocksDFS DFS(OrigLoop);
7271 DFS.perform(LI);
7272
7273 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7274 // Relevant instructions from basic block BB will be grouped into VPRecipe
7275 // ingredients and fill a new VPBasicBlock.
7276 unsigned VPBBsForBB = 0;
7277 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7278 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7279 VPBB = FirstVPBBForBB;
7280 Builder.setInsertPoint(VPBB);
7281
7282 // Introduce each ingredient into VPlan.
7283 // TODO: Model and preserve debug intrinsics in VPlan.
7284 for (Instruction &I : BB->instructionsWithoutDebug()) {
7285 Instruction *Instr = &I;
7286
7287 // First filter out irrelevant instructions, to ensure no recipes are
7288 // built for them.
7289 if (isa<BranchInst>(Instr) ||
7290 DeadInstructions.find(Instr) != DeadInstructions.end())
7291 continue;
7292
7293 if (auto Recipe =
7294 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7295 RecipeBuilder.setRecipe(Instr, Recipe);
7296 VPBB->appendRecipe(Recipe);
7297 continue;
7298 }
7299
7300 // Otherwise, if all widening options failed, the instruction is to be
7301 // replicated. This may create a successor for VPBB.
7302 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7303 Instr, Range, VPBB, PredInst2Recipe, Plan);
7304 if (NextVPBB != VPBB) {
7305 VPBB = NextVPBB;
7306 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7307 : "");
7308 }
7309 }
7310 }
7311
7312 // Discard the empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7313 // may also be empty, such as the last one, VPBB, reflecting original
7314 // basic blocks with no recipes.
7315 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7316 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7317 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7318 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7319 delete PreEntry; 7320 7321 // --------------------------------------------------------------------------- 7322 // Transform initial VPlan: Apply previously taken decisions, in order, to 7323 // bring the VPlan to its final state. 7324 // --------------------------------------------------------------------------- 7325 7326 // Apply Sink-After legal constraints. 7327 for (auto &Entry : SinkAfter) { 7328 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7329 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7330 Sink->moveAfter(Target); 7331 } 7332 7333 // Interleave memory: for each Interleave Group we marked earlier as relevant 7334 // for this VPlan, replace the Recipes widening its memory instructions with a 7335 // single VPInterleaveRecipe at its insertion point. 7336 for (auto IG : InterleaveGroups) { 7337 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7338 RecipeBuilder.getRecipe(IG->getInsertPos())); 7339 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7340 ->insertBefore(Recipe); 7341 7342 for (unsigned i = 0; i < IG->getFactor(); ++i) 7343 if (Instruction *Member = IG->getMember(i)) { 7344 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7345 } 7346 } 7347 7348 // Finally, if tail is folded by masking, introduce selects between the phi 7349 // and the live-out instruction of each reduction, at the end of the latch. 7350 if (CM.foldTailByMasking()) { 7351 Builder.setInsertPoint(VPBB); 7352 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7353 for (auto &Reduction : Legal->getReductionVars()) { 7354 VPValue *Phi = Plan->getVPValue(Reduction.first); 7355 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7356 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7357 } 7358 } 7359 7360 std::string PlanName; 7361 raw_string_ostream RSO(PlanName); 7362 unsigned VF = Range.Start; 7363 Plan->addVF(VF); 7364 RSO << "Initial VPlan for VF={" << VF; 7365 for (VF *= 2; VF < Range.End; VF *= 2) { 7366 Plan->addVF(VF); 7367 RSO << "," << VF; 7368 } 7369 RSO << "},UF>=1"; 7370 RSO.flush(); 7371 Plan->setName(PlanName); 7372 7373 return Plan; 7374 } 7375 7376 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7377 // Outer loop handling: They may require CFG and instruction level 7378 // transformations before even evaluating whether vectorization is profitable. 7379 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7380 // the vectorization pipeline. 7381 assert(!OrigLoop->empty()); 7382 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7383 7384 // Create new empty VPlan 7385 auto Plan = std::make_unique<VPlan>(); 7386 7387 // Build hierarchical CFG 7388 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7389 HCFGBuilder.buildHierarchicalCFG(); 7390 7391 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7392 Plan->addVF(VF); 7393 7394 if (EnableVPlanPredication) { 7395 VPlanPredicator VPP(*Plan); 7396 VPP.predicate(); 7397 7398 // Avoid running transformation to recipes until masked code generation in 7399 // VPlan-native path is in place. 
7400 return Plan; 7401 } 7402 7403 SmallPtrSet<Instruction *, 1> DeadInstructions; 7404 VPlanTransforms::VPInstructionsToVPRecipes( 7405 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7406 return Plan; 7407 } 7408 7409 Value* LoopVectorizationPlanner::VPCallbackILV:: 7410 getOrCreateVectorValues(Value *V, unsigned Part) { 7411 return ILV.getOrCreateVectorValue(V, Part); 7412 } 7413 7414 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7415 Value *V, const VPIteration &Instance) { 7416 return ILV.getOrCreateScalarValue(V, Instance); 7417 } 7418 7419 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7420 VPSlotTracker &SlotTracker) const { 7421 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7422 IG->getInsertPos()->printAsOperand(O, false); 7423 O << ", "; 7424 getAddr()->printAsOperand(O, SlotTracker); 7425 VPValue *Mask = getMask(); 7426 if (Mask) { 7427 O << ", "; 7428 Mask->printAsOperand(O, SlotTracker); 7429 } 7430 for (unsigned i = 0; i < IG->getFactor(); ++i) 7431 if (Instruction *I = IG->getMember(i)) 7432 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7433 } 7434 7435 void VPWidenCallRecipe::execute(VPTransformState &State) { 7436 State.ILV->widenCallInstruction(Ingredient, User, State); 7437 } 7438 7439 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7440 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7441 } 7442 7443 void VPWidenRecipe::execute(VPTransformState &State) { 7444 State.ILV->widenInstruction(Ingredient, User, State); 7445 } 7446 7447 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7448 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7449 IsIndexLoopInvariant); 7450 } 7451 7452 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7453 assert(!State.Instance && "Int or FP induction being replicated."); 7454 State.ILV->widenIntOrFpInduction(IV, Trunc); 7455 } 7456 7457 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7458 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7459 } 7460 7461 void VPBlendRecipe::execute(VPTransformState &State) { 7462 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7463 // We know that all PHIs in non-header blocks are converted into 7464 // selects, so we don't have to worry about the insertion order and we 7465 // can just use the builder. 7466 // At this point we generate the predication tree. There may be 7467 // duplications since this is a simple recursive scan, but future 7468 // optimizations will clean it up. 7469 7470 unsigned NumIncoming = getNumIncomingValues(); 7471 7472 // Generate a sequence of selects of the form: 7473 // SELECT(Mask3, In3, 7474 // SELECT(Mask2, In2, 7475 // SELECT(Mask1, In1, 7476 // In0))) 7477 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7478 // are essentially undef are taken from In0. 7479 InnerLoopVectorizer::VectorParts Entry(State.UF); 7480 for (unsigned In = 0; In < NumIncoming; ++In) { 7481 for (unsigned Part = 0; Part < State.UF; ++Part) { 7482 // We might have single edge PHIs (blocks) - use an identity 7483 // 'select' for the first PHI operand. 7484 Value *In0 = State.get(getIncomingValue(In), Part); 7485 if (In == 0) 7486 Entry[Part] = In0; // Initialize with the first incoming value. 7487 else { 7488 // Select between the current value and the previous incoming edge 7489 // based on the incoming mask. 
7490 Value *Cond = State.get(getMask(In), Part);
7491 Entry[Part] =
7492 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7493 }
7494 }
7495 }
7496 for (unsigned Part = 0; Part < State.UF; ++Part)
7497 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7498 }
7499
7500 void VPInterleaveRecipe::execute(VPTransformState &State) {
7501 assert(!State.Instance && "Interleave group being replicated.");
7502 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7503 }
7504
7505 void VPReplicateRecipe::execute(VPTransformState &State) {
7506 if (State.Instance) { // Generate a single instance.
7507 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7508 IsPredicated, State);
7509 // Insert scalar instance packing it into a vector.
7510 if (AlsoPack && State.VF > 1) {
7511 // If we're constructing lane 0, initialize to start from undef.
7512 if (State.Instance->Lane == 0) {
7513 Value *Undef =
7514 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7515 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7516 }
7517 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7518 }
7519 return;
7520 }
7521
7522 // Generate scalar instances for all VF lanes of all UF parts, unless the
7523 // instruction is uniform, in which case generate only the first lane for each
7524 // of the UF parts.
7525 unsigned EndLane = IsUniform ? 1 : State.VF;
7526 for (unsigned Part = 0; Part < State.UF; ++Part)
7527 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7528 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7529 IsPredicated, State);
7530 }
7531
7532 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7533 assert(State.Instance && "Branch on Mask works only on single instance.");
7534
7535 unsigned Part = State.Instance->Part;
7536 unsigned Lane = State.Instance->Lane;
7537
7538 Value *ConditionBit = nullptr;
7539 VPValue *BlockInMask = getMask();
7540 if (BlockInMask) {
7541 ConditionBit = State.get(BlockInMask, Part);
7542 if (ConditionBit->getType()->isVectorTy())
7543 ConditionBit = State.Builder.CreateExtractElement(
7544 ConditionBit, State.Builder.getInt32(Lane));
7545 } else // Block in mask is all-one.
7546 ConditionBit = State.Builder.getTrue();
7547
7548 // Replace the temporary unreachable terminator with a new conditional branch,
7549 // whose two destinations will be set later when they are created.
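// Note that the branch is created below with PrevBB only as a temporary
// placeholder for the first successor, which is reset to null right away;
// both real destinations are filled in later, once the corresponding
// successor blocks exist.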
7550 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7551 assert(isa<UnreachableInst>(CurrentTerminator) && 7552 "Expected to replace unreachable terminator with conditional branch."); 7553 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7554 CondBr->setSuccessor(0, nullptr); 7555 ReplaceInstWithInst(CurrentTerminator, CondBr); 7556 } 7557 7558 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7559 assert(State.Instance && "Predicated instruction PHI works per instance."); 7560 Instruction *ScalarPredInst = cast<Instruction>( 7561 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7562 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7563 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7564 assert(PredicatingBB && "Predicated block has no single predecessor."); 7565 7566 // By current pack/unpack logic we need to generate only a single phi node: if 7567 // a vector value for the predicated instruction exists at this point it means 7568 // the instruction has vector users only, and a phi for the vector value is 7569 // needed. In this case the recipe of the predicated instruction is marked to 7570 // also do that packing, thereby "hoisting" the insert-element sequence. 7571 // Otherwise, a phi node for the scalar value is needed. 7572 unsigned Part = State.Instance->Part; 7573 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7574 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7575 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7576 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7577 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7578 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7579 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7580 } else { 7581 Type *PredInstType = PredInst->getType(); 7582 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7583 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7584 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7585 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7586 } 7587 } 7588 7589 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7590 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7591 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7592 getMask()); 7593 } 7594 7595 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7596 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7597 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7598 // for predication. 7599 static ScalarEpilogueLowering getScalarEpilogueLowering( 7600 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7601 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7602 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7603 LoopVectorizationLegality &LVL) { 7604 bool OptSize = 7605 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7606 PGSOQueryType::IRPass); 7607 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7608 // don't look at hints or options, and don't request a scalar epilogue. 
7609 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7610 return CM_ScalarEpilogueNotAllowedOptSize;
7611
7612 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7613 !PreferPredicateOverEpilog;
7614
7615 // 2) Next, if disabling predication is requested on the command line, honour
7616 // this and request a scalar epilogue.
7617 if (PredicateOptDisabled)
7618 return CM_ScalarEpilogueAllowed;
7619
7620 // 3) and 4) Check if enabling predication is requested on the command line or
7621 // with a loop hint, or if the TTI hook indicates this is profitable; if so,
7622 // request predication.
7623 if (PreferPredicateOverEpilog ||
7624 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7625 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7626 LVL.getLAI()) &&
7627 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7628 return CM_ScalarEpilogueNotNeededUsePredicate;
7629
7630 return CM_ScalarEpilogueAllowed;
7631 }
7632
7633 // Process the loop in the VPlan-native vectorization path. This path builds
7634 // VPlan upfront in the vectorization pipeline, which allows applying
7635 // VPlan-to-VPlan transformations from the very beginning without modifying the
7636 // input LLVM IR.
7637 static bool processLoopInVPlanNativePath(
7638 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7639 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7640 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7641 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7642 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7643
7644 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7645 Function *F = L->getHeader()->getParent();
7646 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7647
7648 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7649 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7650
7651 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7652 &Hints, IAI);
7653 // Use the planner for outer loop vectorization.
7654 // TODO: CM is not used at this point inside the planner. Turn CM into an
7655 // optional argument if we don't need it in the future.
7656 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7657
7658 // Get the user vectorization factor.
7659 const unsigned UserVF = Hints.getWidth();
7660
7661 // Plan how to best vectorize, return the best VF and its cost.
7662 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7663
7664 // If we are stress testing VPlan builds, do not attempt to generate vector
7665 // code. Masked vector code generation support will follow soon.
7666 // Also, do not attempt to vectorize if no vector code will be produced.
7667 if (VPlanBuildStressTest || EnableVPlanPredication ||
7668 VectorizationFactor::Disabled() == VF)
7669 return false;
7670
7671 LVP.setBestPlan(VF.Width, 1);
7672
7673 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7674 &CM);
7675 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7676 << L->getHeader()->getParent()->getName() << "\"\n");
7677 LVP.executePlan(LB, DT);
7678
7679 // Mark the loop as already vectorized to avoid vectorizing again.
7680 Hints.setAlreadyVectorized(); 7681 7682 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 7683 return true; 7684 } 7685 7686 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 7687 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 7688 !EnableLoopInterleaving), 7689 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 7690 !EnableLoopVectorization) {} 7691 7692 bool LoopVectorizePass::processLoop(Loop *L) { 7693 assert((EnableVPlanNativePath || L->empty()) && 7694 "VPlan-native path is not enabled. Only process inner loops."); 7695 7696 #ifndef NDEBUG 7697 const std::string DebugLocStr = getDebugLocString(L); 7698 #endif /* NDEBUG */ 7699 7700 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7701 << L->getHeader()->getParent()->getName() << "\" from " 7702 << DebugLocStr << "\n"); 7703 7704 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7705 7706 LLVM_DEBUG( 7707 dbgs() << "LV: Loop hints:" 7708 << " force=" 7709 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7710 ? "disabled" 7711 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7712 ? "enabled" 7713 : "?")) 7714 << " width=" << Hints.getWidth() 7715 << " unroll=" << Hints.getInterleave() << "\n"); 7716 7717 // Function containing loop 7718 Function *F = L->getHeader()->getParent(); 7719 7720 // Looking at the diagnostic output is the only way to determine if a loop 7721 // was vectorized (other than looking at the IR or machine code), so it 7722 // is important to generate an optimization remark for each loop. Most of 7723 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7724 // generated as OptimizationRemark and OptimizationRemarkMissed are 7725 // less verbose reporting vectorized loops and unvectorized loops that may 7726 // benefit from vectorization, respectively. 7727 7728 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7729 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7730 return false; 7731 } 7732 7733 PredicatedScalarEvolution PSE(*SE, *L); 7734 7735 // Check if it is legal to vectorize the loop. 7736 LoopVectorizationRequirements Requirements(*ORE); 7737 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7738 &Requirements, &Hints, DB, AC); 7739 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7740 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7741 Hints.emitRemarkWithHints(); 7742 return false; 7743 } 7744 7745 // Check the function attributes and profiles to find out if this function 7746 // should be optimized for size. 7747 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7748 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7749 7750 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7751 // here. They may require CFG and instruction level transformations before 7752 // even evaluating whether vectorization is profitable. Since we cannot modify 7753 // the incoming IR, we need to build VPlan upfront in the vectorization 7754 // pipeline. 7755 if (!L->empty()) 7756 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7757 ORE, BFI, PSI, Hints); 7758 7759 assert(L->empty() && "Inner loop expected."); 7760 7761 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7762 // count by optimizing for size, to minimize overheads. 
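// For loops whose best-known trip count falls below the threshold, the scalar
// epilogue is disallowed below (CM_ScalarEpilogueNotAllowedLowTripLoop), since
// scalar-iteration overheads would consume most of the benefit, unless
// vectorization has been explicitly forced.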
7763 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7764 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7765 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7766 << "This loop is worth vectorizing only if no scalar " 7767 << "iteration overheads are incurred."); 7768 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7769 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7770 else { 7771 LLVM_DEBUG(dbgs() << "\n"); 7772 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7773 } 7774 } 7775 7776 // Check the function attributes to see if implicit floats are allowed. 7777 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7778 // an integer loop and the vector instructions selected are purely integer 7779 // vector instructions? 7780 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7781 reportVectorizationFailure( 7782 "Can't vectorize when the NoImplicitFloat attribute is used", 7783 "loop not vectorized due to NoImplicitFloat attribute", 7784 "NoImplicitFloat", ORE, L); 7785 Hints.emitRemarkWithHints(); 7786 return false; 7787 } 7788 7789 // Check if the target supports potentially unsafe FP vectorization. 7790 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7791 // for the target we're vectorizing for, to make sure none of the 7792 // additional fp-math flags can help. 7793 if (Hints.isPotentiallyUnsafe() && 7794 TTI->isFPVectorizationPotentiallyUnsafe()) { 7795 reportVectorizationFailure( 7796 "Potentially unsafe FP op prevents vectorization", 7797 "loop not vectorized due to unsafe FP support.", 7798 "UnsafeFP", ORE, L); 7799 Hints.emitRemarkWithHints(); 7800 return false; 7801 } 7802 7803 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7804 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7805 7806 // If an override option has been passed in for interleaved accesses, use it. 7807 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7808 UseInterleaved = EnableInterleavedMemAccesses; 7809 7810 // Analyze interleaved memory accesses. 7811 if (UseInterleaved) { 7812 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7813 } 7814 7815 // Use the cost model. 7816 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7817 F, &Hints, IAI); 7818 CM.collectValuesToIgnore(); 7819 7820 // Use the planner for vectorization. 7821 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 7822 7823 // Get user vectorization factor and interleave count. 7824 unsigned UserVF = Hints.getWidth(); 7825 unsigned UserIC = Hints.getInterleave(); 7826 7827 // Plan how to best vectorize, return the best VF and its cost. 7828 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 7829 7830 VectorizationFactor VF = VectorizationFactor::Disabled(); 7831 unsigned IC = 1; 7832 7833 if (MaybeVF) { 7834 VF = *MaybeVF; 7835 // Select the interleave count. 7836 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7837 } 7838 7839 // Identify the diagnostic messages that should be produced. 
7840 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7841 bool VectorizeLoop = true, InterleaveLoop = true;
7842 if (Requirements.doesNotMeet(F, L, Hints)) {
7843 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7844 "requirements.\n");
7845 Hints.emitRemarkWithHints();
7846 return false;
7847 }
7848
7849 if (VF.Width == 1) {
7850 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7851 VecDiagMsg = std::make_pair(
7852 "VectorizationNotBeneficial",
7853 "the cost-model indicates that vectorization is not beneficial");
7854 VectorizeLoop = false;
7855 }
7856
7857 if (!MaybeVF && UserIC > 1) {
7858 // Tell the user interleaving was avoided up-front, despite being explicitly
7859 // requested.
7860 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7861 "interleaving should be avoided up front\n");
7862 IntDiagMsg = std::make_pair(
7863 "InterleavingAvoided",
7864 "Ignoring UserIC, because interleaving was avoided up front");
7865 InterleaveLoop = false;
7866 } else if (IC == 1 && UserIC <= 1) {
7867 // Tell the user interleaving is not beneficial.
7868 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7869 IntDiagMsg = std::make_pair(
7870 "InterleavingNotBeneficial",
7871 "the cost-model indicates that interleaving is not beneficial");
7872 InterleaveLoop = false;
7873 if (UserIC == 1) {
7874 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7875 IntDiagMsg.second +=
7876 " and is explicitly disabled or interleave count is set to 1";
7877 }
7878 } else if (IC > 1 && UserIC == 1) {
7879 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7880 LLVM_DEBUG(
7881 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7882 IntDiagMsg = std::make_pair(
7883 "InterleavingBeneficialButDisabled",
7884 "the cost-model indicates that interleaving is beneficial "
7885 "but is explicitly disabled or interleave count is set to 1");
7886 InterleaveLoop = false;
7887 }
7888
7889 // Override IC if the user provided an interleave count.
7890 IC = UserIC > 0 ? UserIC : IC;
7891
7892 // Emit diagnostic messages, if any.
7893 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7894 if (!VectorizeLoop && !InterleaveLoop) {
7895 // Do not vectorize or interleave the loop.
7896 ORE->emit([&]() { 7897 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7898 L->getStartLoc(), L->getHeader()) 7899 << VecDiagMsg.second; 7900 }); 7901 ORE->emit([&]() { 7902 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7903 L->getStartLoc(), L->getHeader()) 7904 << IntDiagMsg.second; 7905 }); 7906 return false; 7907 } else if (!VectorizeLoop && InterleaveLoop) { 7908 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7909 ORE->emit([&]() { 7910 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7911 L->getStartLoc(), L->getHeader()) 7912 << VecDiagMsg.second; 7913 }); 7914 } else if (VectorizeLoop && !InterleaveLoop) { 7915 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7916 << ") in " << DebugLocStr << '\n'); 7917 ORE->emit([&]() { 7918 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7919 L->getStartLoc(), L->getHeader()) 7920 << IntDiagMsg.second; 7921 }); 7922 } else if (VectorizeLoop && InterleaveLoop) { 7923 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7924 << ") in " << DebugLocStr << '\n'); 7925 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7926 } 7927 7928 LVP.setBestPlan(VF.Width, IC); 7929 7930 using namespace ore; 7931 bool DisableRuntimeUnroll = false; 7932 MDNode *OrigLoopID = L->getLoopID(); 7933 7934 if (!VectorizeLoop) { 7935 assert(IC > 1 && "interleave count should not be 1 or 0"); 7936 // If we decided that it is not legal to vectorize the loop, then 7937 // interleave it. 7938 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7939 &CM); 7940 LVP.executePlan(Unroller, DT); 7941 7942 ORE->emit([&]() { 7943 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7944 L->getHeader()) 7945 << "interleaved loop (interleaved count: " 7946 << NV("InterleaveCount", IC) << ")"; 7947 }); 7948 } else { 7949 // If we decided that it is *legal* to vectorize the loop, then do it. 7950 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7951 &LVL, &CM); 7952 LVP.executePlan(LB, DT); 7953 ++LoopsVectorized; 7954 7955 // Add metadata to disable runtime unrolling a scalar loop when there are 7956 // no runtime checks about strides and memory. A scalar loop that is 7957 // rarely used is not worth unrolling. 7958 if (!LB.areSafetyChecksAdded()) 7959 DisableRuntimeUnroll = true; 7960 7961 // Report the vectorization decision. 7962 ORE->emit([&]() { 7963 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7964 L->getHeader()) 7965 << "vectorized loop (vectorization width: " 7966 << NV("VectorizationFactor", VF.Width) 7967 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7968 }); 7969 } 7970 7971 Optional<MDNode *> RemainderLoopID = 7972 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7973 LLVMLoopVectorizeFollowupEpilogue}); 7974 if (RemainderLoopID.hasValue()) { 7975 L->setLoopID(RemainderLoopID.getValue()); 7976 } else { 7977 if (DisableRuntimeUnroll) 7978 AddRuntimeUnrollDisableMetaData(L); 7979 7980 // Mark the loop as already vectorized to avoid vectorizing again. 
7981 Hints.setAlreadyVectorized(); 7982 } 7983 7984 assert(!verifyFunction(*L->getHeader()->getParent())); 7985 return true; 7986 } 7987 7988 LoopVectorizeResult LoopVectorizePass::runImpl( 7989 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7990 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7991 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7992 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7993 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7994 SE = &SE_; 7995 LI = &LI_; 7996 TTI = &TTI_; 7997 DT = &DT_; 7998 BFI = &BFI_; 7999 TLI = TLI_; 8000 AA = &AA_; 8001 AC = &AC_; 8002 GetLAA = &GetLAA_; 8003 DB = &DB_; 8004 ORE = &ORE_; 8005 PSI = PSI_; 8006 8007 // Don't attempt if 8008 // 1. the target claims to have no vector registers, and 8009 // 2. interleaving won't help ILP. 8010 // 8011 // The second condition is necessary because, even if the target has no 8012 // vector registers, loop vectorization may still enable scalar 8013 // interleaving. 8014 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 8015 TTI->getMaxInterleaveFactor(1) < 2) 8016 return LoopVectorizeResult(false, false); 8017 8018 bool Changed = false, CFGChanged = false; 8019 8020 // The vectorizer requires loops to be in simplified form. 8021 // Since simplification may add new inner loops, it has to run before the 8022 // legality and profitability checks. This means running the loop vectorizer 8023 // will simplify all loops, regardless of whether anything end up being 8024 // vectorized. 8025 for (auto &L : *LI) 8026 Changed |= CFGChanged |= 8027 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 8028 8029 // Build up a worklist of inner-loops to vectorize. This is necessary as 8030 // the act of vectorizing or partially unrolling a loop creates new loops 8031 // and can invalidate iterators across the loops. 8032 SmallVector<Loop *, 8> Worklist; 8033 8034 for (Loop *L : *LI) 8035 collectSupportedLoops(*L, LI, ORE, Worklist); 8036 8037 LoopsAnalyzed += Worklist.size(); 8038 8039 // Now walk the identified inner loops. 8040 while (!Worklist.empty()) { 8041 Loop *L = Worklist.pop_back_val(); 8042 8043 // For the inner loops we actually process, form LCSSA to simplify the 8044 // transform. 8045 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 8046 8047 Changed |= CFGChanged |= processLoop(L); 8048 } 8049 8050 // Process each loop nest in the function. 8051 return LoopVectorizeResult(Changed, CFGChanged); 8052 } 8053 8054 PreservedAnalyses LoopVectorizePass::run(Function &F, 8055 FunctionAnalysisManager &AM) { 8056 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8057 auto &LI = AM.getResult<LoopAnalysis>(F); 8058 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8059 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8060 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8061 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8062 auto &AA = AM.getResult<AAManager>(F); 8063 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8064 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8065 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8066 MemorySSA *MSSA = EnableMSSALoopDependency 8067 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8068 : nullptr; 8069 8070 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8071 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8072 [&](Loop &L) -> const LoopAccessInfo & { 8073 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8074 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8075 }; 8076 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8077 ProfileSummaryInfo *PSI = 8078 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8079 LoopVectorizeResult Result = 8080 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8081 if (!Result.MadeAnyChange) 8082 return PreservedAnalyses::all(); 8083 PreservedAnalyses PA; 8084 8085 // We currently do not preserve loopinfo/dominator analyses with outer loop 8086 // vectorization. Until this is addressed, mark these analyses as preserved 8087 // only for non-VPlan-native path. 8088 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8089 if (!EnableVPlanNativePath) { 8090 PA.preserve<LoopAnalysis>(); 8091 PA.preserve<DominatorTreeAnalysis>(); 8092 } 8093 PA.preserve<BasicAA>(); 8094 PA.preserve<GlobalsAA>(); 8095 if (!Result.MadeCFGChange) 8096 PA.preserveSet<CFGAnalyses>(); 8097 return PA; 8098 } 8099